Python FP and docs updates (#58)

* refactor: Update comments for clarity and add expectations.json files for performance metrics

* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks

* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks

* refactor: Simplify code formatting for better readability in multiple files

* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration

* feat: Update Java and Python patterns to include new security rules

* refactor: Improve comment clarity and consistency across multiple Rust files

* refactor: Simplify code formatting for improved readability in integration tests and module files

* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
Eli Peter 2026-04-29 19:53:34 -04:00 committed by GitHub
parent 4db0805de6
commit a438886217
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
291 changed files with 9485 additions and 3851 deletions

View file

@ -3,7 +3,7 @@
//! The forward taint engine (`ssa_transfer.rs`) proceeds source-to-sink,
//! spending analysis budget on every function the source might touch. Its
//! precision ceiling is fixed by what summaries + inline re-analysis can
//! preserve on every edge of a flow a single lossy edge drops the finding.
//! preserve on every edge of a flow; a single lossy edge drops the finding.
//!
//! This module implements the opposite direction: start at each sink value,
//! walk *reverse* SSA edges and (when needed) cross-file callee bodies on
@ -16,7 +16,7 @@
//! reaches a matching source, we append `backwards-confirmed` to the
//! finding's evidence notes.
//! * When the backwards walk proves the flow infeasible via accumulated
//! path predicates, we append `backwards-infeasible` consumed by the
//! path predicates, we append `backwards-infeasible`, consumed by the
//! confidence scorer as a cap-to-Low signal.
//! * Backward flows that reach a source with no matching forward finding
//! become standalone `taint-backwards-flow` diags (a separate rule id so
@ -63,7 +63,7 @@ pub const MAX_BACKWARDS_CALLEE_BLOCKS: usize = 500;
/// the finding, and which predicate evidence (if any) has been gathered so
/// far.
///
/// `caps` is monotone the walk can only narrow the demand (by proving
/// `caps` is monotone, the walk can only narrow the demand (by proving
/// operands validated or sanitized against specific capability bits), never
/// widen it. This keeps backwards composition with summary-derived
/// transforms sound.
@ -140,7 +140,7 @@ pub const MAX_CHAIN_LEN: usize = 16;
/// The context is intentionally narrow: it borrows from whatever analysis
/// objects the caller has already prepared (summaries, the current body,
/// cross-file body maps) and does not build its own. This keeps the
/// backwards pass cheap to enable when off, none of this code is touched.
/// backwards pass cheap to enable: when off, none of this code is touched.
pub struct BackwardsCtx<'a> {
/// Callee's SSA body.
pub ssa: &'a SsaBody,
@ -178,7 +178,7 @@ impl<'a> BackwardsCtx<'a> {
/// One step of the backwards transfer: given a demand on `value`, compute
/// the demand on its immediate SSA operands. Returns the list of
/// `(operand, demand)` pairs possibly empty if the defining op terminates
/// `(operand, demand)` pairs, possibly empty if the defining op terminates
/// the walk (Source/Const/Param).
///
/// This is a pure function over the op and demand; cycle detection and
@ -224,7 +224,7 @@ pub fn backward_transfer(
SsaOp::CatchParam => (BackwardStep::ReachedCatchParam, SmallVec::new()),
SsaOp::Nop => (BackwardStep::Unknown, SmallVec::new()),
// Undef is a phi-operand sentinel on edges with no reaching
// definition nothing to trace backwards through.
// definition, nothing to trace backwards through.
SsaOp::Undef => (BackwardStep::ReachedConst, SmallVec::new()),
SsaOp::Phi(operands) => {
// Demand fans out to every incoming value: the runtime value of
@ -254,7 +254,7 @@ pub fn backward_transfer(
..
} => {
// For Call ops the full demand transfer depends on callee
// metadata (summary or body). The driver handles that
// metadata (summary or body). The driver handles that;
// return a `BackwardStep::Call` carrying the receiver + args
// so the driver can consult [`GlobalSummaries`] / bodies_by_key.
let mut flat: SmallVec<[(SsaValue, DemandState); 4]> = SmallVec::new();
@ -276,7 +276,7 @@ pub fn backward_transfer(
SsaOp::FieldProj { receiver, .. } => {
// Field projection: demand for `obj.f` flows to `obj`. Treated
// structurally like a single-operand Assign for the backwards
// walk — sufficient until Phase 4 introduces field-sensitive
// walk; sufficient until future passes introduce field-sensitive
// demand discrimination.
let mut next: SmallVec<[(SsaValue, DemandState); 4]> = SmallVec::new();
next.push((*receiver, demand.clone()));
@ -290,12 +290,12 @@ pub fn backward_transfer(
/// resolution.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum BackwardStep {
/// Defining op is a tainted [`SsaOp::Source`] walk terminates with a
/// Defining op is a tainted [`SsaOp::Source`], walk terminates with a
/// confirmed flow.
ReachedSource(NodeIndex),
/// Defining op is a [`SsaOp::Const`] walk terminates without a source.
/// Defining op is a [`SsaOp::Const`], walk terminates without a source.
ReachedConst,
/// Defining op is an [`SsaOp::Param`] / [`SsaOp::SelfParam`] walk may
/// Defining op is an [`SsaOp::Param`] / [`SsaOp::SelfParam`], walk may
/// continue by resolving the parameter against the caller's arguments
/// (requires reverse call-graph expansion, which is out of scope for
/// the current cut and is handled as a terminal step).
@ -305,13 +305,13 @@ pub enum BackwardStep {
/// the actual exception source requires exception-edge traversal not
/// performed here.
ReachedCatchParam,
/// Phi node driver fans out to predecessors.
/// Phi node, driver fans out to predecessors.
Phi,
/// Arithmetic / copy / cast driver fans out to operands.
/// Arithmetic / copy / cast, driver fans out to operands.
Assign,
/// Call op driver consults summaries and/or callee bodies.
/// Call op, driver consults summaries and/or callee bodies.
Call { callee: String },
/// Defining op could not be located or was a [`SsaOp::Nop`] walk
/// Defining op could not be located or was a [`SsaOp::Nop`], walk
/// terminates as inconclusive.
Unknown,
}
@ -321,7 +321,7 @@ pub enum BackwardStep {
/// Walk backwards from `sink_value` in `ctx.ssa`, producing at most one
/// [`BackwardFlow`] per reached source (phi fan-outs can produce multiple).
///
/// Does not consult forward findings the caller is responsible for
/// Does not consult forward findings, the caller is responsible for
/// matching the returned flows against its finding set.
pub fn analyse_sink_backwards(
ctx: &BackwardsCtx<'_>,
@ -385,7 +385,7 @@ fn walk_dfs(
// Before dispatching on the SSA op kind, consult the defining CFG node's
// label set. Many Source-labelled callables in the CFG lower to an
// `SsaOp::Call` rather than `SsaOp::Source` (request.args.get,
// os.getenv, …) recognising the label here keeps the walk in
// os.getenv, …), recognising the label here keeps the walk in
// sync with the forward engine's source model.
let def_cfg_node = ctx.ssa.def_of(value).cfg_node;
if def_cfg_node.index() < ctx.cfg.node_count() {
@ -429,7 +429,7 @@ fn walk_dfs(
});
}
BackwardStep::ReachedConst => {
// Constants never supply taint treat as a silent prune.
// Constants never supply taint, treat as a silent prune.
}
BackwardStep::ReachedParam { index: _, node } => {
// Reverse-call-graph expansion is intentionally left out of the
@ -452,7 +452,7 @@ fn walk_dfs(
});
}
BackwardStep::ReachedCatchParam => {
// Exception-borne taint record but don't confirm. Marked
// Exception-borne taint, record but don't confirm. Marked
// non-confirmatory so unit tests can distinguish "walk reached
// catch-param" from "walk reached source".
}
@ -514,7 +514,7 @@ fn walk_dfs(
}
}
// Prevent an unused-variable warning while still accepting
// the key in the matcher the key is useful for debug
// the key in the matcher, the key is useful for debug
// logging in bigger expansions.
let _ = callee_key;
return;
@ -539,7 +539,7 @@ fn walk_dfs(
}
}
BackwardStep::Unknown => {
// No information terminate silently.
// No information, terminate silently.
}
}
}
@ -632,12 +632,12 @@ pub const NOTE_BUDGET: &str = "backwards-budget-exhausted";
/// Classification for a forward finding after backwards post-processing.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum FindingVerdict {
/// Backwards reached a matching source finding corroborated.
/// Backwards reached a matching source, finding corroborated.
Confirmed,
/// Backwards was inconclusive (no source, not infeasible). Finding
/// keeps its forward-assigned confidence.
Inconclusive,
/// Backwards proved the flow infeasible finding confidence must drop.
/// Backwards proved the flow infeasible, finding confidence must drop.
Infeasible,
/// Budget exhausted before a verdict was reached.
BudgetExhausted,
@ -658,7 +658,7 @@ pub fn aggregate_verdict(flows: &[BackwardFlow]) -> FindingVerdict {
}
/// Apply a verdict as a note on a [`Finding`]. No-ops when the verdict is
/// [`FindingVerdict::Inconclusive`] the forward finding retains its
/// [`FindingVerdict::Inconclusive`], the forward finding retains its
/// original metadata.
pub fn annotate_finding(finding: &mut Finding, verdict: FindingVerdict) {
// `Finding` does not own an Evidence struct directly (that lives on
@ -1079,6 +1079,7 @@ mod tests {
path_hash: 0,
finding_id: String::new(),
alternative_finding_ids: smallvec::SmallVec::new(),
effective_sink_caps: crate::labels::Cap::empty(),
};
annotate_finding(&mut f, FindingVerdict::Confirmed);
let sv = f.symbolic.as_ref().expect("symbolic verdict created");
@ -1116,6 +1117,7 @@ mod tests {
path_hash: 0,
finding_id: String::new(),
alternative_finding_ids: smallvec::SmallVec::new(),
effective_sink_caps: crate::labels::Cap::empty(),
};
annotate_finding(&mut f, FindingVerdict::Inconclusive);
assert!(f.symbolic.is_none());

View file

@ -13,7 +13,7 @@ pub struct VarTaint {
pub uses_summary: bool,
}
/// A single taint origin the node and classification of where taint came from.
/// A single taint origin, the node and classification of where taint came from.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct TaintOrigin {
pub node: NodeIndex,
@ -30,7 +30,7 @@ pub struct TaintOrigin {
/// # Capacity limit
///
/// `SmallBitSet` is a fixed-size 64-slot bitset backed by a single `u64`.
/// Inserting a `SymbolId` with ordinal ≥ 64 is a no-op the bit is silently
/// Inserting a `SymbolId` with ordinal ≥ 64 is a no-op, the bit is silently
/// dropped. This is a deliberate precision-over-completeness trade: the
/// bitset underpins predicate / validation tracking in the SSA taint engine,
/// and functions with more than 64 distinct predicate-relevant variables are

View file

@ -1,4 +1,5 @@
#![allow(clippy::collapsible_if, clippy::too_many_arguments)]
#![doc = include_str!(concat!(env!("OUT_DIR"), "/taint.md"))]
pub mod backwards;
pub mod domain;
@ -84,7 +85,7 @@ fn js_ts_pass2_cap() -> usize {
//
// Active only when the slot is `Some`. Production code path leaves it
// `None`, making instrumentation cost a single thread-local borrow + a
// `match Option::None` per measured chunk sub-nanosecond.
// `match Option::None` per measured chunk, sub-nanosecond.
thread_local! {
static PERF_LOWER_TIMINGS: std::cell::Cell<Option<[u128; 7]>> =
const { std::cell::Cell::new(None) };
@ -112,10 +113,10 @@ fn perf_lower_record(slot: usize, micros: u128) {
/// Test-only override for the Gauss-Seidel toggle. Values:
///
/// * `0` respect `NYX_JS_GAUSS_SEIDEL` env var (default production
/// * `0`: respect `NYX_JS_GAUSS_SEIDEL` env var (default production
/// behaviour).
/// * `1` force Jacobi (env ignored).
/// * `2` force Gauss-Seidel (env ignored).
/// * `1`: force Jacobi (env ignored).
/// * `2`: force Gauss-Seidel (env ignored).
///
/// Used exclusively by integration tests that need to assert both
/// variants produce equal findings without per-test process isolation.
@ -209,7 +210,7 @@ pub struct Finding {
/// The kind of source that originated the taint.
pub source_kind: SourceKind,
/// Whether all tainted sink variables are guarded by a validation
/// predicate on this path (metadata only does not change severity).
/// predicate on this path (metadata only, does not change severity).
pub path_validated: bool,
/// The kind of validation guard protecting this path, if any.
pub guard_kind: Option<PredicateKind>,
@ -233,7 +234,7 @@ pub struct Finding {
/// sink was resolved via a function summary carrying a
/// [`crate::summary::SinkSite`] with concrete coordinates for primary
/// sink-location attribution. `None` for:
/// * intra-procedural / label-based sinks the caller's `cfg[sink]`
/// * intra-procedural / label-based sinks, the caller's `cfg[sink]`
/// span already names the dangerous instruction;
/// * summary-resolved sinks whose `SinkSite` was cap-only (no tree or
/// bytes context at extraction time).
@ -245,7 +246,7 @@ pub struct Finding {
/// the scan root is the file itself (every namespace normalizes to
/// `""`); consumers resolve empty `file_rel` against the file under
/// analysis. Enforced at `ssa_events_to_findings` by a
/// `debug_assert!` upstream filters drop cap-only sites before
/// `debug_assert!`, upstream filters drop cap-only sites before
/// they reach this field.
///
/// Deliberately independent of `uses_summary`: that flag tracks whether
@ -255,13 +256,13 @@ pub struct Finding {
/// `primary_location`.
pub primary_location: Option<SinkLocation>,
/// Engine provenance notes recorded during the analysis that produced
/// this finding. Populated when an internal budget/cap was hit see
/// this finding. Populated when an internal budget/cap was hit, see
/// [`crate::engine_notes::EngineNote`]. Empty for the typical
/// under-budget finding.
pub engine_notes: SmallVec<[EngineNote; 2]>,
/// Stable hash of the intermediate-variable sequence between `source`
/// and `sink`. Used to keep distinct paths through different
/// variables as separate findings during deduplication two
/// variables as separate findings during deduplication, two
/// `(body_id, sink, source)` siblings with different `path_hash`
/// values represent flows along different data paths and are
/// preserved as alternatives rather than collapsed.
@ -289,6 +290,13 @@ pub struct Finding {
/// formatters can present them as "this flow … and N alternative
/// path(s)" rather than silently dropping one.
pub alternative_finding_ids: SmallVec<[String; 2]>,
/// Sink-cap mask that this specific finding fired against. Carries the
/// per-event `sink_caps` from the multi-gate dispatch (e.g.
/// `Cap::SSRF` for a URL-flow finding on `fetch`, `Cap::DATA_EXFIL`
/// for a body-flow finding on the same call). Used by `ast.rs` to
/// route the finding to a cap-specific rule id rather than the
/// generic `taint-unsanitised-flow` bucket.
pub effective_sink_caps: crate::labels::Cap,
}
impl Finding {
@ -425,7 +433,7 @@ pub(crate) fn analyse_file_with_lowered(
// 3. Unified multi-body analysis with lexical containment propagation.
//
// `max_iterations` is the safety cap, not an expected depth the
// `max_iterations` is the safety cap, not an expected depth, the
// pass-2 loop breaks on seed equality (monotone lattice, finite
// height) and only rides the cap when convergence legitimately
// needs more rounds than the cap allows. See
@ -481,7 +489,7 @@ pub(crate) fn analyse_file_with_lowered(
// dedup_by_key(|f| (body_id, sink, source));
//
// which silently collapsed an *unguarded* flow reaching the same
// `(sink, source)` as a guarded flow the `!path_validated` sort
// `(sink, source)` as a guarded flow, the `!path_validated` sort
// ordered `path_validated == true` first, so the exploitable
// branch was the one that got dropped.
//
@ -541,7 +549,7 @@ fn make_finding_id(f: &Finding) -> String {
/// Cross-link findings that share `(body_id, sink, source)` but differ
/// on `path_validated` or `path_hash`. After this call each such
/// finding's `alternative_finding_ids` lists every sibling's
/// [`Finding::finding_id`] so a guarded flow links to the unguarded
/// [`Finding::finding_id`], so a guarded flow links to the unguarded
/// sibling and vice versa. Isolated findings (no sibling) get an
/// empty list.
fn link_alternative_paths(findings: &mut [Finding]) {
@ -576,7 +584,7 @@ fn link_alternative_paths(findings: &mut [Finding]) {
/// Compute containment-topological order: parent bodies before children.
///
/// Uses BFS from roots (bodies with no parent), ensuring a body is always
/// processed after its parent required for lexical seed propagation.
/// processed after its parent, required for lexical seed propagation.
/// Returns indices into `file_cfg.bodies` in processing order.
fn containment_order(bodies: &[BodyCfg]) -> Vec<usize> {
let mut children: HashMap<BodyId, Vec<usize>> = HashMap::new();
@ -637,7 +645,7 @@ fn analyse_body_with_seed(
// Per-body graphs contain only the body's own nodes.
// For non-toplevel bodies, use lower_to_ssa_with_params with scope to
// create SsaOp::Param ops for external/captured variables and formal
// parameters required for global_seed to inject taint from the parent.
// parameters, required for global_seed to inject taint from the parent.
// Top-level bodies use lower_to_ssa with scope_all=true (no Param ops).
let is_toplevel = body.meta.parent_body_id.is_none();
// JS/TS function bodies always use scoped lowering to create Param ops
@ -708,12 +716,9 @@ fn analyse_body_with_seed(
} else {
Some(static_map)
};
// Pointer-Phase 3 / W1+W2+W3: per-body field-sensitive points-to
// facts. Computed only when `NYX_POINTER_ANALYSIS=1`; the
// per-body `analyse_body` cost is amortised across the three
// hooks (W1 field-write read-back, W2 container ELEM cells,
// W3 cross-call resolver). Strict-additive: `None` keeps
// pointer-disabled behaviour bit-identical.
// Per-body field-sensitive points-to facts. Cost is
// amortised across field-write read-back, container ELEM
// cells, and the cross-call resolver.
let pointer_facts = if crate::pointer::is_enabled() {
Some(crate::pointer::analyse_body(&ssa_body, body.meta.id))
} else {
@ -836,7 +841,7 @@ fn analyse_body_with_seed(
Err(e) => {
// SSA lowering produced no analyzable body. We still surface
// the event so downstream tooling can tell "we tried and gave
// up" from "we ran clean" a TRACE-level log records the
// up" from "we ran clean", a TRACE-level log records the
// reason (no synthetic Finding is manufactured because a
// diag pointing at no source location would be misleading).
tracing::trace!(
@ -948,7 +953,7 @@ fn analyse_multi_body(
let top_cfg = &top.graph;
// Collect top-level binding keys for seed filtering. Always
// keyed under `BodyId(0)` `filter_seed_to_toplevel` matches
// keyed under `BodyId(0)`, `filter_seed_to_toplevel` matches
// by name and re-keys every surviving entry to `BodyId(0)`
// anyway, so the body_id on the probe keys is informational.
let toplevel_keys: HashSet<ssa_transfer::BindingKey> = {
@ -969,7 +974,7 @@ fn analyse_multi_body(
// re-analysis when a name it reads via Param or via the
// global_seed ancestor-lookup path has actually changed in
// the combined seed. `reads` is a superset of the body's
// top-level dependencies we err on the side of over-running
// top-level dependencies, we err on the side of over-running
// (false dirty) rather than missing a dependency.
let body_reads: HashMap<BodyId, HashSet<String>> = {
let mut m: HashMap<BodyId, HashSet<String>> = HashMap::new();
@ -1060,7 +1065,7 @@ fn analyse_multi_body(
// Re-run non-toplevel bodies with updated seed.
body_exit_states.insert(BodyId(0), current_seed.clone());
// Phase-C: Gauss-Seidel variant as each body is
// Phase-C: Gauss-Seidel variant, as each body is
// re-analysed, merge its new exit into `current_seed`
// immediately so subsequent bodies in the same round see
// the fresh value. Order matters here; we pin to
@ -1137,7 +1142,7 @@ fn analyse_multi_body(
// Record observability counter. `iters_used == 0` covers the
// non-JS/TS path (`max_iterations == 1`) and the JS/TS case where
// the convergence loop did not enter report `1` so the counter
// the convergence loop did not enter; report `1` so the counter
// always reflects "at least the lexical-containment pass ran".
let reported_iters = if iters_used == 0 { 1 } else { iters_used };
LAST_JS_TS_PASS2_ITERATIONS.store(reported_iters, Ordering::Relaxed);
@ -1287,7 +1292,7 @@ fn lookup_formal_params(local_summaries: &FuncSummaries, func_name: &str) -> Vec
/// When exactly one `(name, arity)`-matching entry exists we use its full
/// identity (container / disambig / kind preserved). When zero or multiple
/// match we fall back to a free-function key so the caller still has a
/// well-formed key this can only happen in legacy discovery paths that
/// well-formed key, this can only happen in legacy discovery paths that
/// cannot see through same-name siblings, and those paths were already
/// collision-prone before this refactor. New intra-file analysis code
/// should prefer [`BodyMeta::func_key`].
@ -1300,7 +1305,7 @@ fn lookup_canonical_func_key(
) -> FuncKey {
// `local_summaries` is file-local, so every entry's namespace agrees with
// whatever `build_cfg` wrote (raw file path). We match by lang + name +
// arity and fall back to name-only the caller's `namespace` argument is
// arity and fall back to name-only, the caller's `namespace` argument is
// only used when we have to synthesise a key as a last resort.
let mut matches = local_summaries
.keys()
@ -1372,7 +1377,7 @@ pub(crate) fn extract_intra_file_ssa_summaries(
.count()
};
// Zero-param helpers are normally elided a fixture with no
// Zero-param helpers are normally elided, a fixture with no
// parameters cannot carry per-parameter taint transforms. But
// zero-arg factories (`function makeBag() { return []; }`) do
// have one observable cross-file effect: the return is a fresh
@ -1409,7 +1414,7 @@ pub(crate) fn extract_intra_file_ssa_summaries(
// must survive this filter so summary application at cross-file
// call sites can replay the alias edges. Zero-param factories
// are kept via the `returns_fresh_alloc` leg of
// `points_to.is_empty()` `is_empty()` returns false when the
// `points_to.is_empty()`, `is_empty()` returns false when the
// fresh-alloc flag is set.
if !summary.param_to_return.is_empty()
|| !summary.param_to_sink.is_empty()
@ -1436,7 +1441,7 @@ pub(crate) fn extract_intra_file_ssa_summaries(
}
/// Lower all function bodies from `FileCfg` to produce SSA summaries + cached
/// bodies. Each body's own graph is used directly no scope filtering needed.
/// bodies. Each body's own graph is used directly, no scope filtering needed.
///
/// Both returned maps are keyed by each body's canonical [`FuncKey`] (carried
/// on [`crate::cfg::BodyMeta::func_key`]). This is the most collision-
@ -1503,7 +1508,7 @@ pub(crate) fn lower_all_functions_from_bodies(
// `build_cfg` wrote. The caller passes `namespace` already normalized
// against `scan_root`, which is what FuncSummary keys use on the
// cross-file side (`FuncSummary::func_key`). Overriding the namespace
// here keeps both sides of `GlobalSummaries` agreement otherwise
// here keeps both sides of `GlobalSummaries` agreement, otherwise
// `resolve_callee` resolves to the normalized FuncSummary key and
// misses the raw-path SSA entry.
let mut key = body.meta.func_key.clone().unwrap_or_else(|| {
@ -1542,7 +1547,7 @@ pub(crate) fn lower_all_functions_from_bodies(
// Always insert the summary, even when all fields are empty/default.
// An empty summary tells resolve_callee "this function exists and has
// no taint effects" preventing fallthrough to the less precise old
// no taint effects", preventing fallthrough to the less precise old
// FuncSummary which may report false source_caps from internal sources.
// For zero-param functions we only insert when the summary carries
// the fresh-container signal (the only observable effect worth
@ -1563,34 +1568,23 @@ pub(crate) fn lower_all_functions_from_bodies(
perf_lower_record(2, _t_opt.elapsed().as_micros());
let _t_typed = std::time::Instant::now();
// Phase 2 (typed call-graph devirtualisation): walk every SSA
// method call in this body, look up the receiver SSA value's
// [`crate::ssa::type_facts::TypeKind`] in the just-computed
// `opt.type_facts`, and record `(call_ordinal, container_name)`
// on the matching summary so Phase 3 in `build_call_graph` can
// narrow the indirect-method-call edge to the receiver-typed
// container. Free-function calls (`receiver: None`) and
// unknown receiver types are silently skipped — the bare-name
// resolution path applies unchanged in that case.
// For every SSA method call, look up the receiver's TypeKind
// and record `(call_ordinal, container_name)` so devirtualisation
// in `build_call_graph` can narrow the edge to the receiver-typed
// container. Free-function calls and unknown types fall back to
// bare-name resolution.
let typed_receivers = collect_typed_call_receivers(&func_ssa, &body.graph, &opt.type_facts);
if !typed_receivers.is_empty() {
// The summary may not have been inserted above (zero-param,
// no-fresh-alloc bodies are skipped). Force-insert in that
// case so the receiver-type info reaches Phase 3 — without
// it, the cross-file devirtualisation signal would be lost
// for any method invoked inside a parameterless caller.
// Zero-param/no-fresh-alloc bodies are skipped above;
// force-insert so receiver-type info still reaches
// build_call_graph.
let entry = summaries.entry(key.clone()).or_default();
entry.typed_call_receivers = typed_receivers;
}
// Pointer-Phase 5 / W3: populate `field_points_to` from the
// body's pointer facts when the analysis is enabled. Strict
// opt-in via `NYX_POINTER_ANALYSIS=1`; off-by-default keeps
// bit-for-bit identity with the pre-W3 behaviour.
//
// `extract_field_points_to` covers both reads (via
// `SsaOp::FieldProj` walks) and writes (via the W1
// `field_writes` side-table on the body) in a single pass.
// Populate `field_points_to` from the body's pointer facts.
// `extract_field_points_to` covers both reads (FieldProj walks)
// and writes (`field_writes` side-table) in one pass.
if crate::pointer::is_enabled() {
let facts = crate::pointer::analyse_body(&func_ssa, body.meta.id);
let fpt = crate::pointer::extract_field_points_to(&func_ssa, &facts);
@ -1621,7 +1615,7 @@ pub(crate) fn lower_all_functions_from_bodies(
// Lift child-body sinks into the parent's `param_to_sink` for
// every parent body with lexically contained children. This
// handles the direct-wrapper case
// `f(x) { return new Promise((res, rej) => sink(x)) }` the
// `f(x) { return new Promise((res, rej) => sink(x)) }`, the
// executor's gated http.get sink becomes visible to callers of
// `f` via `f.summary.param_to_sink`.
//
@ -1635,8 +1629,8 @@ pub(crate) fn lower_all_functions_from_bodies(
// propagation at summary-extraction time so cross-call
// resolution sees the sink at every caller of `f`.
//
// Strict-additive: only ADDs `param_to_sink` entries never
// removes or modifies existing data so it cannot regress
// Strict-additive: only ADDs `param_to_sink` entries, never
// removes or modifies existing data, so it cannot regress
// detection. Bounded: each parent-param probe runs each child
// body's analysis exactly once.
let _t_aug = std::time::Instant::now();
@ -1665,7 +1659,7 @@ pub(crate) fn lower_all_functions_from_bodies(
// OR-merge: only adds `param_to_sink` / `param_to_sink_param`
// entries to existing summaries. Existing entries (return
// transforms, source caps, augment-populated sinks, etc.) are
// preserved. Strict-additive cannot regress detection.
// preserved. Strict-additive, cannot regress detection.
let _t_rerun = std::time::Instant::now();
rerun_extraction_with_augmented_summaries(
file_cfg,
@ -1919,7 +1913,7 @@ fn augment_summaries_with_child_sinks(
let parent_interner = crate::state::symbol::SymbolInterner::from_cfg(parent_cfg);
// Collect (formal_param_idx, var_name, ssa_value) for the parent's
// formal params mirrors `extract_ssa_func_summary`'s param scan.
// formal params, mirrors `extract_ssa_func_summary`'s param scan.
let mut parent_param_info: Vec<(usize, String)> = Vec::new();
for block in &parent_ssa.blocks {
for inst in block.phis.iter().chain(block.body.iter()) {
@ -2055,7 +2049,7 @@ fn augment_summaries_with_child_sinks(
}
// Aggregate sink caps across all child events into one
// entry per parent param (cap-only SinkSite the
// entry per parent param (cap-only SinkSite, the
// exact location lives in the child body's CFG and is
// not directly addressable from the parent's summary).
let mut union_caps = Cap::empty();
@ -2088,7 +2082,7 @@ fn augment_summaries_with_child_sinks(
// engine's primary sink-site picker uses
// `param_to_sink_param` for arg-position filtering)
// sees this captured-flow sink. Position 0 is a
// best-effort placeholder the actual filtering at
// best-effort placeholder, the actual filtering at
// the caller is by SSRF cap, not arg position, when
// the wrapper is itself non-gated.
if !entry
@ -2109,7 +2103,7 @@ fn augment_summaries_with_child_sinks(
/// non-empty [`crate::ssa::type_facts::TypeKind::container_name`].
///
/// Free-function calls (`receiver: None`) and unknown receiver types
/// are skipped the cross-file call-graph builder will fall back to
/// are skipped, the cross-file call-graph builder will fall back to
/// today's name-only resolution for those, preserving the
/// "subset of today's targets, never a superset" invariant from
/// `docs/typed-call-graph-prompt.md`.
@ -2135,13 +2129,13 @@ fn collect_typed_call_receivers(
continue;
};
let Some(receiver_val) = receiver else {
continue; // free-function call no devirtualisation possible
continue; // free-function call, no devirtualisation possible
};
let Some(kind) = type_facts.get_type(*receiver_val) else {
continue; // type unknown fall back to name-only resolution
continue; // type unknown, fall back to name-only resolution
};
let Some(container) = kind.container_name() else {
continue; // scalar/unknown type no useful container
continue; // scalar/unknown type, no useful container
};
let Some(node_info) = cfg.node_weight(inst.cfg_node) else {
continue;
@ -2150,7 +2144,7 @@ fn collect_typed_call_receivers(
// A single SSA call instruction maps 1:1 with a CFG call
// node, so each ordinal should appear at most once. The
// dedup guard exists in case lowering ever introduces a
// second SSA Call sharing a cfg_node first wins.
// second SSA Call sharing a cfg_node, first wins.
if !seen.insert(ordinal) {
continue;
}
@ -2211,7 +2205,7 @@ pub(crate) fn build_eligible_bodies(
continue;
}
// Populate node metadata against the per-body graph whose NodeIndex
// space the SSA was produced on otherwise cross-file replay can't
// space the SSA was produced on, otherwise cross-file replay can't
// find the original CFG nodes.
//
// `key.namespace` was already normalised against `scan_root` in

View file

@ -35,13 +35,13 @@ pub enum PredicateKind {
/// Commonly paired with [`ShellMetaValidated`] in OR-chain rejection
/// idioms (`if x.len() > MAX || x.contains(";") { reject }`). Counts as
/// a dominator guard for `cfg-unguarded-sink` purposes, but intentionally
/// does **not** mark variables as validated the rejection direction is
/// does **not** mark variables as validated, the rejection direction is
/// ambiguous from the condition alone (a `.len() > 5 { sink(x) }`
/// gate is a precondition, not a rejection).
BoundedLength,
/// Comparison operators: `x == 5`, `x > threshold`
Comparison,
/// Generic boolean test cannot classify further.
/// Generic boolean test, cannot classify further.
Unknown,
}
@ -50,7 +50,7 @@ pub enum PredicateKind {
///
/// Presence of any of these in user input is sufficient to enable shell
/// injection, so rejecting input that contains them is a real sanitizer.
/// `"foo"` or other non-metachar needles don't qualify a rejection of
/// `"foo"` or other non-metachar needles don't qualify; a rejection of
/// those is business logic, not security.
const SHELL_METACHARS: &[&str] = &[";", "|", "&", "`", "$", ">", "<", "\n", "\r", "\0"];
@ -65,7 +65,7 @@ const SHELL_METACHARS: &[&str] = &[";", "|", "&", "`", "$", ">", "<", "\n", "\r"
/// character class containing only metacharacters.
///
/// Returns `false` if the needle is a non-metachar literal or cannot be
/// extracted falls through to broader classification.
/// extracted, in which case classification falls through to broader checks.
fn is_shell_metachar_rejection(text: &str) -> bool {
// Method-call form: `.contains(…)` / `.includes(…)` / `.include?(…)`
for method in [".contains(", ".includes(", ".include?("] {
@ -134,7 +134,7 @@ fn extract_first_string_arg(after_open: &str) -> Option<String> {
}
/// For Python `"<METACHAR>" in x` (needle on the left side of ` in `), return
/// the needle. Returns `None` for `x in ALLOWED` (identifier on the left)
/// the needle. Returns `None` for `x in ALLOWED` (identifier on the left);
/// that is an allowlist check, not a rejection.
fn extract_python_in_needle(text: &str) -> Option<String> {
let pos = text.find(" in ")?;
@ -155,7 +155,7 @@ fn extract_python_in_needle(text: &str) -> Option<String> {
/// Detect regex character classes that contain only shell metacharacters:
/// `[;|&]`, `[;&`$]`, etc. Missing: escape-class metacharacters inside the
/// class (e.g. `[\n]`) conservative, returns false there.
/// class (e.g. `[\n]`); conservatively returns false there.
fn is_metachar_regex_class(text: &str) -> bool {
// Find `[` followed by content and `]`, anywhere in the text.
let mut rest = text;
@ -180,7 +180,7 @@ fn is_metachar_regex_class(text: &str) -> bool {
/// Check whether `text` looks like a bounded-length rejection:
/// `x.len() > N`, `x.len() < N`, `x.length >= N`, etc. where `N` is an
/// integer literal >= 2. Excludes `> 0` / `>= 1` / `< 1` those are
/// integer literal >= 2. Excludes `> 0` / `>= 1` / `< 1`; those are
/// non-empty checks, which are not length-bound validations.
fn is_bounded_length_check(lower: &str) -> bool {
const PROBES: &[&str] = &[
@ -290,7 +290,7 @@ pub fn classify_condition(text: &str) -> PredicateKind {
// Matched BEFORE AllowlistCheck so that `x.contains(";")` is recognized
// as a rejection idiom rather than a membership test. Checked on the
// raw (non-lowercased) text so metacharacter comparisons stay
// case-accurate `;` / `|` / `&` have no case.
// case-accurate (`;` / `|` / `&` have no case).
if is_shell_metachar_rejection(text) {
return PredicateKind::ShellMetaValidated;
}
@ -409,7 +409,7 @@ pub fn classify_condition(text: &str) -> PredicateKind {
/// validator's effect is opaque: we can't tell which argument is being
/// checked. Returning the original kind with `None` target would cause
/// upstream code to over-validate (mark every `condition_var` as validated).
/// Instead, we fall back to `PredicateKind::Unknown` safer to assume the
/// Instead, we fall back to `PredicateKind::Unknown`: safer to assume the
/// validator did nothing than to assume it validated every variable in the
/// condition. Single-argument calls retain `(kind, None)` so downstream code
/// can still use the predicate-summary bit tracking.
@ -442,7 +442,7 @@ pub fn classify_condition_with_target(text: &str) -> (PredicateKind, Option<Stri
(kind, target)
}
PredicateKind::Comparison => {
// `x === '/login'`, `x == 5`, `null != obj` when exactly one
// `x === '/login'`, `x == 5`, `null != obj`: when exactly one
// side is a literal, extract the identifier side as the target.
// Downstream `apply_branch_predicates` uses this to mark the
// variable as `validated_may` on the true (equal) branch.
@ -464,7 +464,7 @@ pub fn classify_condition_with_target(text: &str) -> (PredicateKind, Option<Stri
/// - `'a' == 'b'` → `None` (both sides are literals)
/// - `obj.field == 3` → `None` (not a bare identifier)
///
/// Best-effort text analysis kept conservative to avoid false validation.
/// Best-effort text analysis, kept conservative to avoid false validation.
fn extract_comparison_target(text: &str) -> Option<String> {
let trimmed = text.trim();
@ -537,7 +537,7 @@ fn is_comparison_literal(s: &str) -> bool {
/// `Some(0)` for a call with empty argument list. Respects paren/bracket/brace
/// nesting so `f(g(a, b), c)` counts as 2 top-level args.
///
/// Best-effort operates on source text, not an AST. Used by
/// Best-effort, operates on source text, not an AST. Used by
/// `classify_condition_with_target` to distinguish single-arg vs multi-arg
/// validator calls when target extraction fails.
fn count_call_args(text: &str) -> Option<usize> {
@ -592,7 +592,7 @@ fn extract_validation_target(text: &str) -> Option<String> {
}
}
// Function call pattern: `func(x, ...)` extract first argument
// Function call pattern: `func(x, ...)`, extract first argument
// Strip closing paren if present
let args_inner = args_part.trim_end().strip_suffix(')').unwrap_or(args_part);
// Take text up to first comma (first argument)
@ -653,7 +653,7 @@ fn extract_allowlist_target(text: &str) -> Option<String> {
// Python `in` operator: `cmd in ALLOWED` / `cmd not in ALLOWED`
if lower.contains(" in ") {
// Find the leftmost ` in ` everything before it is the target expression
// Find the leftmost ` in `, everything before it is the target expression
// Handle `not in` by looking for ` not in ` first
let target_part = if let Some(pos) = lower.find(" not in ") {
&trimmed[..pos]
@ -857,7 +857,7 @@ mod tests {
#[test]
fn classify_validation_requires_paren() {
// `x_valid == true` should NOT be ValidationCall no `(` call syntax.
// `x_valid == true` should NOT be ValidationCall, no `(` call syntax.
assert_eq!(
classify_condition("x_valid == true"),
PredicateKind::Comparison
@ -978,7 +978,7 @@ mod tests {
#[test]
fn target_multi_arg_fallback_opaque_expr_is_unknown() {
// `validate(x + 1, y)` first arg is an expression, not an identifier.
// `validate(x + 1, y)`, first arg is an expression, not an identifier.
// Target extraction fails. Multi-arg call, so fall back to Unknown
// rather than letting upstream validate every condition var.
let (kind, target) = classify_condition_with_target("validate(x + 1, y)");

View file

@ -1,9 +1,9 @@
//! Taint event emission and conversion to [`crate::taint::Finding`].
//!
//! Extracted from the monolithic `ssa_transfer.rs`. Contains:
//! * [`SsaTaintEvent`] the raw event struct produced by the block-level
//! * [`SsaTaintEvent`], the raw event struct produced by the block-level
//! worklist each time a tainted value reaches a sink.
//! * [`ssa_events_to_findings`] event → `Finding` conversion with the
//! * [`ssa_events_to_findings`], event → `Finding` conversion with the
//! `primary_location` invariant and dedup.
//! * Flow-path reconstruction helpers ([`reconstruct_flow_path`] and
//! operand pickers).
@ -38,14 +38,14 @@ pub struct SsaTaintEvent {
/// `sink_caps`. When multiple [`SinkSite`]s for the same `(param_idx,
/// cap mask)` match, the emission site produces one event per
/// [`SinkSite`] so each downstream [`crate::taint::Finding`] carries a
/// single primary attribution the multi-primary case collapses to
/// single primary attribution; the multi-primary case collapses to
/// multiple single-primary events.
///
/// `None` for:
/// * intra-procedural sinks (`uses_summary == false`), where the
/// caller's sink span already names the dangerous instruction;
/// * summary-resolved sinks whose callee summary carried only cap-only
/// [`SinkSite`]s (no source coordinates e.g. pass-2 transient
/// [`SinkSite`]s (no source coordinates, e.g. pass-2 transient
/// summaries or local `LocalFuncSummary`-only callees).
pub primary_sink_site: Option<SinkSite>,
}
@ -79,7 +79,7 @@ pub(super) fn block_distance(ssa: &SsaBody, source_node: NodeIndex, sink_node: N
}
}
}
0 // unreachable or not connected conservative default
0 // unreachable or not connected, conservative default
}
// ── Flow Path Reconstruction ─────────────────────────────────────────────
@ -204,7 +204,7 @@ pub(super) fn reconstruct_flow_path(
SsaOp::FieldProj { receiver, .. } => {
// Treat field projection as a one-step assignment for
// flow-step reconstruction: taint reaching `obj.f` came
// from `obj`. Phase 4 will refine the witness rendering
// from `obj`. The analysis may refine the witness rendering
// to include the field name in the step.
steps.push(FlowStepRaw {
cfg_node: inst.cfg_node,
@ -270,7 +270,7 @@ fn pick_tainted_operand_call(
///
/// Note: this invariant is intentionally independent of `uses_summary`.
/// The taint-chain flag tracks summary-propagated *taint*, not summary-
/// resolved *sinks* a local source can reach a cross-file sink, so
/// resolved *sinks*: a local source can reach a cross-file sink, so
/// `primary_location.is_some()` does not imply `uses_summary == true`.
pub fn ssa_events_to_findings(
events: &[SsaTaintEvent],
@ -329,7 +329,7 @@ pub fn ssa_events_to_findings(
// Data-integrity invariant: a populated primary_location must at least
// carry resolved line coordinates. `file_rel` may legitimately be
// empty when the scan root is the caller file itself (single-file
// empty: when the scan root is the caller file itself (single-file
// scans), every namespace normalizes to `""` and the callee's site
// inherits that empty path; consumers resolve it against the file
// under analysis. Line==0 is the only filter-worthy invariant.
@ -340,7 +340,7 @@ pub fn ssa_events_to_findings(
// Dedup key includes primary location so multi-site events that
// share a single (source, sink) pair still produce distinct findings
// one per resolved callee-internal site.
// (one per resolved callee-internal site).
let loc_key = primary_location
.as_ref()
.map(|l| (l.file_rel.clone(), l.line, l.col));
@ -374,6 +374,11 @@ pub fn ssa_events_to_findings(
path_hash,
finding_id: String::new(),
alternative_finding_ids: smallvec::SmallVec::new(),
// Per-event mask from the multi-gate dispatch, picks
// exactly the cap that fired (e.g. `Cap::DATA_EXFIL`
// for a `fetch` body-flow finding versus `Cap::SSRF`
// for a URL-flow finding on the same call).
effective_sink_caps: event.sink_caps & *caps,
});
}
}

View file

@ -1,34 +1,10 @@
//! Context-sensitive inline analysis cache, body, and attribution types.
//! Context-sensitive inline analysis: cache, body, and attribution types.
//!
//! Extracted from the monolithic `ssa_transfer.rs`. Contains:
//! * [`ArgTaintSig`] — compact per-arg cap signature used as a cache key.
//! * [`InlineResult`] / [`CachedInlineShape`] / [`ReturnShape`] — the
//! callsite-adapted and callsite-agnostic inline-analysis result types.
//! * [`InlineCache`] — the shared cache map keyed by
//! `(FuncKey, ArgTaintSig)`.
//! * [`CrossFileNodeMeta`] / [`CalleeSsaBody`] — the serde-able bodies
//! persisted to SQLite for cross-file context-sensitive analysis.
//! * [`populate_node_meta`] / [`rebuild_body_graph`] — bookkeeping for
//! cross-file body proxy CFGs.
//!
//! The implementation functions (`inline_analyse_callee`,
//! `apply_cached_shape`, `extract_inline_return_taint`) remain in the
//! parent `mod.rs` because they depend tightly on the block worklist, the
//! `run_ssa_taint_full` entry point, and the callee-resolution pipeline.
//!
//! # Cache key scope and origin attribution
//!
//! The inline-analysis cache below ([`InlineCache`]) is keyed by
//! `(FuncKey, ArgTaintSig)`, where [`ArgTaintSig`] encodes **per-arg
//! capability bits only** — not the identity of the source
//! [`crate::taint::domain::TaintOrigin`]s that produced those caps. The
//! stored value ([`CachedInlineShape`]) captures **only the structural**
//! shape of the callee's return taint: return caps, callee-internal
//! origins (from `Source` ops inside the callee body), and per-parameter
//! provenance flags that record which formal parameters contributed to
//! the return. Caller-specific origin identity is *not* stored — it is
//! re-attributed at cache-apply time from the current call site's
//! argument taint.
//! The cache ([`InlineCache`]) is keyed by `(FuncKey, ArgTaintSig)`,
//! where [`ArgTaintSig`] is per-arg cap bits only (not origin identity).
//! Stored values ([`CachedInlineShape`]) capture the structural shape of
//! the callee's return taint; caller-specific origins are re-attributed
//! at apply time.
use crate::labels::Cap;
use crate::ssa::ir::{SsaBody, Terminator};
@ -42,61 +18,30 @@ use std::collections::HashMap;
/// Maximum SSA blocks in a callee body before skipping inline analysis.
pub(super) const MAX_INLINE_BLOCKS: usize = 500;
/// Compact cache key: per-arg-position cap bits (sorted, non-empty only).
///
/// Two calls with identical `ArgTaintSig` produce identical inline results
/// for soundness purposes (return caps, callee-internal sink activations).
/// Origin identity is **not** part of the key — see the module-level note
/// above on origin-attribution non-determinism.
/// Compact cache key: per-arg-position cap bits (sorted, non-empty
/// only). Origin identity is not part of the key.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub(crate) struct ArgTaintSig(pub(super) SmallVec<[(usize, u16); 4]>);
/// Call-site-adapted result of inline-analyzing a callee.
///
/// Constructed fresh per call site by `apply_cached_shape` from a stored
/// [`CachedInlineShape`]; carries origins that point to the *current*
/// caller's source chain, not to whichever caller first populated the
/// cache entry.
/// Call-site-adapted result of inline-analyzing a callee. Built fresh
/// per call site so origins point to the current caller's chain.
#[derive(Clone, Debug)]
pub(crate) struct InlineResult {
/// Taint on the return value after inline analysis.
pub(super) return_taint: Option<VarTaint>,
/// PathFact on the return value after inline analysis.
///
/// Non-top when the callee's body provably narrows the
/// [`crate::abstract_interp::PathFact`] of the value it returns (for
/// example, a `sanitize_path(s) -> Option<String>` helper that
/// early-returns on `s.contains("..")` / `s.starts_with('/')`). At
/// apply time the caller sets its call-result SSA value's PathFact to
/// this narrowed fact, so downstream FILE_IO sinks see the sanitised
/// axis regardless of whether a named label-rule exists for the
/// helper. Top when the callee produces no narrowing — matches
/// pre-PathFact behaviour exactly.
/// PathFact on the return value. Non-top when the callee body
/// provably narrows it (e.g. a `sanitize_path` early-returning on
/// `s.contains("..")`).
pub(super) return_path_fact: crate::abstract_interp::PathFact,
/// Per-return-path decomposition of [`Self::return_path_fact`].
///
/// Non-empty when the callee has ≥2 distinct return blocks whose
/// predicate gates differ. Match-arm-sensitive callers pick the
/// entry whose `variant_inner_fact` matches the arm binding's
/// variant; path-resolvable callers may refuse infeasible entries.
/// Callers unable to distinguish paths still consult
/// [`Self::return_path_fact`] (the join of all entries) and see
/// pre-decomposition behaviour.
/// Per-return-path decomposition of `return_path_fact`. Non-empty
/// when the callee has ≥2 return blocks with different predicate
/// gates.
#[allow(dead_code)]
pub(super) return_path_facts: SmallVec<[PathFactReturnEntry; 2]>,
}
/// Structural (callsite-agnostic) summary of an inline-analyzed callee.
///
/// Stored in [`InlineCache`] in place of a fully-attributed `InlineResult`.
/// Origin-identity information that depends on the caller's argument chain
/// is *not* kept here; instead, [`ReturnShape::param_provenance`]
/// records which callee parameter positions contributed seed taint to the
/// return, and the actual caller origins are re-unioned in at apply time.
///
/// `None` means "this callee produced no return taint for the given
/// argument shape". A cached `None` is still a meaningful result — it
/// short-circuits re-analysis on subsequent calls with matching caps.
/// Structural (callsite-agnostic) summary of an inline-analyzed
/// callee. `None` means "no return taint for this arg shape"; a cached
/// `None` is still meaningful because it short-circuits subsequent calls
/// with matching caps.
#[derive(Clone, Debug)]
pub(crate) struct CachedInlineShape(pub(super) Option<ReturnShape>);
@ -107,7 +52,7 @@ pub(crate) struct CachedInlineShape(pub(super) Option<ReturnShape>);
/// origins. See the module-level note above on origin attribution.
#[derive(Clone, Debug)]
pub(crate) struct ReturnShape {
/// Return value caps (cap bits only structural).
/// Return value caps (cap bits only, structural).
pub(super) caps: Cap,
/// Origins produced **inside the callee body** (e.g. `Source` op fired
/// in the callee). `node` is set to a placeholder; at apply time the
@ -115,31 +60,19 @@ pub(crate) struct ReturnShape {
/// stable (from the callee CFG) and preserved as-is.
pub(super) internal_origins: SmallVec<[TaintOrigin; 2]>,
/// Bit i set = callee's `Param(i)` seed taint reached the return value.
/// At apply time, caller's argument origins at matching positions are
/// unioned into the applied `VarTaint`. Params beyond index 63 are
/// dropped (matching `SmallBitSet` semantics); the capped case is rare
/// and still yields cap-correct results.
/// At apply time, caller arg origins at matching positions are
/// unioned into the applied `VarTaint`. Params beyond 63 are
/// dropped (matches `SmallBitSet`); rare and still cap-correct.
pub(super) param_provenance: u64,
/// Whether the receiver (`SelfParam`) seed taint flowed to the return.
/// Whether the receiver (`SelfParam`) seed taint flowed to return.
pub(super) receiver_provenance: bool,
/// Whether the applied `VarTaint` should be tagged `uses_summary`.
pub(super) uses_summary: bool,
/// PathFact of the return value observed from the callee's exit
/// abstract state. Cache-safe because the callee is inline-analysed
/// with [`crate::abstract_interp::PathFact::top`] Param seeds — the
/// resulting fact describes the callee's intrinsic narrowing (e.g.
/// the `Some` arm of a `sanitize(..) -> Option<String>` body
/// proves `dotdot = No`) and does not depend on caller-side
/// narrowing of the argument's PathFact. Top when the callee does
/// not narrow.
/// PathFact of the return value, observed from the callee exit
/// state under Top-seeded Params. Describes the callee's intrinsic
/// narrowing.
pub(super) return_path_fact: crate::abstract_interp::PathFact,
/// Per-return-path [`PathFact`] decomposition of the return value.
///
/// Populated alongside [`Self::return_path_fact`] when the callee
/// has ≥2 distinct return blocks with different predicate gates.
/// Cache-safe for the same reason as `return_path_fact`: entries
/// describe callee-intrinsic narrowing under Top-seeded Params.
/// Empty when no per-path distinction was observed.
/// Per-return-path decomposition of the return value. Populated
/// when the callee has ≥2 return blocks with different predicates.
pub(super) return_path_facts: SmallVec<[PathFactReturnEntry; 2]>,
}
@ -151,50 +84,21 @@ impl CachedInlineShape {
}
}
/// Cache for context-sensitive inline analysis results.
///
/// Keyed by the callee's canonical [`FuncKey`] rather than a bare function
/// name so that same-name definitions (e.g. two `process/1` methods on
/// different classes in the same file) never share or overwrite each
/// other's cache entries. Values are stored as [`CachedInlineShape`]; see
/// the module-level note above for why origins are stripped from the
/// cache value and re-attributed at apply time.
/// Cache for context-sensitive inline analysis results, keyed by
/// canonical [`FuncKey`] so same-name definitions in different scopes
/// never collide.
pub(crate) type InlineCache = HashMap<(FuncKey, ArgTaintSig), CachedInlineShape>;
/// Drop every entry from an inline cache, marking the start of a new
/// convergence epoch.
///
/// Cross-file SCC fixed-point iteration runs pass 2 repeatedly until the
/// merged summaries stop changing. Between iterations the callee-summary
/// inputs to inline analysis may have changed, so results cached under a
/// stale snapshot must not leak into the next iteration — otherwise the
/// engine could converge to a non-fixed-point (reporting a taint result
/// that would not reproduce on a fresh run of the same file order).
///
/// The per-file inline cache is already reconstructed fresh at the top of
/// each [`crate::taint::analyse_file`] call, so in the current code this
/// call is effectively a no-op plumbing hook. Keeping the method (instead
/// of relying on ambient re-construction) makes the lifecycle explicit for
/// any future refactor that moves the cache up into the SCC orchestrator.
#[allow(dead_code)] // semantic hook; used by tests and future shared-cache refactor
/// Drop every entry from the inline cache, marking the start of a new
/// convergence epoch.
///
/// Meant to be called between SCC fixed-point iterations so results cached
/// under a stale snapshot don't leak into the next iteration.
// `dead_code` is allowed deliberately: the previous revision's note
// describes this as a semantic hook used by tests and a future
// shared-cache refactor.
#[allow(dead_code)]
pub(crate) fn inline_cache_clear_epoch(cache: &mut InlineCache) {
// Removes all entries from the map in place.
cache.clear();
}
/// Set-equal fingerprint of an inline cache, used by the SCC orchestrator
/// to detect when cross-file inline analysis has reached a fixed point
/// alongside summary convergence.
///
/// Returns a `HashMap` mapping each `(FuncKey, ArgTaintSig)` cache key to
/// the return-value capability bits of its inline result. `HashMap`
/// equality is set-equal (unordered), so two caches with the same entries
/// compare equal regardless of insertion order.
///
/// Origins are intentionally omitted — they are non-deterministic across
/// callers with identical caps (see the module-level note on origin
/// attribution) and would cause the fingerprint to oscillate without
/// reflecting a real precision change.
#[allow(dead_code)] // observability hook; used by tests and future shared-cache refactor
/// Set-equal fingerprint of the inline cache, used by the SCC
/// orchestrator to detect convergence.
#[allow(dead_code)]
pub(crate) fn inline_cache_fingerprint(
cache: &InlineCache,
) -> HashMap<(FuncKey, ArgTaintSig), u16> {
@ -206,24 +110,11 @@ pub(crate) fn inline_cache_fingerprint(
/// CFG node metadata embedded in cross-file callee bodies.
///
/// ## Why a full [`crate::cfg::NodeInfo`] lives here
///
/// An earlier variant carried only the two fields the symex executor reads
/// (`bin_op`, `labels`). That was sufficient for symex but not for the
/// taint engine, which reads ~20 fields off `cfg[inst.cfg_node]` across
/// `transfer_inst`, `collect_block_events`, `compute_succ_states`, and
/// helpers (callee name, `arg_uses`, `arg_callees`, `call_ordinal`,
/// `outer_callee`, `kwargs`, `arg_string_literals`, `ast.span`,
/// `ast.enclosing_func`, `condition_*`, `all_args_literal`, `catch_param`,
/// `parameterized_query`, `in_defer`, `cast_target_type`, `string_prefix`,
/// `taint.uses`, `taint.defines`, `taint.extra_defines`,
/// `taint.const_text`, …). Rather than shuttling each of those through a
/// `CfgView` accessor at every callsite, we store a full serde-able
/// [`crate::cfg::NodeInfo`] snapshot here so the indexed-scan path can
/// rehydrate an equivalent `Cfg` on load (see [`rebuild_body_graph`]).
/// Both scan paths then feed the same `&Cfg` into the taint engine, and
/// cross-file inline fires regardless of whether the body came from pass
/// 1 or from SQLite.
/// Stores a full serde-able [`crate::cfg::NodeInfo`] snapshot rather
/// than projecting individual fields, so the indexed-scan path can
/// rehydrate an equivalent `Cfg` (see [`rebuild_body_graph`]) and feed
/// the same `&Cfg` into the taint engine regardless of whether the
/// body came from pass 1 or SQLite.
#[derive(Clone, Debug, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct CrossFileNodeMeta {
/// Full `NodeInfo` snapshot for this body-local NodeIndex.
@ -268,7 +159,7 @@ pub fn populate_node_meta(body: &mut CalleeSsaBody, cfg: &crate::cfg::Cfg) -> bo
// `compute_succ_states` via `cfg[*cond]`, so without it the synthesized
// cross-file proxy CFG (`rebuild_body_graph`) ends up too small whenever
// the callee body has any conditional branch whose `cond` index sits
// past the maximum `inst.cfg_node` index; inline analysis then panics
// past the maximum `inst.cfg_node` index, inline analysis then panics
// with an out-of-bounds index.
let mut referenced: Vec<NodeIndex> = Vec::new();
for block in &body.ssa.blocks {
@ -320,7 +211,7 @@ pub fn rebuild_body_graph(body: &mut CalleeSsaBody) -> bool {
// index. We fill any unreferenced intermediate indices with
// `NodeInfo::default()`.
//
// Walks both instruction `cfg_node`s and `Terminator::Branch.cond`
// Walks both instruction `cfg_node`s and `Terminator::Branch.cond`;
// the latter is read by `compute_succ_states` via `cfg[*cond]`, so
// missing it produces an OOB panic when a conditional branch's cond
// node has a higher index than any `inst.cfg_node` in the body.
@ -339,7 +230,7 @@ pub fn rebuild_body_graph(body: &mut CalleeSsaBody) -> bool {
}
}
}
// Also consider node_meta keys they should be a subset of the
// Also consider node_meta keys; they should be a subset of the
// SSA-referenced indices, but be defensive.
for &k in body.node_meta.keys() {
if k > max_idx {

File diff suppressed because it is too large Load diff

View file

@ -2,7 +2,7 @@
//! the original monolithic `ssa_transfer.rs`.
//!
//! Contains:
//! * [`SsaTaintState`] the per-block lattice value with `values`,
//! * [`SsaTaintState`], the per-block lattice value with `values`,
//! `validated_must`/`validated_may`, `predicates`, `heap`, `path_env`,
//! `abstract_state`.
//! * [`BindingKey`] / [`seed_lookup`] for cross-body taint seeding.
@ -25,7 +25,7 @@ use std::collections::HashMap;
// NOTE: The per-SSA-value origin cap used to be a hardcoded
// `MAX_ORIGINS: usize = 4`. It is now governed by the stable
// `analysis.engine.max_origins` option (default `32`) see
// `analysis.engine.max_origins` option (default `32`); see
// `crate::utils::analysis_options` and [`effective_max_origins`]. The
// test-only override below still short-circuits the config read so
// `engine_notes_tests.rs` can force a tiny cap to trigger truncation
@ -42,7 +42,7 @@ static WORKLIST_CAP_OVERRIDE: std::sync::atomic::AtomicUsize =
std::sync::atomic::AtomicUsize::new(0);
/// Records the MAX iteration count observed across every
/// `run_ssa_taint_full` call since the most recent reset. Cheaper and
/// more useful for regression tests than the last-call value a cap
/// more useful for regression tests than the last-call value: a cap
/// hit anywhere in the scan is remembered.
pub(super) static MAX_WORKLIST_ITERATIONS: std::sync::atomic::AtomicUsize =
std::sync::atomic::AtomicUsize::new(0);
@ -90,7 +90,7 @@ pub fn reset_worklist_observability() {
/// force `OriginsTruncated` emission on small fixtures.
static MAX_ORIGINS_OVERRIDE: std::sync::atomic::AtomicUsize =
std::sync::atomic::AtomicUsize::new(0);
/// Total number of origins dropped since the most recent reset captured
/// Total number of origins dropped since the most recent reset, captured
/// from `merge_origins` and the post-hoc saturation scan. Used by tests
/// to detect truncation events that don't propagate to a finding (e.g.
/// when the cap is so tight no taint flow survives to emit a sink event).
@ -136,7 +136,7 @@ pub fn reset_origins_observability() {
thread_local! {
/// Per-body engine-note collector. Cleared at the start of each
/// `analyse_body_with_seed` invocation and drained after
/// `run_ssa_taint_full` returns notes are then attached to every
/// `run_ssa_taint_full` returns; notes are then attached to every
/// finding emitted from that body. Living as a thread-local avoids
/// threading a `&RefCell` through the nearly-10-argument transfer
/// struct; inline analysis recursion is intentionally allowed to
@ -148,7 +148,7 @@ thread_local! {
/// was suppressed by an SSA-engine path-safety proof (PathFact
/// `dotdot=No && absolute=No`). Populated by `is_path_safe_for_sink`
/// and consumed by the state-analysis pass to suppress
/// `state-unauthed-access` on the same sink when the taint engine
/// `state-unauthed-access` on the same sink: when the taint engine
/// has already proved the user-controlled input cannot escape into a
/// privileged location, the auth concern on that sink is reduced.
/// Reset at start of `analyse_file`, drained before state analysis.
@ -156,7 +156,7 @@ thread_local! {
RefCell::new(std::collections::HashSet::new());
/// File-level set of CFG sink spans where the SSA engine emitted an
/// `all_validated` event every tainted input to the sink passed
/// `all_validated` event: every tainted input to the sink passed
/// through a recognised validation/sanitisation predicate before
/// reaching it. Distinct from `PATH_SAFE_SUPPRESSED_SPANS`, which
/// is FILE_IO-scoped and feeds state analysis: this set is
@ -167,7 +167,7 @@ thread_local! {
///
/// Without this signal the suppression gate has to fall back to
/// "function emitted at least one taint-unsanitised-flow finding"
/// or "function contains a labelled Sanitizer node" both of
/// or "function contains a labelled Sanitizer node", both of
/// which miss validated/dominated/early-return safety where the
/// engine cleared the flow without firing or hitting an explicit
/// sanitiser.
@ -227,7 +227,7 @@ pub fn take_path_safe_suppressed_spans() -> std::collections::HashSet<(usize, us
/// Record a sink CFG-node span where the SSA engine proved every
/// tainted input was validated (`SsaTaintEvent::all_validated`).
/// Cap-agnostic fires for any sink the engine evaluated and cleared.
/// Cap-agnostic: fires for any sink the engine evaluated and cleared.
/// Consumed by `TaintSuppressionCtx::build` as positive evidence that
/// taint analysis reached this line and proved safety, so AST-pattern
/// findings on the same line can be suppressed without misclassifying
@ -263,7 +263,7 @@ pub fn take_all_validated_spans() -> std::collections::HashSet<(usize, usize)> {
/// into the seed map always specify the owning body's id; readers look
/// up by the scope they know they want (typically their own
/// `parent_body_id`, with a fallback to `BodyId(0)` for entries that
/// the JS/TS two-level solve has re-keyed onto the top-level scope
/// the JS/TS two-level solve has re-keyed onto the top-level scope;
/// see [`crate::taint::ssa_transfer::filter_seed_to_toplevel`]).
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
pub struct BindingKey {
@ -284,7 +284,7 @@ impl BindingKey {
/// Look up a binding in a seed map.
///
/// Thin wrapper over [`HashMap::get`] retained for call-site readability
/// every seed entry is now exactly scoped to a single `(name,
/// only: every seed entry is now exactly scoped to a single `(name,
/// BodyId)`, so the lookup is O(1) with no fallback. Writers that want
/// cross-scope reachability must explicitly re-key their entries (see
/// [`crate::taint::ssa_transfer::filter_seed_to_toplevel`]).
@ -299,7 +299,7 @@ pub fn seed_lookup<'a>(
/// Compact key for a heap-field taint cell.
///
/// `(loc, field)` `loc` is the abstract location of the *parent*
/// `(loc, field)`: `loc` is the abstract location of the *parent*
/// (interned by the body's [`crate::pointer::LocInterner`]), `field`
/// is the [`FieldId`] of the projected field. The pair survives lattice
/// joins / leq comparisons by `Ord`-derived sort.
@ -309,16 +309,16 @@ pub struct FieldTaintKey {
pub field: FieldId,
}
/// Pointer-Phase 4 / W4: per-field-cell taint record.
/// Per-field-cell taint record.
///
/// Carries the union of writers' taint for the abstract field cell plus
/// two validation channels:
/// * `validated_must` set when *every* writer recorded a value that was
/// * `validated_must`, set when *every* writer recorded a value that was
/// `validated_must` in its own SSA scope. Lattice join intersects
/// (`AND`) matching the symbol-keyed [`SsaTaintState::validated_must`]
/// (`AND`), matching the symbol-keyed [`SsaTaintState::validated_must`]
/// semantics for "validated on every path".
/// * `validated_may` set when *any* writer recorded a `validated_may`
/// value. Lattice join unions (`OR`) matching the symbol-keyed
/// * `validated_may`, set when *any* writer recorded a `validated_may`
/// value. Lattice join unions (`OR`), matching the symbol-keyed
/// [`SsaTaintState::validated_may`] semantics for "validated on some
/// path".
///
@ -332,7 +332,7 @@ pub struct FieldCell {
}
impl FieldCell {
/// Construct a cell with no validation bits convenience for the
/// Construct a cell with no validation bits; a convenience for the
/// pre-W4 callers that don't propagate symbol-level validation.
pub fn unvalidated(taint: VarTaint) -> Self {
Self {
@ -365,17 +365,17 @@ pub struct SsaTaintState {
/// interpretation is disabled (`analysis.engine.abstract_interpretation
/// = false`).
pub abstract_state: Option<AbstractState>,
/// Pointer-Phase 3: per-heap-field taint cells, keyed by
/// Per-heap-field taint cells, keyed by
/// `(parent_loc, field)`. Sorted by [`FieldTaintKey`] for O(n)
/// merge-join. Populated only when the body's
/// [`crate::pointer::PointsToFacts`] is available
/// (`NYX_POINTER_ANALYSIS=1`); empty otherwise so the lattice join
/// is a strict no-op for pointer-disabled runs. Field reads
/// (`SsaOp::FieldProj`) consult the cells; field writes record into
/// them. Cross-call propagation lands in Phase 5 via the
/// them. Cross-call propagation lands during lowering via the
/// field-granularity `PointsToSummary`.
///
/// Cell shape (Phase 4 / W4): [`FieldCell`] carries `taint` plus
/// Cell shape: [`FieldCell`] carries `taint` plus
/// `validated_must` / `validated_may` flags so validation flows
/// through abstract field / element identity.
pub field_taint: SmallVec<[(FieldTaintKey, FieldCell); 4]>,
@ -403,7 +403,7 @@ impl SsaTaintState {
}
}
/// Pointer-Phase 3: read the field cell at `key`. Returns `None`
/// Read the field cell at `key`. Returns `None`
/// when no cell has been recorded (caller should treat as
/// untainted). O(log n) on the sorted [`field_taint`] list.
pub fn get_field(&self, key: FieldTaintKey) -> Option<&FieldCell> {
@ -413,13 +413,13 @@ impl SsaTaintState {
.map(|idx| &self.field_taint[idx].1)
}
/// Pointer-Phase 3 / W4: union `t` into the field cell at `key`,
/// Union `t` into the field cell at `key`,
/// recording per-write `validated_must` / `validated_may` channels.
///
/// Maintains sorted invariant. No-op when `t.caps` is empty (so the
/// lattice bottom stays `[]`). When the cell already exists, the
/// validation channels merge with the lattice-join semantics
/// `must` AND-intersects, `may` OR-unions matching the symbol-
/// validation channels merge with the lattice-join semantics
/// (`must` AND-intersects, `may` OR-unions), matching the symbol-
/// keyed [`SsaTaintState::validated_must`] / `validated_may`
/// semantics so a write coming through a non-validated path tears
/// down `must` while preserving `may` of any earlier validated path.
@ -563,15 +563,15 @@ impl Lattice for SsaTaintState {
}
}
/// Pointer-Phase 3 / W4: merge-join two sorted `field_taint` lists.
/// Merge-join two sorted `field_taint` lists.
/// Same shape as [`merge_join_ssa_vars`] but keyed on [`FieldTaintKey`]:
/// * `taint.caps` OR-union
/// * `taint.origins` merged with cap-respecting de-dup
/// * `taint.uses_summary` OR-union
/// * `validated_must` AND-intersect (matches the symbol-keyed
/// * `taint.caps`: OR-union
/// * `taint.origins`: merged with cap-respecting de-dup
/// * `taint.uses_summary`: OR-union
/// * `validated_must`: AND-intersect (matches the symbol-keyed
/// `validated_must` lattice: a path that didn't validate this cell
/// breaks the invariant)
/// * `validated_may` OR-union (any path's validation contributes)
/// * `validated_may`: OR-union (any path's validation contributes)
pub(super) fn merge_join_field_taint(
a: &[(FieldTaintKey, FieldCell)],
b: &[(FieldTaintKey, FieldCell)],
@ -581,7 +581,7 @@ pub(super) fn merge_join_field_taint(
while i < a.len() && j < b.len() {
match a[i].0.cmp(&b[j].0) {
std::cmp::Ordering::Less => {
// Cell present only in `a` counterpart in `b` is the
// Cell present only in `a`, counterpart in `b` is the
// lattice bottom (no validation, no taint), so:
// must = a.must AND false = false
// may = a.may OR false = a.may
@ -637,11 +637,11 @@ pub(super) fn merge_join_field_taint(
/// `a ≤ b` for sorted `field_taint` lists. Used by the convergence
/// check in [`Lattice::leq`]. Per-cell criteria:
///
/// * `taint.caps` `a ⊆ b` (sub-state on caps; matches per-SSA-value
/// * `taint.caps`: `a ⊆ b` (sub-state on caps; matches per-SSA-value
/// `ssa_vars_leq`).
/// * `validated_must` `a.must ⊇ b.must` (super-state on must; same
/// * `validated_must`: `a.must ⊇ b.must` (super-state on must; same
/// shape as the symbol-keyed `validated_must` leq).
/// * `validated_may` `a.may ⊆ b.may` (sub-state on may).
/// * `validated_may`: `a.may ⊆ b.may` (sub-state on may).
///
/// When `b` lacks a key present in `a`, `b`'s side is the lattice
/// bottom: no caps, no validation. `a`'s caps must also be empty
@ -669,12 +669,12 @@ pub(super) fn field_taint_leq(
if (ca.taint.caps - cb.taint.caps).bits() != 0 {
return false;
}
// Must: a ⊇ b every must-validated key in b is must-validated
// Must: a ⊇ b, every must-validated key in b is must-validated
// in a. Equivalently: !cb.must OR ca.must.
if cb.validated_must && !ca.validated_must {
return false;
}
// May: a ⊆ b every may-validated key in a is may-validated
// May: a ⊆ b, every may-validated key in a is may-validated
// in b. Equivalently: !ca.may OR cb.may.
if ca.validated_may && !cb.validated_may {
return false;
@ -735,7 +735,7 @@ pub(super) fn merge_join_ssa_vars(
///
/// Ordering is lexicographic over
/// `(source_span_start, source_span_end, source_kind_tag, node_index)`.
/// `source_span` is the most stable component across bodies cross-body
/// `source_span` is the most stable component across bodies, cross-body
/// remapped origins carry the original byte span explicitly; intra-body
/// origins default to `(0, 0)` and fall through to the secondary keys.
///
@ -760,7 +760,7 @@ fn origin_sort_key(o: &TaintOrigin) -> (usize, usize, u8, usize) {
/// Bounded, deterministic insertion of an origin into a sorted origin
/// set. Returns `true` when `new` was admitted (or de-duplicated against
/// an existing entry), `false` when the cap forced a drop. On drop,
/// the origin with the *largest* sort key is evicted first the caller
/// the origin with the *largest* sort key is evicted first, the caller
/// sees a survivor set that depends only on the input multiset and
/// [`effective_max_origins`], not on insertion order.
///
@ -774,7 +774,7 @@ pub(crate) fn push_origin_bounded(
) -> bool {
// Identity check: same node counts as the same origin. We keep
// node-only dedup to match [`ssa_vars_leq`], which compares origin
// sets by node membership widening dedup here without tightening
// sets by node membership, widening dedup here without tightening
// there would break the monotonicity invariant.
if target.iter().any(|o| o.node == new.node) {
return true;
@ -814,7 +814,7 @@ pub(crate) fn push_origin_bounded(
target.insert(pos, new);
true
} else {
// `new` itself is the worst drop it instead of the survivor.
// `new` itself is the worst, drop it instead of the survivor.
false
}
}
@ -829,7 +829,7 @@ pub(super) fn merge_origins(
a: &SmallVec<[TaintOrigin; 2]>,
b: &SmallVec<[TaintOrigin; 2]>,
) -> SmallVec<[TaintOrigin; 2]> {
// Seed the result with `a` but re-sort defensively in case the
// Seed the result with `a`, but re-sort defensively in case the
// caller constructed `a` through non-bounded paths. Historically
// every write goes through `push_origin_bounded` (or `merge_origins`
// itself), so this resort is a no-op on the steady state but costs
@ -911,7 +911,7 @@ pub(super) fn merge_join_ssa_predicates(
mod origin_cap_tests {
//! Tests for the deterministic, config-driven origin cap. These
//! cover the behavior at the `push_origin_bounded` / `merge_origins`
//! boundary the end-to-end engine-note signal is exercised in
//! boundary, the end-to-end engine-note signal is exercised in
//! `tests/engine_notes_tests.rs`.
use super::*;
@ -1037,7 +1037,7 @@ mod origin_cap_tests {
fn effective_cap_reads_runtime_config_when_override_zero() {
// Override takes priority; override=0 falls through to config.
// `current()` returns the default (32) when no runtime is
// installed which is the state the rest of the test suite runs
// installed, which is the state the rest of the test suite runs
// under. Guard that the fallback path reaches 32.
let _g = TEST_GUARD.lock().unwrap_or_else(|e| e.into_inner());
set_max_origins_override(0);
@ -1053,7 +1053,7 @@ mod origin_cap_tests {
#[cfg(test)]
mod field_taint_tests {
//! Pointer-Phase 3: tests for the heap-field taint cells on
//! Tests for the heap-field taint cells on
//! [`SsaTaintState`]. Cover get/add round-trip, lattice join
//! (cap union + origin merge), and `leq` convergence semantics.
use super::*;
@ -1202,7 +1202,7 @@ mod field_taint_tests {
assert!(cell.validated_must, "a.must AND b.must = true");
assert!(cell.validated_may);
// Now make `b`'s validated_must false must should drop to
// Now make `b`'s validated_must false, must should drop to
// false on the join, may stays at OR.
let mut c = SsaTaintState::initial();
c.add_field(k, taint(Cap::ENV_VAR), false, true);
@ -1213,7 +1213,7 @@ mod field_taint_tests {
}
/// W4 audit: `merge_join_field_taint` OR-unions `validated_may`
/// any path's may-validation contributes to the joined cell.
/// (any path's may-validation contributes to the joined cell).
#[test]
fn lattice_validated_may_unions_on_join() {
let k = key(1, 7);
@ -1275,7 +1275,7 @@ mod field_taint_tests {
a.leq(&b),
"must super-state and equal caps: a ≤ b should hold"
);
// Reverse: b.must=false, a.must=true for b ≤ a, we need
// Reverse: b.must=false, a.must=true, for b ≤ a, we need
// b.must ⊇ a.must which is false ⊇ true = false. So b ≤ a
// must fail.
assert!(!b.leq(&a), "b lacks the must invariant a holds");
@ -1289,7 +1289,7 @@ mod field_taint_tests {
assert!(!a2.leq(&b2), "a.may=true is NOT ⊆ b.may=false");
}
/// Pointer-Phase 3 / A8 audit: the field_taint lattice is monotone
/// The `field_taint` lattice is monotone
/// and converges under a deterministic enumeration of inputs.
/// Caps grow (OR), `uses_summary` grows (OR), origins grow modulo
/// the cap (merge_origins is bounded). Joins must:
@ -1409,7 +1409,7 @@ mod field_taint_tests {
/// `field_taint_leq` is the soundness gate for worklist
/// convergence: once `next ≤ acc`, the worklist halts. Pin that
/// `leq` is consistent with `join` i.e. `s.leq(s.join(t))` holds
/// `leq` is consistent with `join`, i.e. `s.leq(s.join(t))` holds
/// for any `s, t`. Without this, the worklist could loop
/// indefinitely on inputs whose join produces a state not
/// dominated by both inputs.

View file

@ -1,11 +1,11 @@
//! SSA function-summary and container-flow extraction.
//!
//! Extracted from the monolithic `ssa_transfer.rs`. Contains:
//! * [`extract_ssa_func_summary`] runs per-parameter taint probes and
//! * [`extract_ssa_func_summary`]: runs per-parameter taint probes and
//! synthesises an [`crate::summary::ssa_summary::SsaFuncSummary`] with
//! source caps, return transforms, per-path transforms, and sink site
//! attribution.
//! * [`extract_container_flow_summary`] structural scan for
//! * [`extract_container_flow_summary`]: structural scan for
//! `param_container_to_return` + `param_to_container_store` pairs.
//! * Private helpers for predicate-hash summarisation, abstract-transfer
//! derivation, callback source detection, and return-type inference.
@ -123,15 +123,15 @@ pub fn extract_ssa_func_summary_full(
.collect();
// Collect all param SSA values to exclude from return cap collection.
// Param values persist with their seeded taint throughout the function
// Param values persist with their seeded taint throughout the function;
// we only want caps on derived values (call results, assigns) at return.
let all_param_values: std::collections::HashSet<SsaValue> =
param_info.iter().map(|(_, _, v)| *v).collect();
// Per-return-block observation captured alongside the aggregate return
// caps. Each entry records one return block's exit state caps
// caps. Each entry records one return block's exit state, caps
// contributed on that path, path-predicate hash, known_true/false bits,
// and the return SSA value's abstract fact so the per-param loop can
// and the return SSA value's abstract fact, so the per-param loop can
// emit one [`ReturnPathTransform`] per distinct predicate gate.
struct ReturnBlockObs {
/// Caps at the return SSA value (or joined live values for
@ -141,7 +141,7 @@ pub fn extract_ssa_func_summary_full(
/// (passthrough fallback).
param_caps: Cap,
/// Deterministic hash of the predicate gate at this return.
/// `0` means "no predicate gate" an unguarded return.
/// `0` means "no predicate gate", an unguarded return.
predicate_hash: u64,
/// `PredicateSummary::known_true` bits intersected across all
/// tracked variables at this return. Encoded via
@ -268,7 +268,7 @@ pub fn extract_ssa_func_summary_full(
}
}
} else {
// Return(None): implicit return fall back to all live values.
// Return(None): implicit return, fall back to all live values.
for (val, taint) in &exit.values {
if all_param_values.contains(val) {
block_param_caps |= taint.caps;
@ -348,7 +348,7 @@ pub fn extract_ssa_func_summary_full(
// Per-return-path PathFact decomposition derived from the baseline
// probe (no seeded taint). Abstract facts on the return rv are
// independent of taint seeding they describe the function's
// independent of taint seeding, they describe the function's
// intrinsic narrowing, so the baseline run captures them without
// per-param noise.
//
@ -388,7 +388,7 @@ pub fn extract_ssa_func_summary_full(
let mut param_to_sink: Vec<(usize, SmallVec<[SinkSite; 1]>)> = Vec::new();
let mut param_to_sink_param = Vec::new();
// Per-param return-path decomposition. Populated only when the param
// has ≥2 distinct return-block predicate hashes a single-return-path
// has ≥2 distinct return-block predicate hashes, a single-return-path
// callee is already precise via `param_to_return`.
let mut param_return_paths: Vec<(
usize,
@ -417,7 +417,7 @@ pub fn extract_ssa_func_summary_full(
// expressions (e.g. `file._source.uri`) as their own
// [`SsaOp::Param`] ops with composite `var_name`s like
// `"file._source.uri"`. These phantom Params are the values
// actually used as call arguments not the formal-param SSA
// actually used as call arguments, not the formal-param SSA
// value the seed targets. Without this, the per-param probe
// misses cross-call sinks because the call's arg SSA value is
// a phantom Param with no seed entry, so `transfer_inst::Param`
@ -447,7 +447,7 @@ pub fn extract_ssa_func_summary_full(
let (return_caps, events, _, per_return_obs) = run_probe(seed);
// Subtract baseline source_caps we only want param-contributed caps
// Subtract baseline source_caps, we only want param-contributed caps
let param_return_caps = return_caps & !source_caps;
if !param_return_caps.is_empty() {
@ -464,7 +464,7 @@ pub fn extract_ssa_func_summary_full(
// observed return block, derive a `ReturnPathTransform` mirroring
// the aggregate logic (prefer derived caps, fall back to param
// caps, strip baseline source caps). Only emit when ≥2 distinct
// predicate hashes are present a single-hash summary adds no
// predicate hashes are present, a single-hash summary adds no
// signal over the aggregate `param_to_return`.
if per_return_obs.len() >= 2 {
let mut per_path: SmallVec<[crate::summary::ssa_summary::ReturnPathTransform; 2]> =
@ -477,7 +477,7 @@ pub fn extract_ssa_func_summary_full(
};
let block_contributed = block_return_caps & !source_caps;
let transform_kind = if block_contributed.is_empty() {
// No caps on this path param does not reach return
// No caps on this path, param does not reach return
// under this predicate. A `StripBits(all)` records
// "all bits cleared" so downstream join preserves the
// disparity with other paths.
@ -513,9 +513,31 @@ pub fn extract_ssa_func_summary_full(
}
}
// Collect sink caps + primary-location sites from events + per-arg-position detail
// Collect sink caps + primary-location sites from events + per-arg-position detail.
//
// Skip events flagged `all_validated`: every tainted SSA value
// that reached the sink was already proved validated by a
// dominating predicate (AllowlistCheck / TypeCheck /
// ValidationCall, including the indirect-validator branch
// narrowing for `validate*` / `is_valid*` callees). Those
// events would have been dropped by `ssa_events_to_findings` at
// the per-file finding step; carrying them into
// `param_to_sink` / `param_to_sink_param` re-publishes a sink
// attribution callers can no longer suppress, because the
// caller can't see the validator that lives inside the
// callee body.
//
// Strict-additive: `all_validated` is set only when every
// tainted operand at the sink has its `var_name` in
// `state.validated_may`, single-path single-validator helpers
// cleanly skip; mixed-tainted-with-some-unvalidated events
// still propagate. Closes the helper-summary precision gap
// surfaced by Novu CVE GHSA-4x48-cgf9-q33f.
let mut param_sites: SmallVec<[SinkSite; 1]> = SmallVec::new();
for event in &events {
if event.all_validated {
continue;
}
for pos in extract_sink_arg_positions(event, ssa) {
param_to_sink_param.push((idx, pos, event.sink_caps));
}
@ -601,14 +623,14 @@ pub fn extract_ssa_func_summary_full(
// Per-parameter abstract-domain transfers.
//
// Derived structurally from the SSA body no additional taint probes.
// Derived structurally from the SSA body, no additional taint probes.
// Three-step inference per parameter:
// 1. Identity: return SSA value at every return block traces back to
// this parameter (possibly through assigns / phi merges all feeding
// from the same param).
// 2. Callee-intrinsic bound: baseline `return_abstract` carries a
// concrete fact (bounded interval or known prefix) that holds
// regardless of caller input record it once per parameter as
// regardless of caller input, record it once per parameter as
// `Clamped` / `LiteralPrefix` so the caller sees the bound even
// when it has no abstract info on its own argument.
// 3. Top: default; the entry is omitted (empty transfer is meaningless).
@ -630,14 +652,14 @@ pub fn extract_ssa_func_summary_full(
param_return_paths,
return_path_facts,
points_to,
// Pointer-Phase 5 extension — empty until the field-granularity
// Extension field: empty until the field-granularity
// extractor is wired (`NYX_POINTER_ANALYSIS=1` only). Default
// path stays bit-identical to today.
field_points_to: crate::summary::points_to::FieldPointsToSummary::empty(),
// Populated post-extraction in
// `taint::lower_all_functions_from_bodies` once SSA optimisation
// has computed `opt.type_facts`. Empty here means the
// extractor itself doesn't carry receiver-type info the
// extractor itself doesn't carry receiver-type info, the
// caller patches it in.
typed_call_receivers: Vec::new(),
}
@ -699,14 +721,14 @@ pub(super) fn summarise_return_predicates(state: &SsaTaintState) -> (u64, u8, u8
///
/// `return_abstract` is the callee's intrinsic baseline (from the no-seed
/// probe). When present, it describes a fact that holds for the return
/// regardless of parameter input so it can be attached as a
/// regardless of parameter input, so it can be attached as a
/// `Clamped` / `LiteralPrefix` transform to every parameter that flows to
/// the return.
///
/// Identity detection is structural: walk the return values back through
/// [`SsaOp::Assign`] / [`SsaOp::Phi`] chains (bounded) and check whether
/// every leaf resolves to the same [`SsaOp::Param`]. The trace is cheap
/// and can only produce `Identity` for passthrough callees anything
/// and can only produce `Identity` for passthrough callees, anything
/// more complex degrades to the baseline fact or `Top`.
fn derive_abstract_transfer(
ssa: &SsaBody,
@ -780,7 +802,7 @@ fn derive_abstract_transfer(
}
// Derive a baseline-invariant transform from `return_abstract`. This is
// the "callee intrinsic" fact that always holds each parameter that
// the "callee intrinsic" fact that always holds, each parameter that
// flows to the return gets it attached as the conservative transfer.
let baseline_invariant: Option<AbstractTransfer> = return_abstract.map(|av| {
let interval = match (av.interval.lo, av.interval.hi) {
@ -805,7 +827,7 @@ fn derive_abstract_transfer(
} else if let Some(base) = baseline_invariant.as_ref() {
// Baseline intrinsic bound applies to every parameter that could
// reach the return. We conservatively attach it to all params
// at apply time the caller meets it with the real return
// (at apply time the caller meets it with the real return
// abstract, also from this same summary, so double-counting
// would collapse to the tighter of the two).
transfer = base.clone();
@ -879,7 +901,7 @@ fn infer_summary_return_type(
lang: Lang,
) -> Option<crate::ssa::type_facts::TypeKind> {
// Find blocks with Return terminators, then look at the last defined value
// in those blocks if it's a Call with a known constructor, that's our type.
// in those blocks, if it's a Call with a known constructor, that's our type.
for block in &ssa.blocks {
if !matches!(block.terminator, Terminator::Return(_)) {
continue;
@ -965,7 +987,7 @@ pub(crate) fn extract_container_flow_summary(
// `trace_to_param` will happily return any `SsaOp::Param { index }`, but
// scoped lowering synthesises `Param` ops for external captures (module
// imports, free identifiers) at indices beyond the formal parameter count.
// Those must not enter the summary the key's arity only covers formal
// Those must not enter the summary, the key's arity only covers formal
// params, and an out-of-range index trips `ssa_summary_fits_arity`, forcing
// the reconciliation probe to generate a synthetic disambiguator that no
// caller will ever look up.
@ -1035,7 +1057,7 @@ pub(crate) fn extract_container_flow_summary(
};
// Trace container to positional param (SelfParam → None, so
// when the container is the receiver we skip the caller
// when the container is the receiver we skip, the caller
// tracks that via `receiver_to_container_store` if needed).
// Same arity filter as above: reject synthetic Param ops that
// were injected for free captures.

View file

@ -221,7 +221,7 @@ mod cross_file_tests {
mod inline_cache_epoch_tests {
//! Hooks for cross-file SCC joint fixed-point iteration.
//!
//! These do not exercise the full inline pipeline they lock down the
//! These do not exercise the full inline pipeline, they lock down the
//! semantic contract of [`inline_cache_clear_epoch`] and
//! [`inline_cache_fingerprint`] so the SCC orchestrator can rely on:
//!
@ -229,7 +229,7 @@ mod inline_cache_epoch_tests {
//! * `fingerprint` is deterministic across equivalent caches (same
//! keys → same bytes). Two caches with identical entries produce
//! identical fingerprints regardless of insertion order.
//! * `fingerprint` changes when return caps change the signal the
//! * `fingerprint` changes when return caps change, the signal the
//! orchestrator will use to detect inline-cache convergence.
use super::super::*;
@ -675,7 +675,7 @@ mod worklist_tests {
#[test]
fn dense_successors_no_duplicates() {
// Many successors, some repeated old O(n) contains() would be slow here
// Many successors, some repeated, old O(n) contains() would be slow here
let mut wl = VecDeque::new();
let mut in_wl = HashSet::new();
@ -735,8 +735,8 @@ mod primary_sink_location_tests {
//! [`SsaTaintEvent::primary_sink_site`] →
//! [`crate::taint::Finding::primary_location`].
//!
//! The test is deliberately low-level it wires up synthetic SSA and
//! drives the three emission stages directly so any future refactor
//! The test is deliberately low-level, it wires up synthetic SSA and
//! drives the three emission stages directly, so any future refactor
//! that drops the site on the floor between stages fails here rather
//! than only at the corpus/benchmark layer.
use super::super::*;
@ -841,7 +841,7 @@ mod primary_sink_location_tests {
/// If this fails, something on the summary→event→finding path
/// (`pick_primary_sink_sites`, `emit_ssa_taint_events`, or
/// `ssa_events_to_findings`) has silently stopped forwarding
/// coordinates. Fixing that path — not this test — is the right
/// coordinates. Fixing that path, not this test, is the right
/// response.
#[test]
fn ssa_summary_sinksite_surfaces_as_finding_primary_location() {
@ -863,7 +863,7 @@ mod primary_sink_location_tests {
};
// Drive the three emission stages with the summary's own
// `param_to_sink` that is what summary resolution feeds in the
// `param_to_sink`, that is what summary resolution feeds in the
// real pipeline.
let tainted: Vec<(SsaValue, Cap, SmallVec<[TaintOrigin; 2]>)> = vec![(
SsaValue(0),
@ -944,7 +944,7 @@ mod goto_succ_propagation_tests {
#[test]
fn goto_propagates_to_every_succ_on_three_way_collapse() {
// Build a block with Terminator::Goto(1) but succs = [1, 2, 3] the
// Build a block with Terminator::Goto(1) but succs = [1, 2, 3], the
// shape lowering emits for a 3-way fanout.
let block = SsaBlock {
id: BlockId(0),
@ -1001,7 +1001,7 @@ mod goto_succ_propagation_tests {
pointer_facts: None,
};
// A non-bottom exit state the test only cares that *every* succ
// A non-bottom exit state, the test only cares that *every* succ
// receives a clone of it, so any distinguishable state works.
let mut exit_state = SsaTaintState::initial();
exit_state.values.push((
@ -1259,7 +1259,7 @@ mod goto_succ_propagation_tests {
fn is_path_safe_for_sink_unknown_axis_returns_false() {
// Checks that a `PathFact` built from the default with only
// `with_dotdot_cleared()` applied does not report `is_path_safe()`.
use crate::abstract_interp::PathFact;
// Only dotdot is cleared absolute stays Maybe → not path-safe.
// Only dotdot is cleared; absolute stays Maybe → not path-safe.
let half_fact = PathFact::default().with_dotdot_cleared();
assert!(!half_fact.is_path_safe());
}
@ -1328,9 +1328,9 @@ mod goto_succ_propagation_tests {
}
}
// ── Phase 4.2: receiver_candidates_for_type_lookup walks FieldProj ──────
// ── receiver_candidates_for_type_lookup walks FieldProj ──────
//
// After Phase 2 SSA decomposition, `c.client.send(req)` lowers to
// After SSA decomposition, `c.client.send(req)` lowers to
// v_c = Param("c", 0)
// v_client = FieldProj(v_c, "client")
// v_call = Call("send", receiver: v_client, args: [v_req])
@ -1430,7 +1430,7 @@ mod receiver_candidates_field_proj_tests {
fn field_proj_receiver_walks_to_typed_root_in_go() {
// Go is not Rust, so pre-Phase-4 the candidate walk would have
// returned ONLY the immediate receiver (v2 = FieldProj). With
// Phase 4 we walk through FieldProj.receiver to recover v0 (the
// the current walk we go through FieldProj.receiver to recover v0 (the
// typed root `c`).
let body = body_with_field_proj_chain();
let cands =
@ -1516,7 +1516,7 @@ mod receiver_candidates_field_proj_tests {
}
}
// ── Phase 6 hierarchy fan-out: ResolvedSummary union semantics ──────────
// ── Hierarchy: ResolvedSummary union semantics ──────────
//
// `merge_resolved_summaries_fanout` is invoked at virtual-dispatch call
// sites where the receiver's static type has multiple concrete
@ -1553,7 +1553,7 @@ mod fanout_merge_tests {
}
}
/// B1 caps that grow taint signal (source/sink/receiver_to_sink)
/// B1: caps that grow taint signal (source/sink/receiver_to_sink)
/// are unioned. sanitizer_caps are intersected so only bits
/// stripped by EVERY implementer count as cleared at the call site.
#[test]
@ -1581,7 +1581,7 @@ mod fanout_merge_tests {
);
}
/// B2 propagates_taint is OR'd; propagating_params is the union
/// B2: propagates_taint is OR'd; propagating_params is the union
/// (any implementer's propagator counts).
#[test]
fn merge_propagation_unions() {
@ -1600,7 +1600,7 @@ mod fanout_merge_tests {
assert_eq!(params, vec![0, 1, 2]);
}
/// B3 param_to_sink merges per-parameter caps (OR). An impl
/// B3: param_to_sink merges per-parameter caps (OR). An impl
/// that adds a sink at param N composes with another impl that
/// adds a different cap at the same N.
#[test]
@ -1630,7 +1630,7 @@ mod fanout_merge_tests {
);
}
/// B4 param_to_sink_sites merges per-parameter site lists with
/// B4: param_to_sink_sites merges per-parameter site lists with
/// PartialEq dedup. The same site appearing in both impls (e.g.
/// inherited definition) must not be reported twice.
#[test]
@ -1675,7 +1675,7 @@ mod fanout_merge_tests {
assert!(sites.iter().any(|s| s == &unique_b));
}
/// B5 SSA-precision fields are dropped on disagreement. Two
/// B5: SSA-precision fields are dropped on disagreement. Two
/// summaries with different `return_type` collapse to None;
/// agreement is preserved.
#[test]
@ -1704,7 +1704,7 @@ mod fanout_merge_tests {
);
}
/// B6 abstract_transfer + param_return_paths drop on
/// B6: abstract_transfer + param_return_paths drop on
/// disagreement (precise predicate-path data is not safely
/// composable across distinct function bodies).
#[test]
@ -1737,7 +1737,7 @@ mod fanout_merge_tests {
);
}
/// B7 empty + empty = empty (no panic on degenerate inputs).
/// B7: empty + empty = empty (no panic on degenerate inputs).
#[test]
fn merge_empties_is_identity() {
let m = merge_resolved_summaries_fanout(empty(), empty());
@ -1748,7 +1748,7 @@ mod fanout_merge_tests {
}
}
// ── Pointer-Phase 3 / W1: synthetic field-WRITE round-trip ──────────────
// ── Synthetic field-WRITE round-trip ──────────────
//
// SSA lowering populates `SsaBody.field_writes` with entries that lift a
// synthetic base-update Assign (`obj.f = rhs`) into a structural field
@ -1918,8 +1918,8 @@ mod field_write_tests {
crate::pointer::analyse_body(body, crate::cfg::BodyId(7))
}
/// Reuse `make_cfg`'s nodes the body's instructions all reference
/// them so `transfer_inst` can index `cfg[cfg_node]`.
/// Reuse `make_cfg`'s nodes: the body's instructions all reference
/// them, so `transfer_inst` can index `cfg[cfg_node]`.
fn drive(body: &SsaBody, pf: &PointsToFacts) -> SsaTaintState {
// We need a CFG that contains the bodies' cfg_nodes.
let (cfg, _, _, _, _) = make_cfg();
@ -1998,7 +1998,7 @@ mod field_write_tests {
/// Pointer-disabled run (`pointer_facts: None`): no field cell is
/// recorded, no taint flows through the `obj.cache` projection. The
/// strict-additive contract pointer-disabled behaviour is the
/// strict-additive contract: pointer-disabled behaviour is the
/// pre-W1 baseline.
#[test]
fn pointer_disabled_run_produces_no_field_taint() {
@ -2047,8 +2047,8 @@ mod field_write_tests {
state.field_taint.is_empty(),
"pointer-disabled run must not populate field_taint",
);
// FieldProj reads still produce the receiver's existing taint
// none so no entry for SsaValue(3) either.
// FieldProj reads still produce the receiver's existing taint
// (none), so no entry for SsaValue(3) either.
assert!(state.get(SsaValue(3)).is_none());
let _ = cache_id;
}
@ -2059,7 +2059,7 @@ mod field_write_tests {
/// projected value's symbol-level `validated_must` from the cell.
///
/// This is the key invariant: validation flows *through* abstract
/// field identity the read recovers what the write recorded.
/// field identity, the read recovers what the write recorded.
#[test]
fn write_then_read_preserves_validated_must() {
let (body, cache_id) = make_body();
@ -2208,7 +2208,7 @@ mod field_write_tests {
},
};
let pf = crate::pointer::analyse_body(&body, crate::cfg::BodyId(0));
// v0 is Const → empty pt the hook should not insert anything.
// v0 is Const → empty pt, the hook should not insert anything.
assert!(
pf.pt(SsaValue(0)).is_empty(),
"Const value should have empty pt set",
@ -2259,7 +2259,7 @@ mod field_write_tests {
}
}
// ── Pointer-Phase 4 / W2: container ELEM write/read round-trip ──────────
//── container ELEM write/read round-trip ──────────
//
// Container methods like `arr.push(v)` / `arr.shift()` flow per-element
// taint through the `Field(_, ELEM)` cells on `SsaTaintState`. These
@ -2351,7 +2351,7 @@ mod container_elem_tests {
state
}
/// `arr.push(source()); arr.shift()` the read picks the source's
/// `arr.push(source()); arr.shift()`, the read picks the source's
/// caps up via the ELEM cell.
#[test]
fn container_write_then_read_round_trips_taint() {
@ -2456,7 +2456,7 @@ mod container_elem_tests {
);
// Drive the transfer. `e := arr.shift()` goes through the
// existing Call arm the W2 path is the *write* on `push`.
// existing Call arm, the W2 path is the *write* on `push`.
// The element-read side already exists on `analyse_body`; the
// taint engine doesn't yet read field cells through call-result
// paths (Call args are walked by Call's own argument-taint
@ -2482,7 +2482,7 @@ mod container_elem_tests {
}
}
/// W4: `arr.push(validate(src)); arr.shift()` the push records
/// W4: `arr.push(validate(src)); arr.shift()`, the push records
/// `validated_must = true` on the ELEM cell because the pushed
/// value's symbol carried `validated_must`. The shift call result
/// reads through the cell and seeds the result symbol's
@ -2761,7 +2761,7 @@ mod container_elem_tests {
}
}
// ── Pointer-Phase 5 / W3: cross-call field-points-to application ────────
//── cross-call field-points-to application ────────
//
// `apply_field_points_to_writes` is the resolver-side hook that turns
// callee-summary `field_points_to.param_field_writes` into caller-side
@ -2783,7 +2783,7 @@ mod cross_call_field_tests {
use smallvec::smallvec;
use std::collections::HashMap;
/// W3 / W4: shared empty interner these unit tests don't seed
/// W3 / W4: shared empty interner, these unit tests don't seed
/// validation bits, so a fresh interner is sufficient for the
/// `interner` parameter on `apply_field_points_to_writes`.
fn empty_interner() -> SymbolInterner {
@ -2861,23 +2861,23 @@ mod cross_call_field_tests {
state
}
/// Callee summary with `param_field_writes[(0, ["cache"])]`
/// Callee summary with `param_field_writes[(0, ["cache"])]`:
/// "callee writes cache field on parameter 0 (obj)".
/// Caller passes `(obj, source)` to this callee `arg 0 = obj`,
/// Caller passes `(obj, source)` to this callee, `arg 0 = obj`,
/// but the W3 hook resolves the *value at arg position 0* as the
/// receiver of the field write, populating its pt's cells.
///
/// We model the caller as `callee(obj, source)` with arg 0 = obj
/// (the receiver) and arg 1 = source (the value being written).
/// The callee's signature is `fn store(obj, value) { obj.cache = value; }`
/// so the field write on param 0 is keyed by `pt(obj)` and the
/// so the field write on param 0 is keyed by `pt(obj)` and the
/// taint comes from arg 1's caps. Our helper conservatively unions
/// every arg's taint into the cell which over-tints (for this
/// every arg's taint into the cell, which over-tints (for this
/// shape, arg 0's pt member becomes the loc, with arg 0's own taint
/// applied), but is sound.
///
/// To make the test precise, we model the simpler shape `fn store(obj)
/// { obj.cache = source(); }` callee writes a literal source into
/// { obj.cache = source(); }`, callee writes a literal source into
/// `obj.cache`, with no value parameter. Then the caller-side hook
/// only sees param 0's taint (zero), so the cell is empty and the
/// test fails.
@ -2886,7 +2886,7 @@ mod cross_call_field_tests {
/// at the call site arg 0 carries source taint. The hook then
/// records (pt(arg0_value), cache) ← arg0_value's taint. In a
/// real callee this corresponds to "callee writes its parameter
/// value into a self.cache field internally" but the spread we
/// value into a self.cache field internally", but the spread we
/// validate is just substitute-and-mirror.
#[test]
fn cross_call_writes_into_param_field_cell() {
@ -2947,7 +2947,7 @@ mod cross_call_field_tests {
fn cross_call_receiver_field_uses_max_sentinel() {
let (body, cache_id, pf) = caller_body();
let mut state = SsaTaintState::initial();
// Seed receiver with taint SsaValue(0) is the param/receiver.
// Seed receiver with taint, SsaValue(0) is the param/receiver.
state.set(
SsaValue(0),
VarTaint {
@ -3026,7 +3026,7 @@ mod cross_call_field_tests {
);
}
/// Field names the caller never interned are skipped silently
/// Field names the caller never interned are skipped silently:
/// no FieldProj read in the caller could observe such a cell.
#[test]
fn cross_call_unknown_field_name_skipped() {
@ -3062,7 +3062,7 @@ mod cross_call_field_tests {
);
}
/// Overflow summary is treated conservatively as no-op the
/// Overflow summary is treated conservatively as no-op, the
/// engine cannot soundly cell-flood, so it skips entirely.
#[test]
fn cross_call_overflow_summary_is_noop() {
@ -3117,7 +3117,7 @@ mod cross_call_field_tests {
//
// `SsaTaintState.add_field` already routes through `merge_origins`, but
// the FieldProj READ path used to walk the cell's origins inline,
// deduping by node only meaning a cell with N>cap origins surfaced
// deduping by node only, meaning a cell with N>cap origins surfaced
// all N to the projected SSA value. After A7, the read path uses
// `push_origin_bounded`, ensuring the cap-driven survivor selection
// applies on read too.
@ -3225,7 +3225,7 @@ mod field_taint_origin_cap_tests {
let (body, cache_id, cfg, _n_proj) = build_body();
let pf = crate::pointer::analyse_body(&body, crate::cfg::BodyId(0));
// Pre-populate the (Param, cache) cell with 4 origins
// Pre-populate the (Param, cache) cell with 4 origins,
// 2× the cap. The `add_field` path already truncates via
// `merge_origins`, so we go through it 4 times to grow.
let mut state = SsaTaintState::initial();
@ -3326,14 +3326,14 @@ mod field_taint_origin_cap_tests {
// the field_taint cells.
//
// Two scenarios:
// 1. `must_validated_flows_through_join` both predecessor blocks
// 1. `must_validated_flows_through_join`, both predecessor blocks
// write the cell with `validated_must = true`. After the join, the
// cell at the read site retains `validated_must = true` (AND
// intersection of two `true`s).
// 2. `early_exit_branch_drops_validated_must` only one predecessor
// 2. `early_exit_branch_drops_validated_must`, only one predecessor
// writes; the other reaches the read block via an empty branch.
// After the join, the cell has `validated_must = false`,
// `validated_may = true` W4's must/may intersection in action.
// `validated_may = true`, W4's must/may intersection in action.
#[cfg(test)]
mod pointer_lattice_worklist_tests {
use super::super::*;
@ -3425,7 +3425,7 @@ mod pointer_lattice_worklist_tests {
succs: smallvec![BlockId(1), BlockId(2)],
};
// Block 1: synth `obj.cache = src` field_writes[v2] = (v0, cache_id)
// Block 1: synth `obj.cache = src`, field_writes[v2] = (v0, cache_id)
let block1 = SsaBlock {
id: BlockId(1),
phis: vec![],
@ -3441,7 +3441,7 @@ mod pointer_lattice_worklist_tests {
succs: smallvec![BlockId(3)],
};
// Block 2: identical synth write keeps both branches
// Block 2: identical synth write, keeps both branches
// contributing the same cell so AND-intersection of must
// preserves true on the join.
let block2 = SsaBlock {
@ -3459,7 +3459,7 @@ mod pointer_lattice_worklist_tests {
succs: smallvec![BlockId(3)],
};
// Block 3: read FieldProj uses obj from a phi between B1 and B2.
// Block 3: read, FieldProj uses obj from a phi between B1 and B2.
let block3 = SsaBlock {
id: BlockId(3),
phis: vec![SsaInst {
@ -3634,7 +3634,7 @@ mod pointer_lattice_worklist_tests {
);
}
/// A2.b: early-exit branch only B1 writes, B2 reaches B3 via
/// A2.b: early-exit branch, only B1 writes, B2 reaches B3 via
/// an empty body. After the join, the cell exists (B1 wrote
/// it), but `validated_must` is `false` (B2 didn't write, the
/// orphan-side merge clears `must` per the W4 lattice rule);
@ -3642,7 +3642,7 @@ mod pointer_lattice_worklist_tests {
///
/// To exercise the validation channels we synthesise the cell
/// directly at the appropriate exit state, then run the
/// worklist's join via two `SsaTaintState::join()` calls the
/// worklist's join via two `SsaTaintState::join()` calls, the
/// body's worklist itself doesn't seed `validated_must` on the
/// rhs of an Assign, so we model the "writer recorded must=true"
/// scenario at the lattice level rather than driving it through

View file

@ -698,7 +698,7 @@ fn cross_file_sink_finding_carries_primary_location() {
);
let finding = &findings[0];
// Note: `uses_summary == false` here because the source (env::var) is
// local only the *sink* was summary-resolved. That's the case the
// local, only the *sink* was summary-resolved. That's the case the
// `primary_location` / `uses_summary` independence comment on
// [`super::Finding::primary_location`] documents.
let loc = finding
@ -925,7 +925,7 @@ fn multi_file_sink_in_another_file() {
}
"#;
// File B: env::var → exec_cmd() sink is cross-file.
// File B: env::var → exec_cmd(), sink is cross-file.
let caller_src = br#"
use std::env;
fn main() {
@ -956,7 +956,7 @@ fn multi_file_sink_in_another_file() {
fn multi_file_passthrough_preserves_taint() {
use crate::summary::FuncSummary;
// identity() just returns its argument it propagates taint but has no
// identity() just returns its argument, it propagates taint but has no
// source/sanitizer/sink caps of its own.
let mut global = GlobalSummaries::new();
let key = FuncKey {
@ -1071,7 +1071,7 @@ fn multi_file_chain_source_sanitize_sink_across_files() {
fn sanitizer_strips_only_matching_bits() {
// Source(ALL) → shell_escape → sink_html (HTML sink).
// shell_escape strips SHELL_ESCAPE but not HTML_ESCAPE.
// sink_html is an HTML sink HTML_ESCAPE bit is still set → 1 finding.
// sink_html is an HTML sink, HTML_ESCAPE bit is still set → 1 finding.
let src = br#"
use std::env;
fn sink_html(s: &str) {}
@ -1142,7 +1142,7 @@ fn taint_through_variable_reassignment() {
#[test]
fn untainted_variable_at_sink_is_safe() {
// A string literal (not from a source) passed to Command no finding.
// A string literal (not from a source) passed to Command, no finding.
let src = br#"
use std::process::Command;
fn main() {
@ -1585,7 +1585,7 @@ fn cpp_source_to_sink() {
);
}
/// Phase 2 (cpp-precision): `c_str()` is a const accessor on `std::string`
/// `c_str()` is a const accessor on `std::string`
/// that returns a pointer to the same buffer. It must propagate taint from
/// the receiver to the result so the downstream sink fires.
#[test]
@ -1597,12 +1597,12 @@ fn cpp_c_str_propagates_taint() {
let findings = analyse_file(&file_cfg, summaries, None, Lang::Cpp, "test.cpp", &[], None);
assert!(
!findings.is_empty(),
"C++: tainted s.c_str() into system() must fire (Phase 2 c_str passthrough)",
"C++: tainted s.c_str() into system() must fire",
);
}
/// Phase 2: `std::move(x)` returns its argument unchanged in terms of
/// data flow the rvalue cast is a representation move, not a sanitiser.
/// `std::move(x)` returns its argument unchanged in terms of
/// data flow, the rvalue cast is a representation move, not a sanitiser.
/// Default propagation collects argument taint into the result.
#[test]
fn cpp_std_move_propagates_taint() {
@ -1617,7 +1617,7 @@ fn cpp_std_move_propagates_taint() {
);
}
/// Phase 2: `static_cast<T>(x)` is parsed as a call expression by
/// `static_cast<T>(x)` is parsed as a call expression by
/// tree-sitter-cpp; default propagation transports taint from the casted
/// argument to the result.
#[test]
@ -1633,7 +1633,7 @@ fn cpp_static_cast_propagates_taint() {
);
}
/// Phase 5 (cpp-precision): a fluent builder chain whose host
/// A fluent builder chain whose host
/// argument is tainted should fire on the terminal `.connect()`
/// SSRF sink. The chained `.host(...)` / `.port(...)` calls return
/// the receiver, and default Call-arg propagation puts the tainted
@ -1647,12 +1647,12 @@ fn cpp_builder_chain_user_host_fires() {
let findings = analyse_file(&file_cfg, summaries, None, Lang::Cpp, "test.cpp", &[], None);
assert!(
!findings.is_empty(),
"C++: tainted host through fluent builder chain must reach terminal connect() (Phase 5)",
"C++: tainted host through fluent builder chain must reach terminal connect()",
);
}
/// Phase 5: a fluent builder chain with a hardcoded host literal
/// must NOT fire on the terminal connect() sink the chain carries
/// A fluent builder chain with a hardcoded host literal
/// must NOT fire on the terminal connect() sink, the chain carries
/// no taint.
#[test]
fn cpp_builder_chain_const_host_silent() {
@ -1663,11 +1663,11 @@ fn cpp_builder_chain_const_host_silent() {
let findings = analyse_file(&file_cfg, summaries, None, Lang::Cpp, "test.cpp", &[], None);
assert!(
findings.is_empty(),
"C++: builder chain with literal host must NOT fire (Phase 5 negative)",
"C++: builder chain with literal host must NOT fire (Negative)",
);
}
/// Phase 4 (cpp-precision): inline member-function bodies inside a
/// Inline member-function bodies inside a
/// `class_specifier` must be extracted as separate functions and
/// intra-file calls must resolve to their bodies. Pre-Phase-4, the
/// `class_specifier` AST kind was unmapped in cpp KINDS, so the CFG
@ -1682,11 +1682,11 @@ fn cpp_inline_class_method_resolves() {
let findings = analyse_file(&file_cfg, summaries, None, Lang::Cpp, "test.cpp", &[], None);
assert!(
!findings.is_empty(),
"C++: tainted arg through inline class method must reach system() (Phase 4)",
"C++: tainted arg through inline class method must reach system()",
);
}
/// Phase 3 (cpp-precision): a tainted argument passed through an
/// A tainted argument passed through an
/// identity-style lambda (`auto echo = [](const char* s) { return s; }`)
/// must reach the downstream sink. This is handled by the same default
/// Call-arg propagation as `std::move`/`static_cast`; pinning the
@ -1705,7 +1705,7 @@ fn cpp_identity_lambda_propagates_taint() {
);
}
/// Phase 2: `std::vector<char>::data()` is a Load-style container op that
/// `std::vector<char>::data()` is a Load-style container op that
/// returns a pointer to the underlying buffer; `system(v.data())` should
/// fire when `v` is tainted.
#[test]
@ -1801,7 +1801,7 @@ fn ruby_source_to_sink() {
// ─────────────────────────────────────────────────────────────────────────────
//
// Cross-language resolution now requires explicit InteropEdge declarations.
// Without an edge, functions from different languages are never resolved
// Without an edge, functions from different languages are never resolved;
// this prevents false positives from name collisions across languages.
/// Extract cross-file summaries from any language's source bytes.
@ -1984,7 +1984,7 @@ fn cross_lang_rust_sanitizer_in_js_via_interop() {
None,
);
// eval uses Cap::all(), so a SHELL_ESCAPE sanitizer alone does NOT
// neutralise taint shell-escape is semantically wrong for code injection.
// neutralise taint, shell-escape is semantically wrong for code injection.
// The finding should still be reported.
assert!(
!findings.is_empty(),
@ -2481,7 +2481,7 @@ fn cross_lang_summary_preserves_lang_metadata() {
let global = merge_summaries(vec![py_summary, js_summary], None);
// They are now separate entries not merged
// They are now separate entries, not merged
let py_matches = global.lookup_same_lang(Lang::Python, "helper");
let js_matches = global.lookup_same_lang(Lang::JavaScript, "helper");
@ -2609,7 +2609,7 @@ fn ambiguous_resolution_returns_none() {
);
}
// Caller from c.rs calls helper() ambiguous (two matches, neither is caller's namespace)
// Caller from c.rs calls helper(), ambiguous (two matches, neither is caller's namespace)
let src = br#"
use std::process::Command;
fn main() {
@ -2855,7 +2855,7 @@ fn validate_and_early_return() {
let summaries = &file_cfg.summaries;
let findings = analyse_file(&file_cfg, summaries, None, Lang::Rust, "test.rs", &[], None);
// Validated findings are now suppressed validate() guard means the
// Validated findings are now suppressed, validate() guard means the
// sink is on the safe path, so no finding should be emitted.
assert_eq!(findings.len(), 0, "validated finding should be suppressed");
}
@ -2888,7 +2888,7 @@ fn validate_in_if_else_path_validated() {
let summaries = &file_cfg.summaries;
let findings = analyse_file(&file_cfg, summaries, None, Lang::Rust, "test.rs", &[], None);
// Validated findings are now suppressed sink is in the validated
// Validated findings are now suppressed, sink is in the validated
// branch, so no finding should be emitted.
assert_eq!(findings.len(), 0, "validated finding should be suppressed");
}
@ -2932,7 +2932,7 @@ fn contradictory_null_check_pruned() {
// Inner branch is infeasible: if x.is_none() then x cannot also be is_none().
// After early return on is_none(), the fall-through path has polarity=false
// for NullCheck. The inner `if x.is_none()` True branch has polarity=true
// for NullCheck. The inner `if x.is_none()` True branch has polarity=true:
// contradiction.
let src = br#"
use std::env; use std::process::Command;
@ -3045,7 +3045,7 @@ fn path_state_budget_graceful() {
let summaries = &file_cfg.summaries;
let findings = analyse_file(&file_cfg, summaries, None, Lang::Rust, "test.rs", &[], None);
// Should still detect the flow truncation shouldn't cause false negatives.
// Should still detect the flow, truncation shouldn't cause false negatives.
assert_eq!(
findings.len(),
1,
@ -3080,7 +3080,7 @@ fn unknown_predicate_not_pruned() {
let summaries = &file_cfg.summaries;
let findings = analyse_file(&file_cfg, summaries, None, Lang::Rust, "test.rs", &[], None);
// Comparison is not in the whitelist the path should NOT be pruned.
// Comparison is not in the whitelist, the path should NOT be pruned.
assert_eq!(
findings.len(),
1,
@ -3096,7 +3096,7 @@ fn duplicate_null_guard_prunes_unreachable_sink() {
// After `if y.is_none() { return; }`, the false arm proves
// `y.is_none() == false` on the only surviving path. A second
// `if y.is_none() { sink }` then adds `y.is_none() == true` on the
// body's True arm a per-symbol PredicateSummary contradiction
// body's True arm, a per-symbol PredicateSummary contradiction
// (known_true & known_false on bit NullCheck). The body is
// structurally unreachable; the sink must not fire.
//
@ -3573,7 +3573,7 @@ fn js_two_level_converges_no_mutation() {
#[test]
fn catch_param_to_sink_has_caught_exception_source_kind() {
// Catch param flows to a sink the finding source_kind must be
// Catch param flows to a sink, the finding source_kind must be
// CaughtException, not Unknown.
let src = b"
const { exec } = require('child_process');
@ -3743,7 +3743,7 @@ fn assert_ssa_integration(src: &[u8]) {
// High-level path (per-body analysis)
let high_level = analyse_file(&file_cfg, summaries, None, Lang::Rust, "test.rs", &[], None);
// Direct SSA path use the first function body (fn main), not top-level
// Direct SSA path, use the first function body (fn main), not top-level
let body = if file_cfg.bodies.len() > 1 {
&file_cfg.bodies[1]
} else {
@ -4654,7 +4654,7 @@ fn ssa_induction_var_no_taint() {
#[test]
fn ssa_loop_tainted_var_not_induction() {
// `x` is tainted and transformed in a loop NOT an induction variable
// `x` is tainted and transformed in a loop, NOT an induction variable
let src = br#"
use std::{env, process::Command};
fn main() {
@ -4766,7 +4766,7 @@ fn ssa_phi_path_sensitive_both_branches_validated() {
let summaries = &file_cfg.summaries;
let findings = analyse_file(&file_cfg, summaries, None, Lang::Rust, "test.rs", &[], None);
// Validated findings are now suppressed sink is in the validated
// Validated findings are now suppressed, sink is in the validated
// branch, so no finding should be emitted.
assert_eq!(findings.len(), 0, "validated finding should be suppressed");
}
@ -5116,7 +5116,7 @@ fn abstract_ssrf_prefix_linear_suppression() {
/// Two predecessor blocks produce string concat values with different safe
/// prefixes ("https://api.example.com/users/" and "https://api.example.com/admins/").
/// A phi merges them. The LCP of the prefixes is "https://api.example.com/" which
/// still has scheme://host/ so SSRF suppression should fire.
/// still has scheme://host/, so SSRF suppression should fire.
///
/// Before the phi replay fix, collect_block_events did NOT replay abstract phis,
/// leaving the phi result's abstract value as Top (stale). The SSRF suppression
@ -5255,7 +5255,7 @@ fn phi_validated_must_requires_all_paths() {
use tree_sitter::Language;
// Path A validates x, path B does NOT validate x.
// The phi for x after the merge must NOT get validated_must only
// The phi for x after the merge must NOT get validated_must, only
// validated_may (since at least one path validated). The sink after
// the merge must still fire because the must-analysis says "not
// definitely validated on all paths".
@ -5324,7 +5324,7 @@ fn inline_return_constant_with_internal_source_produces_no_finding() {
None,
);
// transform() returns a constant no taint should leak to caller
// transform() returns a constant, no taint should leak to caller
assert_eq!(
findings.len(),
0,
@ -5386,7 +5386,7 @@ fn inline_return_taint_internal_source_does_not_widen_caps() {
// Callee has an internal source (document.location) alongside a tainted
// param. The explicit return value is the param. Without the C-1 fix,
// extract_inline_return_taint would union ALL live tainted values' caps
// the internal source's derived-caps would override the param-caps
// and the internal source's derived-caps would override the param-caps
// (derived takes priority in the extraction logic). With the fix, only
// the return value's taint is collected, so param taint is returned
// correctly.
@ -5420,7 +5420,7 @@ fn inline_return_taint_internal_source_does_not_widen_caps() {
None,
);
// The callee returns cmd (tainted param) 1 finding expected.
// The callee returns cmd (tainted param), 1 finding expected.
// The internal document.location() should NOT widen the return taint.
assert_eq!(
findings.len(),
@ -5435,7 +5435,7 @@ fn inline_return_taint_internal_source_does_not_widen_caps() {
///
/// Two class methods share the leaf name `process` in the same file. If the
/// summary map were keyed by bare name (or raw file-path namespace), the
/// second lowering would overwrite the first both methods would end up
/// second lowering would overwrite the first, both methods would end up
/// pointing at whichever summary was extracted last.
///
/// With canonical `FuncKey` identity (`container` discriminates them) both
@ -5483,7 +5483,7 @@ class Worker {
summaries.keys().collect::<Vec<_>>(),
);
// Same invariant on the cached-bodies map inline analysis depends on
// Same invariant on the cached-bodies map, inline analysis depends on
// being able to fetch the correct body by full FuncKey.
let mut body_containers: Vec<String> = bodies
.iter()
@ -5593,6 +5593,7 @@ fn make_finding_for_link_test(
path_hash,
finding_id: String::new(),
alternative_finding_ids: smallvec::SmallVec::new(),
effective_sink_caps: crate::labels::Cap::empty(),
}
}
@ -5628,7 +5629,7 @@ fn finding_id_encodes_validation_and_path_hash() {
);
// Differing path_hash produces a different ID even with the same
// (body, source, sink, validated) the whole point of the path
// (body, source, sink, validated), the whole point of the path
// component in the dedup key.
let mut u2 = make_finding_for_link_test(1, 3, 7, 0xdead_beef_0000_0002, false);
u2.finding_id = super::make_finding_id(&u2);
@ -5639,7 +5640,7 @@ fn finding_id_encodes_validation_and_path_hash() {
}
/// `link_alternative_paths` must cross-link findings that share
/// `(body_id, sink, source)` so a validated flow and an unvalidated
/// `(body_id, sink, source)`, so a validated flow and an unvalidated
/// flow on the same source/sink pair each list the other's ID.
#[test]
fn link_alternative_paths_cross_references_same_body_sink_source() {
@ -5668,18 +5669,18 @@ fn link_alternative_paths_cross_references_same_body_sink_source() {
}
/// Findings that differ on `(body_id, sink, source)` are independent
/// vulnerabilities they must **not** end up cross-linked as
/// vulnerabilities, they must **not** end up cross-linked as
/// alternatives, otherwise the "alternative path" framing becomes
/// noise.
#[test]
fn link_alternative_paths_does_not_link_distinct_sink_source() {
let mut findings = vec![
make_finding_for_link_test(1, 3, 7, 0x1111, false),
// Different sink independent finding, not an alternative.
// Different sink, independent finding, not an alternative.
make_finding_for_link_test(1, 3, 8, 0x1111, false),
// Different source also independent.
// Different source, also independent.
make_finding_for_link_test(1, 4, 7, 0x1111, false),
// Different body also independent.
// Different body, also independent.
make_finding_for_link_test(2, 3, 7, 0x1111, false),
];
for f in &mut findings {
@ -5697,7 +5698,7 @@ fn link_alternative_paths_does_not_link_distinct_sink_source() {
/// When the same `(body, sink, source)` has three sibling findings
/// (e.g. validated, unvalidated-path-A, unvalidated-path-B), each
/// finding must list the other two the group is symmetric and
/// finding must list the other two, the group is symmetric and
/// complete rather than a chain.
#[test]
fn link_alternative_paths_three_way_group() {
@ -5726,14 +5727,14 @@ fn link_alternative_paths_three_way_group() {
}
// ─────────────────────────────────────────────────────────────────────────────
// Typed call-graph devirtualisation — Phase 2 (typed_call_receivers)
// Typed call-graph devirtualisation (typed_call_receivers)
// ─────────────────────────────────────────────────────────────────────────────
/// Phase 2: when a method call's receiver was constructed from a known
/// When a method call's receiver was constructed from a known
/// constructor (`File::open` → `FileHandle`), the SSA-extraction
/// pipeline must record `(call_ordinal, "FileHandle")` on the
/// caller's [`crate::summary::ssa_summary::SsaFuncSummary::typed_call_receivers`]
/// so Phase 3 can devirtualise the cross-file edge.
/// so build_call_graph can devirtualise the cross-file edge.
///
/// Uses Java because `FileInputStream` / `FileOutputStream` are part
/// of the [`crate::ssa::type_facts::constructor_type`] table for Java
@ -5779,14 +5780,14 @@ class Reader {
);
}
/// Phase 2 negative control: free-function calls (no receiver) must
/// Negative control: free-function calls (no receiver) must
/// never appear in `typed_call_receivers`. Even when the callee is a
/// known type-producing constructor, it sits in the body as a Call
/// with `receiver = None` and is not a candidate for devirtualisation.
#[test]
fn typed_call_receivers_skips_free_function_calls() {
// `new FileInputStream(...)` is a constructor invocation with no
// receiver exactly the shape we want to ignore.
// receiver, exactly the shape we want to ignore.
let src = br#"
class Maker {
void make() {
@ -5808,10 +5809,10 @@ class Maker {
// make() has zero parameters and no fresh-allocation return, so the
// generic insertion gate skips it. The phase-2 patch only force-
// inserts when `typed_call_receivers` is non-empty which it
// inserts when `typed_call_receivers` is non-empty, which it
// isn't here, since `new FileInputStream(...)` is a free-function-
// shaped constructor call (no SSA receiver). So either the
// summary is absent, or — if some other side effect inserted it —
// summary is absent, or (if some other side effect inserted it)
// its `typed_call_receivers` is empty. Both forms prove no
// spurious typed entry was recorded.
let typed = summaries
@ -5829,7 +5830,7 @@ class Maker {
/// Regression: nested arrow functions inside `return new Promise((res,rej)
/// => { ... })` must be lifted as separate bodies. Before the Kind::Return
/// arm in cfg/mod.rs called `collect_nested_function_nodes`, only the
/// outer function (`downloadFromUri`) was extracted the executor and
/// outer function (`downloadFromUri`) was extracted, the executor and
/// its inner callbacks were silently swallowed, hiding the inner gated
/// http.get sink from classification. Motivated by CVE-2025-64430.
#[test]
@ -5972,7 +5973,7 @@ const handler = (req) => {
/// The augment pass populates `downloadFromUri.summary.param_to_sink:
/// [(0, SSRF)]` (single-hop closure-capture lift). For the handler's
/// `helper(req.body)` call to fire, `helper.summary.param_to_sink` must
/// also contain `[(0, SSRF)]` but that requires `helper`'s probe to
/// also contain `[(0, SSRF)]`, but that requires `helper`'s probe to
/// see `downloadFromUri`'s augmented summary at resolution time.
///
/// Because the probe currently runs with `ssa_summaries=None`,
@ -6065,11 +6066,198 @@ const handler = (req) => {
/// `middle.summary.param_to_sink`, then handler's call site picks it up.
///
/// Today the second-pass runs only once (no fixed-point), so depth-3+
/// is expected to NOT fire guards against accidental fixed-point
/// is expected to NOT fire, guards against accidental fixed-point
/// regression that would mask an over-eager rewrite. Marked
/// `#[ignore]` so it documents the depth limit without breaking CI.
/// Motivated by CVE-2025-64430 corner case; remove the `#[ignore]` and
/// any guarding `assert!` polarity if a fixed-point is added later.
/// Indirect-validator branch narrowing, direct-flow shape.
///
/// The JavaScript fixture reads `req.query.url` into `target`, awaits
/// `validateUrlSsrf(target)` into `ssrfError`, throws when `ssrfError`
/// is truthy, and only then calls `axios.get(target)`. The if-condition
/// is therefore a bare result variable whose defining call is a
/// `validate*`-named callee, and the sink sits on the fall-through
/// (success) path.
///
/// The check is end-to-end: the test passes when `analyse_file`
/// reports no findings for the fixture. It does not inspect the
/// symbol-keyed validation state directly.
///
/// NOTE(review): the mechanism this is meant to pin is
/// `apply_input_validator_branch_narrowing`, with callees recognised by
/// `classify_input_validator_callee`; neither is visible from this
/// test, so confirm against the engine. The helper-indirection variant
/// of this shape is exercised by separate tests.
///
/// Motivated by Novu CVE GHSA-4x48-cgf9-q33f
/// (`const ssrfError = await validateUrlSsrf(child.webhookUrl); if (ssrfError) throw …;`).
#[test]
fn indirect_validator_narrowing_marks_arg_validated() {
    // Fixture: validator call, error-guard throw, then the sink.
    let js_source = br#"
async function handler(req) {
const target = req.query.url;
const ssrfError = await validateUrlSsrf(target);
if (ssrfError) {
throw new Error('blocked');
}
await axios.get(target);
}
"#;
    let parsed = parse_lang(
        js_source,
        "javascript",
        tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE),
    );
    let reported = analyse_file(
        &parsed,
        &parsed.summaries,
        None,
        Lang::JavaScript,
        "test.js",
        &[],
        None,
    );

    // Direct flow: the validator guard must leave nothing to report
    // at `axios.get`.
    let count = reported.len();
    assert!(
        count == 0,
        "validator narrowing should suppress direct-flow SSRF; got {} finding(s)",
        count
    );
}
/// Regression: a helper that validates its argument before reaching a
/// sink must not cause a finding at its call site.
///
/// Fixture: `getWebhookResponse(child)` awaits
/// `validateUrlSsrf(child.webhookUrl)`, throws when the result is
/// truthy, and otherwise calls `axios.post(child.webhookUrl, {})`.
/// `handler(req)` passes `req.body.filter` to the helper. Both
/// functions live in the same file.
///
/// The test passes when `analyse_file` reports no findings.
///
/// NOTE(review): per the original note, the behaviour under test is
/// that `extract_ssa_func_summary` skips `all_validated` probe events
/// when populating `param_to_sink` / `param_to_sink_param`, so the
/// caller (which cannot see the validator behind the summary) does not
/// refire on `helper(taintedArg)`. That code is not visible from here;
/// confirm against the summary extractor.
///
/// Closes the helper-summary half of Novu CVE GHSA-4x48-cgf9-q33f.
#[test]
fn helper_with_validator_does_not_propagate_to_caller_via_summary() {
    // Fixture: validated helper plus a caller feeding it request data.
    let js_source = br#"
async function getWebhookResponse(child) {
const ssrfError = await validateUrlSsrf(child.webhookUrl);
if (ssrfError) {
throw new Error('blocked');
}
return await axios.post(child.webhookUrl, {});
}
async function handler(req) {
const child = req.body.filter;
const r = await getWebhookResponse(child);
return r;
}
"#;
    let parsed = parse_lang(
        js_source,
        "javascript",
        tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE),
    );
    let reported = analyse_file(
        &parsed,
        &parsed.summaries,
        None,
        Lang::JavaScript,
        "test.js",
        &[],
        None,
    );

    // Neither the helper body nor the call site in `handler` may fire.
    let count = reported.len();
    assert!(
        count == 0,
        "helper-with-validator should not propagate sink via summary; got {} finding(s)",
        count
    );
}
/// Companion to the helper-with-validator case: the identical
/// helper/caller shape, but with NO validator inside the helper, must
/// keep producing a finding. This guards against the `all_validated`
/// skip accidentally silencing helpers that are genuinely unsafe.
#[test]
fn helper_without_validator_still_propagates_to_caller_via_summary() {
    // JS fixture: `getWebhookResponse` posts straight to `child.webhookUrl`
    // with no validation; `handler` feeds it request data.
    let js_source = br#"
async function getWebhookResponse(child) {
return await axios.post(child.webhookUrl, {});
}
async function handler(req) {
const child = req.body.filter;
const r = await getWebhookResponse(child);
return r;
}
"#;
    let parsed = parse_lang(
        js_source,
        "javascript",
        tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE),
    );
    let findings = analyse_file(
        &parsed,
        &parsed.summaries,
        None,
        Lang::JavaScript,
        "test.js",
        &[],
        None,
    );
    // At least one finding must survive for the unguarded helper.
    assert!(
        !findings.is_empty(),
        "helper-without-validator must still flag the cross-fn SSRF path",
    );
}
/// Regression: pins the polarity bucket that
/// `classify_input_validator_callee` assigns to a callee name.
///
/// * `validate*` / `verify*` names map to
///   `InputValidatorPolarity::ErrorReturning`. With a bare
///   `if (err) throw`, the success branch is the false branch.
/// * `is_valid*` / `is_safe*` names (snake_case or camelCase) map to
///   `InputValidatorPolarity::BooleanTrueIsValid`. With a bare
///   `if (!ok) throw`, the success branch is the true branch (via
///   `condition_negated`).
#[test]
fn classify_input_validator_callee_polarity_buckets() {
    use crate::ssa::type_facts::{InputValidatorPolarity, classify_input_validator_callee};

    // Expected classifications, named once so each case below fits on one
    // line. `assert_eq!` only borrows its operands, so they can be reused.
    let error_returning = Some(InputValidatorPolarity::ErrorReturning);
    let boolean_true_is_valid = Some(InputValidatorPolarity::BooleanTrueIsValid);

    // ErrorReturning bucket.
    assert_eq!(classify_input_validator_callee("validateUrlSsrf"), error_returning);
    assert_eq!(classify_input_validator_callee("verifyToken"), error_returning);
    assert_eq!(classify_input_validator_callee("validate_url"), error_returning);

    // BooleanTrueIsValid bucket.
    assert_eq!(classify_input_validator_callee("isValidUrl"), boolean_true_is_valid);
    assert_eq!(classify_input_validator_callee("is_valid_email"), boolean_true_is_valid);
    assert_eq!(classify_input_validator_callee("isSafe"), boolean_true_is_valid);

    // Negative cases: auth-flavored names that merely look like validators
    // (`checkPermissions`, `is_authorized`) are intentionally not matched
    // here, since they have separate semantics in the auth pipeline. An
    // unrelated name must not match either.
    assert_eq!(classify_input_validator_callee("checkPermissions"), None);
    assert_eq!(classify_input_validator_callee("is_authorized"), None);
    assert_eq!(classify_input_validator_callee("randomThing"), None);

    // Path-prefix peeling: `obj.validateXxx` must classify exactly like the
    // bare callee name.
    assert_eq!(
        classify_input_validator_callee("validator.validateUrlSsrf"),
        error_returning
    );
}
#[test]
#[ignore]
fn cve_2025_64430_three_hop_transitive_documents_depth_limit() {