Python fp and docs updates (#58)

* refactor: Update comments for clarity and add expectations.json files for performance metrics

* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks

* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks

* refactor: Simplify code formatting for better readability in multiple files

* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration

* feat: Update Java and Python patterns to include new security rules

* refactor: Improve comment clarity and consistency across multiple Rust files

* refactor: Simplify code formatting for improved readability in integration tests and module files

* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
Eli Peter 2026-04-29 19:53:34 -04:00 committed by GitHub
parent 4db0805de6
commit a438886217
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
291 changed files with 9485 additions and 3851 deletions

View file

@ -1,9 +1,9 @@
//! Taint event emission and conversion to [`crate::taint::Finding`].
//!
//! Extracted from the monolithic `ssa_transfer.rs`. Contains:
//! * [`SsaTaintEvent`] the raw event struct produced by the block-level
//! * [`SsaTaintEvent`], the raw event struct produced by the block-level
//! worklist each time a tainted value reaches a sink.
//! * [`ssa_events_to_findings`] event → `Finding` conversion with the
//! * [`ssa_events_to_findings`], event → `Finding` conversion with the
//! `primary_location` invariant and dedup.
//! * Flow-path reconstruction helpers ([`reconstruct_flow_path`] and
//! operand pickers).
@ -38,14 +38,14 @@ pub struct SsaTaintEvent {
/// `sink_caps`. When multiple [`SinkSite`]s for the same `(param_idx,
/// cap mask)` match, the emission site produces one event per
/// [`SinkSite`] so each downstream [`crate::taint::Finding`] carries a
/// single primary attribution the multi-primary case collapses to
/// single primary attribution; the multi-primary case collapses to
/// multiple single-primary events.
///
/// `None` for:
/// * intra-procedural sinks (`uses_summary == false`), where the
/// caller's sink span already names the dangerous instruction;
/// * summary-resolved sinks whose callee summary carried only cap-only
/// [`SinkSite`]s (no source coordinates e.g. pass-2 transient
/// [`SinkSite`]s (no source coordinates, e.g. pass-2 transient
/// summaries or local `LocalFuncSummary`-only callees).
pub primary_sink_site: Option<SinkSite>,
}
@ -79,7 +79,7 @@ pub(super) fn block_distance(ssa: &SsaBody, source_node: NodeIndex, sink_node: N
}
}
}
0 // unreachable or not connected conservative default
0 // unreachable or not connected, conservative default
}
// ── Flow Path Reconstruction ─────────────────────────────────────────────
@ -204,7 +204,7 @@ pub(super) fn reconstruct_flow_path(
SsaOp::FieldProj { receiver, .. } => {
// Treat field projection as a one-step assignment for
// flow-step reconstruction: taint reaching `obj.f` came
// from `obj`. Phase 4 will refine the witness rendering
// from `obj`. The analysis may refine the witness rendering
// to include the field name in the step.
steps.push(FlowStepRaw {
cfg_node: inst.cfg_node,
@ -270,7 +270,7 @@ fn pick_tainted_operand_call(
///
/// Note: this invariant is intentionally independent of `uses_summary`.
/// The taint-chain flag tracks summary-propagated *taint*, not summary-
/// resolved *sinks* a local source can reach a cross-file sink, so
/// resolved *sinks*: a local source can reach a cross-file sink, so
/// `primary_location.is_some()` does not imply `uses_summary == true`.
pub fn ssa_events_to_findings(
events: &[SsaTaintEvent],
@ -329,7 +329,7 @@ pub fn ssa_events_to_findings(
// Data-integrity invariant: a populated primary_location must at least
// carry resolved line coordinates. `file_rel` may legitimately be
// empty when the scan root is the caller file itself (single-file
// empty: when the scan root is the caller file itself (single-file
// scans), every namespace normalizes to `""` and the callee's site
// inherits that empty path; consumers resolve it against the file
// under analysis. Line==0 is the only filter-worthy invariant.
@ -340,7 +340,7 @@ pub fn ssa_events_to_findings(
// Dedup key includes primary location so multi-site events that
// share a single (source, sink) pair still produce distinct findings
// one per resolved callee-internal site.
// (one per resolved callee-internal site).
let loc_key = primary_location
.as_ref()
.map(|l| (l.file_rel.clone(), l.line, l.col));
@ -374,6 +374,11 @@ pub fn ssa_events_to_findings(
path_hash,
finding_id: String::new(),
alternative_finding_ids: smallvec::SmallVec::new(),
// Per-event mask from the multi-gate dispatch, picks
// exactly the cap that fired (e.g. `Cap::DATA_EXFIL`
// for a `fetch` body-flow finding versus `Cap::SSRF`
// for a URL-flow finding on the same call).
effective_sink_caps: event.sink_caps & *caps,
});
}
}

View file

@ -1,34 +1,10 @@
//! Context-sensitive inline analysis cache, body, and attribution types.
//! Context-sensitive inline analysis: cache, body, and attribution types.
//!
//! Extracted from the monolithic `ssa_transfer.rs`. Contains:
//! * [`ArgTaintSig`] — compact per-arg cap signature used as a cache key.
//! * [`InlineResult`] / [`CachedInlineShape`] / [`ReturnShape`] — the
//! callsite-adapted and callsite-agnostic inline-analysis result types.
//! * [`InlineCache`] — the shared cache map keyed by
//! `(FuncKey, ArgTaintSig)`.
//! * [`CrossFileNodeMeta`] / [`CalleeSsaBody`] — the serde-able bodies
//! persisted to SQLite for cross-file context-sensitive analysis.
//! * [`populate_node_meta`] / [`rebuild_body_graph`] — bookkeeping for
//! cross-file body proxy CFGs.
//!
//! The implementation functions (`inline_analyse_callee`,
//! `apply_cached_shape`, `extract_inline_return_taint`) remain in the
//! parent `mod.rs` because they depend tightly on the block worklist, the
//! `run_ssa_taint_full` entry point, and the callee-resolution pipeline.
//!
//! # Cache key scope and origin attribution
//!
//! The inline-analysis cache below ([`InlineCache`]) is keyed by
//! `(FuncKey, ArgTaintSig)`, where [`ArgTaintSig`] encodes **per-arg
//! capability bits only** — not the identity of the source
//! [`crate::taint::domain::TaintOrigin`]s that produced those caps. The
//! stored value ([`CachedInlineShape`]) captures **only the structural**
//! shape of the callee's return taint: return caps, callee-internal
//! origins (from `Source` ops inside the callee body), and per-parameter
//! provenance flags that record which formal parameters contributed to
//! the return. Caller-specific origin identity is *not* stored — it is
//! re-attributed at cache-apply time from the current call site's
//! argument taint.
//! The cache ([`InlineCache`]) is keyed by `(FuncKey, ArgTaintSig)`,
//! where [`ArgTaintSig`] is per-arg cap bits only (not origin identity).
//! Stored values ([`CachedInlineShape`]) capture the structural shape of
//! the callee's return taint; caller-specific origins are re-attributed
//! at apply time.
use crate::labels::Cap;
use crate::ssa::ir::{SsaBody, Terminator};
@ -42,61 +18,30 @@ use std::collections::HashMap;
/// Maximum SSA blocks in a callee body before skipping inline analysis.
pub(super) const MAX_INLINE_BLOCKS: usize = 500;
/// Compact cache key: per-arg-position cap bits (sorted, non-empty only).
///
/// Two calls with identical `ArgTaintSig` produce identical inline results
/// for soundness purposes (return caps, callee-internal sink activations).
/// Origin identity is **not** part of the key — see the module-level note
/// above on origin-attribution non-determinism.
/// Compact cache key: per-arg-position cap bits (sorted, non-empty
/// only). Origin identity is not part of the key.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub(crate) struct ArgTaintSig(pub(super) SmallVec<[(usize, u16); 4]>);
/// Call-site-adapted result of inline-analyzing a callee.
///
/// Constructed fresh per call site by `apply_cached_shape` from a stored
/// [`CachedInlineShape`]; carries origins that point to the *current*
/// caller's source chain, not to whichever caller first populated the
/// cache entry.
/// Call-site-adapted result of inline-analyzing a callee. Built fresh
/// per call site so origins point to the current caller's chain.
#[derive(Clone, Debug)]
pub(crate) struct InlineResult {
/// Taint on the return value after inline analysis.
pub(super) return_taint: Option<VarTaint>,
/// PathFact on the return value after inline analysis.
///
/// Non-top when the callee's body provably narrows the
/// [`crate::abstract_interp::PathFact`] of the value it returns (for
/// example, a `sanitize_path(s) -> Option<String>` helper that
/// early-returns on `s.contains("..")` / `s.starts_with('/')`). At
/// apply time the caller sets its call-result SSA value's PathFact to
/// this narrowed fact, so downstream FILE_IO sinks see the sanitised
/// axis regardless of whether a named label-rule exists for the
/// helper. Top when the callee produces no narrowing — matches
/// pre-PathFact behaviour exactly.
/// PathFact on the return value. Non-top when the callee body
/// provably narrows it (e.g. a `sanitize_path` early-returning on
/// `s.contains("..")`).
pub(super) return_path_fact: crate::abstract_interp::PathFact,
/// Per-return-path decomposition of [`Self::return_path_fact`].
///
/// Non-empty when the callee has ≥2 distinct return blocks whose
/// predicate gates differ. Match-arm-sensitive callers pick the
/// entry whose `variant_inner_fact` matches the arm binding's
/// variant; path-resolvable callers may refuse infeasible entries.
/// Callers unable to distinguish paths still consult
/// [`Self::return_path_fact`] (the join of all entries) and see
/// pre-decomposition behaviour.
/// Per-return-path decomposition of `return_path_fact`. Non-empty
/// when the callee has ≥2 return blocks with different predicate
/// gates.
#[allow(dead_code)]
pub(super) return_path_facts: SmallVec<[PathFactReturnEntry; 2]>,
}
/// Structural (callsite-agnostic) summary of an inline-analyzed callee.
///
/// Stored in [`InlineCache`] in place of a fully-attributed `InlineResult`.
/// Origin-identity information that depends on the caller's argument chain
/// is *not* kept here; instead, [`ReturnShape::param_provenance`]
/// records which callee parameter positions contributed seed taint to the
/// return, and the actual caller origins are re-unioned in at apply time.
///
/// `None` means "this callee produced no return taint for the given
/// argument shape". A cached `None` is still a meaningful result — it
/// short-circuits re-analysis on subsequent calls with matching caps.
/// Structural (callsite-agnostic) summary of an inline-analyzed
/// callee. `None` means "no return taint for this arg shape"; that is
/// still meaningful: it short-circuits subsequent calls with matching caps.
#[derive(Clone, Debug)]
pub(crate) struct CachedInlineShape(pub(super) Option<ReturnShape>);
@ -107,7 +52,7 @@ pub(crate) struct CachedInlineShape(pub(super) Option<ReturnShape>);
/// origins. See the module-level note above on origin attribution.
#[derive(Clone, Debug)]
pub(crate) struct ReturnShape {
/// Return value caps (cap bits only structural).
/// Return value caps (cap bits only, structural).
pub(super) caps: Cap,
/// Origins produced **inside the callee body** (e.g. `Source` op fired
/// in the callee). `node` is set to a placeholder; at apply time the
@ -115,31 +60,19 @@ pub(crate) struct ReturnShape {
/// stable (from the callee CFG) and preserved as-is.
pub(super) internal_origins: SmallVec<[TaintOrigin; 2]>,
/// Bit i set = callee's `Param(i)` seed taint reached the return value.
/// At apply time, caller's argument origins at matching positions are
/// unioned into the applied `VarTaint`. Params beyond index 63 are
/// dropped (matching `SmallBitSet` semantics); the capped case is rare
/// and still yields cap-correct results.
/// At apply time, caller arg origins at matching positions are
/// unioned into the applied `VarTaint`. Params beyond 63 are
/// dropped (matches `SmallBitSet`); rare and still cap-correct.
pub(super) param_provenance: u64,
/// Whether the receiver (`SelfParam`) seed taint flowed to the return.
/// Whether the receiver (`SelfParam`) seed taint flowed to return.
pub(super) receiver_provenance: bool,
/// Whether the applied `VarTaint` should be tagged `uses_summary`.
pub(super) uses_summary: bool,
/// PathFact of the return value observed from the callee's exit
/// abstract state. Cache-safe because the callee is inline-analysed
/// with [`crate::abstract_interp::PathFact::top`] Param seeds — the
/// resulting fact describes the callee's intrinsic narrowing (e.g.
/// the `Some` arm of a `sanitize(..) -> Option<String>` body
/// proves `dotdot = No`) and does not depend on caller-side
/// narrowing of the argument's PathFact. Top when the callee does
/// not narrow.
/// PathFact of the return value, observed from the callee exit
/// state under Top-seeded Params. Describes the callee's intrinsic
/// narrowing.
pub(super) return_path_fact: crate::abstract_interp::PathFact,
/// Per-return-path [`PathFact`] decomposition of the return value.
///
/// Populated alongside [`Self::return_path_fact`] when the callee
/// has ≥2 distinct return blocks with different predicate gates.
/// Cache-safe for the same reason as `return_path_fact`: entries
/// describe callee-intrinsic narrowing under Top-seeded Params.
/// Empty when no per-path distinction was observed.
/// Per-return-path decomposition of the return value. Populated
/// when the callee has ≥2 return blocks with different predicates.
pub(super) return_path_facts: SmallVec<[PathFactReturnEntry; 2]>,
}
@ -151,50 +84,21 @@ impl CachedInlineShape {
}
}
/// Cache for context-sensitive inline analysis results.
///
/// Keyed by the callee's canonical [`FuncKey`] rather than a bare function
/// name so that same-name definitions (e.g. two `process/1` methods on
/// different classes in the same file) never share or overwrite each
/// other's cache entries. Values are stored as [`CachedInlineShape`]; see
/// the module-level note above for why origins are stripped from the
/// cache value and re-attributed at apply time.
/// Cache for context-sensitive inline analysis results, keyed by
/// canonical [`FuncKey`] so same-name definitions in different scopes
/// never collide.
pub(crate) type InlineCache = HashMap<(FuncKey, ArgTaintSig), CachedInlineShape>;
/// Drop every entry from an inline cache, marking the start of a new
/// convergence epoch.
///
/// Cross-file SCC fixed-point iteration runs pass 2 repeatedly until the
/// merged summaries stop changing. Between iterations the callee-summary
/// inputs to inline analysis may have changed, so results cached under a
/// stale snapshot must not leak into the next iteration — otherwise the
/// engine could converge to a non-fixed-point (reporting a taint result
/// that would not reproduce on a fresh run of the same file order).
///
/// The per-file inline cache is already reconstructed fresh at the top of
/// each [`crate::taint::analyse_file`] call, so in the current code this
/// call is effectively a no-op plumbing hook. Keeping the method (instead
/// of relying on ambient re-construction) makes the lifecycle explicit for
/// any future refactor that moves the cache up into the SCC orchestrator.
#[allow(dead_code)] // semantic hook; used by tests and future shared-cache refactor
/// Drop every entry from `cache`, marking the start of a new
/// convergence epoch.
///
/// Intended to run between SCC fixed-point iterations so that results
/// cached under a stale callee-summary snapshot do not leak into the
/// next iteration.
///
/// NOTE(review): the per-file inline cache is reportedly rebuilt fresh on
/// every `analyse_file` call, which would make this an explicit lifecycle
/// hook rather than a behavioural necessity; confirm against the caller.
#[allow(dead_code)] // semantic hook; used by tests and future shared-cache refactor
pub(crate) fn inline_cache_clear_epoch(cache: &mut InlineCache) {
// `InlineCache` is a `HashMap`; `clear` removes every entry in place.
cache.clear();
}
/// Set-equal fingerprint of an inline cache, used by the SCC orchestrator
/// to detect when cross-file inline analysis has reached a fixed point
/// alongside summary convergence.
///
/// Returns a `HashMap` mapping each `(FuncKey, ArgTaintSig)` cache key to
/// the return-value capability bits of its inline result. `HashMap`
/// equality is set-equal (unordered), so two caches with the same entries
/// compare equal regardless of insertion order.
///
/// Origins are intentionally omitted — they are non-deterministic across
/// callers with identical caps (see the module-level note on origin
/// attribution) and would cause the fingerprint to oscillate without
/// reflecting a real precision change.
#[allow(dead_code)] // observability hook; used by tests and future shared-cache refactor
/// Set-equal fingerprint of the inline cache, used by the SCC
/// orchestrator to detect convergence.
#[allow(dead_code)]
pub(crate) fn inline_cache_fingerprint(
cache: &InlineCache,
) -> HashMap<(FuncKey, ArgTaintSig), u16> {
@ -206,24 +110,11 @@ pub(crate) fn inline_cache_fingerprint(
/// CFG node metadata embedded in cross-file callee bodies.
///
/// ## Why a full [`crate::cfg::NodeInfo`] lives here
///
/// An earlier variant carried only the two fields the symex executor reads
/// (`bin_op`, `labels`). That was sufficient for symex but not for the
/// taint engine, which reads ~20 fields off `cfg[inst.cfg_node]` across
/// `transfer_inst`, `collect_block_events`, `compute_succ_states`, and
/// helpers (callee name, `arg_uses`, `arg_callees`, `call_ordinal`,
/// `outer_callee`, `kwargs`, `arg_string_literals`, `ast.span`,
/// `ast.enclosing_func`, `condition_*`, `all_args_literal`, `catch_param`,
/// `parameterized_query`, `in_defer`, `cast_target_type`, `string_prefix`,
/// `taint.uses`, `taint.defines`, `taint.extra_defines`,
/// `taint.const_text`, …). Rather than shuttling each of those through a
/// `CfgView` accessor at every callsite, we store a full serde-able
/// [`crate::cfg::NodeInfo`] snapshot here so the indexed-scan path can
/// rehydrate an equivalent `Cfg` on load (see [`rebuild_body_graph`]).
/// Both scan paths then feed the same `&Cfg` into the taint engine, and
/// cross-file inline fires regardless of whether the body came from pass
/// 1 or from SQLite.
/// Stores a full serde-able [`crate::cfg::NodeInfo`] snapshot rather
/// than projecting individual fields, so the indexed-scan path can
/// rehydrate an equivalent `Cfg` (see [`rebuild_body_graph`]) and feed
/// the same `&Cfg` into the taint engine regardless of whether the
/// body came from pass 1 or SQLite.
#[derive(Clone, Debug, Default, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct CrossFileNodeMeta {
/// Full `NodeInfo` snapshot for this body-local NodeIndex.
@ -268,7 +159,7 @@ pub fn populate_node_meta(body: &mut CalleeSsaBody, cfg: &crate::cfg::Cfg) -> bo
// `compute_succ_states` via `cfg[*cond]`, so without it the synthesized
// cross-file proxy CFG (`rebuild_body_graph`) ends up too small whenever
// the callee body has any conditional branch whose `cond` index sits
// past the maximum `inst.cfg_node` index inline analysis then panics
// past the maximum `inst.cfg_node` index; inline analysis then panics
// with an out-of-bounds index.
let mut referenced: Vec<NodeIndex> = Vec::new();
for block in &body.ssa.blocks {
@ -320,7 +211,7 @@ pub fn rebuild_body_graph(body: &mut CalleeSsaBody) -> bool {
// index. We fill any unreferenced intermediate indices with
// `NodeInfo::default()`.
//
// Walks both instruction `cfg_node`s and `Terminator::Branch.cond`
// Walks both instruction `cfg_node`s and `Terminator::Branch.cond`;
// the latter is read by `compute_succ_states` via `cfg[*cond]`, so
// missing it produces an OOB panic when a conditional branch's cond
// node has a higher index than any `inst.cfg_node` in the body.
@ -339,7 +230,7 @@ pub fn rebuild_body_graph(body: &mut CalleeSsaBody) -> bool {
}
}
}
// Also consider node_meta keys they should be a subset of the
// Also consider node_meta keys; they should be a subset of the
// SSA-referenced indices, but be defensive.
for &k in body.node_meta.keys() {
if k > max_idx {

File diff suppressed because it is too large Load diff

View file

@ -2,7 +2,7 @@
//! the original monolithic `ssa_transfer.rs`.
//!
//! Contains:
//! * [`SsaTaintState`] the per-block lattice value with `values`,
//! * [`SsaTaintState`], the per-block lattice value with `values`,
//! `validated_must`/`validated_may`, `predicates`, `heap`, `path_env`,
//! `abstract_state`.
//! * [`BindingKey`] / [`seed_lookup`] for cross-body taint seeding.
@ -25,7 +25,7 @@ use std::collections::HashMap;
// NOTE: The per-SSA-value origin cap used to be a hardcoded
// `MAX_ORIGINS: usize = 4`. It is now governed by the stable
// `analysis.engine.max_origins` option (default `32`) see
// `analysis.engine.max_origins` option (default `32`); see
// `crate::utils::analysis_options` and [`effective_max_origins`]. The
// test-only override below still short-circuits the config read so
// `engine_notes_tests.rs` can force a tiny cap to trigger truncation
@ -42,7 +42,7 @@ static WORKLIST_CAP_OVERRIDE: std::sync::atomic::AtomicUsize =
std::sync::atomic::AtomicUsize::new(0);
/// Records the MAX iteration count observed across every
/// `run_ssa_taint_full` call since the most recent reset. Cheaper and
/// more useful for regression tests than the last-call value a cap
/// more useful for regression tests than the last-call value: a cap
/// hit anywhere in the scan is remembered.
pub(super) static MAX_WORKLIST_ITERATIONS: std::sync::atomic::AtomicUsize =
std::sync::atomic::AtomicUsize::new(0);
@ -90,7 +90,7 @@ pub fn reset_worklist_observability() {
/// force `OriginsTruncated` emission on small fixtures.
static MAX_ORIGINS_OVERRIDE: std::sync::atomic::AtomicUsize =
std::sync::atomic::AtomicUsize::new(0);
/// Total number of origins dropped since the most recent reset captured
/// Total number of origins dropped since the most recent reset, captured
/// from `merge_origins` and the post-hoc saturation scan. Used by tests
/// to detect truncation events that don't propagate to a finding (e.g.
/// when the cap is so tight no taint flow survives to emit a sink event).
@ -136,7 +136,7 @@ pub fn reset_origins_observability() {
thread_local! {
/// Per-body engine-note collector. Cleared at the start of each
/// `analyse_body_with_seed` invocation and drained after
/// `run_ssa_taint_full` returns notes are then attached to every
/// `run_ssa_taint_full` returns; notes are then attached to every
/// finding emitted from that body. Living as a thread-local avoids
/// threading a `&RefCell` through the nearly-10-argument transfer
/// struct; inline analysis recursion is intentionally allowed to
@ -148,7 +148,7 @@ thread_local! {
/// was suppressed by an SSA-engine path-safety proof (PathFact
/// `dotdot=No && absolute=No`). Populated by `is_path_safe_for_sink`
/// and consumed by the state-analysis pass to suppress
/// `state-unauthed-access` on the same sink when the taint engine
/// `state-unauthed-access` on the same sink: when the taint engine
/// has already proved the user-controlled input cannot escape into a
/// privileged location, the auth concern on that sink is reduced.
/// Reset at start of `analyse_file`, drained before state analysis.
@ -156,7 +156,7 @@ thread_local! {
RefCell::new(std::collections::HashSet::new());
/// File-level set of CFG sink spans where the SSA engine emitted an
/// `all_validated` event every tainted input to the sink passed
/// `all_validated` event: every tainted input to the sink passed
/// through a recognised validation/sanitisation predicate before
/// reaching it. Distinct from `PATH_SAFE_SUPPRESSED_SPANS`, which
/// is FILE_IO-scoped and feeds state analysis: this set is
@ -167,7 +167,7 @@ thread_local! {
///
/// Without this signal the suppression gate has to fall back to
/// "function emitted at least one taint-unsanitised-flow finding"
/// or "function contains a labelled Sanitizer node" both of
/// or "function contains a labelled Sanitizer node", both of
/// which miss validated/dominated/early-return safety where the
/// engine cleared the flow without firing or hitting an explicit
/// sanitiser.
@ -227,7 +227,7 @@ pub fn take_path_safe_suppressed_spans() -> std::collections::HashSet<(usize, us
/// Record a sink CFG-node span where the SSA engine proved every
/// tainted input was validated (`SsaTaintEvent::all_validated`).
/// Cap-agnostic fires for any sink the engine evaluated and cleared.
/// Cap-agnostic: fires for any sink the engine evaluated and cleared.
/// Consumed by `TaintSuppressionCtx::build` as positive evidence that
/// taint analysis reached this line and proved safety, so AST-pattern
/// findings on the same line can be suppressed without misclassifying
@ -263,7 +263,7 @@ pub fn take_all_validated_spans() -> std::collections::HashSet<(usize, usize)> {
/// into the seed map always specify the owning body's id; readers look
/// up by the scope they know they want (typically their own
/// `parent_body_id`, with a fallback to `BodyId(0)` for entries that
/// the JS/TS two-level solve has re-keyed onto the top-level scope
/// the JS/TS two-level solve has re-keyed onto the top-level scope;
/// see [`crate::taint::ssa_transfer::filter_seed_to_toplevel`]).
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
pub struct BindingKey {
@ -284,7 +284,7 @@ impl BindingKey {
/// Look up a binding in a seed map.
///
/// Thin wrapper over [`HashMap::get`] retained for call-site readability
/// every seed entry is now exactly scoped to a single `(name,
/// now that every seed entry is exactly scoped to a single `(name,
/// BodyId)`, so the lookup is O(1) with no fallback. Writers that want
/// cross-scope reachability must explicitly re-key their entries (see
/// [`crate::taint::ssa_transfer::filter_seed_to_toplevel`]).
@ -299,7 +299,7 @@ pub fn seed_lookup<'a>(
/// Compact key for a heap-field taint cell.
///
/// `(loc, field)` `loc` is the abstract location of the *parent*
/// `(loc, field)`: `loc` is the abstract location of the *parent*
/// (interned by the body's [`crate::pointer::LocInterner`]), `field`
/// is the [`FieldId`] of the projected field. The pair survives lattice
/// joins / leq comparisons by `Ord`-derived sort.
@ -309,16 +309,16 @@ pub struct FieldTaintKey {
pub field: FieldId,
}
/// Pointer-Phase 4 / W4: per-field-cell taint record.
/// Per-field-cell taint record.
///
/// Carries the union of writers' taint for the abstract field cell plus
/// two validation channels:
/// * `validated_must` set when *every* writer recorded a value that was
/// * `validated_must`, set when *every* writer recorded a value that was
/// `validated_must` in its own SSA scope. Lattice join intersects
/// (`AND`) matching the symbol-keyed [`SsaTaintState::validated_must`]
/// (`AND`), matching the symbol-keyed [`SsaTaintState::validated_must`]
/// semantics for "validated on every path".
/// * `validated_may` set when *any* writer recorded a `validated_may`
/// value. Lattice join unions (`OR`) matching the symbol-keyed
/// * `validated_may`, set when *any* writer recorded a `validated_may`
/// value. Lattice join unions (`OR`), matching the symbol-keyed
/// [`SsaTaintState::validated_may`] semantics for "validated on some
/// path".
///
@ -332,7 +332,7 @@ pub struct FieldCell {
}
impl FieldCell {
/// Construct a cell with no validation bits convenience for the
/// Construct a cell with no validation bits; a convenience for the
/// pre-W4 callers that don't propagate symbol-level validation.
pub fn unvalidated(taint: VarTaint) -> Self {
Self {
@ -365,17 +365,17 @@ pub struct SsaTaintState {
/// interpretation is disabled (`analysis.engine.abstract_interpretation
/// = false`).
pub abstract_state: Option<AbstractState>,
/// Pointer-Phase 3: per-heap-field taint cells, keyed by
/// Per-heap-field taint cells, keyed by
/// `(parent_loc, field)`. Sorted by [`FieldTaintKey`] for O(n)
/// merge-join. Populated only when the body's
/// [`crate::pointer::PointsToFacts`] is available
/// (`NYX_POINTER_ANALYSIS=1`); empty otherwise so the lattice join
/// is a strict no-op for pointer-disabled runs. Field reads
/// (`SsaOp::FieldProj`) consult the cells; field writes record into
/// them. Cross-call propagation lands in Phase 5 via the
/// them. Cross-call propagation lands during lowering via the
/// field-granularity `PointsToSummary`.
///
/// Cell shape (Phase 4 / W4): [`FieldCell`] carries `taint` plus
/// Cell shape: [`FieldCell`] carries `taint` plus
/// `validated_must` / `validated_may` flags so validation flows
/// through abstract field / element identity.
pub field_taint: SmallVec<[(FieldTaintKey, FieldCell); 4]>,
@ -403,7 +403,7 @@ impl SsaTaintState {
}
}
/// Pointer-Phase 3: read the field cell at `key`. Returns `None`
/// Read the field cell at `key`. Returns `None`
/// when no cell has been recorded (caller should treat as
/// untainted). O(log n) on the sorted [`field_taint`] list.
pub fn get_field(&self, key: FieldTaintKey) -> Option<&FieldCell> {
@ -413,13 +413,13 @@ impl SsaTaintState {
.map(|idx| &self.field_taint[idx].1)
}
/// Pointer-Phase 3 / W4: union `t` into the field cell at `key`,
/// Union `t` into the field cell at `key`,
/// recording per-write `validated_must` / `validated_may` channels.
///
/// Maintains sorted invariant. No-op when `t.caps` is empty (so the
/// lattice bottom stays `[]`). When the cell already exists, the
/// validation channels merge with the lattice-join semantics
/// `must` AND-intersects, `may` OR-unions matching the symbol-
/// validation channels merge with the lattice-join semantics:
/// `must` AND-intersects, `may` OR-unions, matching the symbol-
/// keyed [`SsaTaintState::validated_must`] / `validated_may`
/// semantics so a write coming through a non-validated path tears
/// down `must` while preserving `may` of any earlier validated path.
@ -563,15 +563,15 @@ impl Lattice for SsaTaintState {
}
}
/// Pointer-Phase 3 / W4: merge-join two sorted `field_taint` lists.
/// Merge-join two sorted `field_taint` lists.
/// Same shape as [`merge_join_ssa_vars`] but keyed on [`FieldTaintKey`]:
/// * `taint.caps` OR-union
/// * `taint.origins` merged with cap-respecting de-dup
/// * `taint.uses_summary` OR-union
/// * `validated_must` AND-intersect (matches the symbol-keyed
/// * `taint.caps`, OR-union
/// * `taint.origins`, merged with cap-respecting de-dup
/// * `taint.uses_summary`, OR-union
/// * `validated_must`, AND-intersect (matches the symbol-keyed
/// `validated_must` lattice: a path that didn't validate this cell
/// breaks the invariant)
/// * `validated_may` OR-union (any path's validation contributes)
/// * `validated_may`, OR-union (any path's validation contributes)
pub(super) fn merge_join_field_taint(
a: &[(FieldTaintKey, FieldCell)],
b: &[(FieldTaintKey, FieldCell)],
@ -581,7 +581,7 @@ pub(super) fn merge_join_field_taint(
while i < a.len() && j < b.len() {
match a[i].0.cmp(&b[j].0) {
std::cmp::Ordering::Less => {
// Cell present only in `a` counterpart in `b` is the
// Cell present only in `a`; its counterpart in `b` is the
// lattice bottom (no validation, no taint), so:
// must = a.must AND false = false
// may = a.may OR false = a.may
@ -637,11 +637,11 @@ pub(super) fn merge_join_field_taint(
/// `a ≤ b` for sorted `field_taint` lists. Used by the convergence
/// check in [`Lattice::leq`]. Per-cell criteria:
///
/// * `taint.caps` `a ⊆ b` (sub-state on caps; matches per-SSA-value
/// * `taint.caps`, `a ⊆ b` (sub-state on caps; matches per-SSA-value
/// `ssa_vars_leq`).
/// * `validated_must` `a.must ⊇ b.must` (super-state on must; same
/// * `validated_must`, `a.must ⊇ b.must` (super-state on must; same
/// shape as the symbol-keyed `validated_must` leq).
/// * `validated_may` `a.may ⊆ b.may` (sub-state on may).
/// * `validated_may`, `a.may ⊆ b.may` (sub-state on may).
///
/// When `b` lacks a key present in `a`, `b`'s side is the lattice
/// bottom: no caps, no validation. `a`'s caps must also be empty
@ -669,12 +669,12 @@ pub(super) fn field_taint_leq(
if (ca.taint.caps - cb.taint.caps).bits() != 0 {
return false;
}
// Must: a ⊇ b every must-validated key in b is must-validated
// Must: a ⊇ b, every must-validated key in b is must-validated
// in a. Equivalently: !cb.must OR ca.must.
if cb.validated_must && !ca.validated_must {
return false;
}
// May: a ⊆ b every may-validated key in a is may-validated
// May: a ⊆ b, every may-validated key in a is may-validated
// in b. Equivalently: !ca.may OR cb.may.
if ca.validated_may && !cb.validated_may {
return false;
@ -735,7 +735,7 @@ pub(super) fn merge_join_ssa_vars(
///
/// Ordering is lexicographic over
/// `(source_span_start, source_span_end, source_kind_tag, node_index)`.
/// `source_span` is the most stable component across bodies cross-body
/// `source_span` is the most stable component across bodies, cross-body
/// remapped origins carry the original byte span explicitly; intra-body
/// origins default to `(0, 0)` and fall through to the secondary keys.
///
@ -760,7 +760,7 @@ fn origin_sort_key(o: &TaintOrigin) -> (usize, usize, u8, usize) {
/// Bounded, deterministic insertion of an origin into a sorted origin
/// set. Returns `true` when `new` was admitted (or de-duplicated against
/// an existing entry), `false` when the cap forced a drop. On drop,
/// the origin with the *largest* sort key is evicted first the caller
/// the origin with the *largest* sort key is evicted first, the caller
/// sees a survivor set that depends only on the input multiset and
/// [`effective_max_origins`], not on insertion order.
///
@ -774,7 +774,7 @@ pub(crate) fn push_origin_bounded(
) -> bool {
// Identity check: same node counts as the same origin. We keep
// node-only dedup to match [`ssa_vars_leq`], which compares origin
// sets by node membership widening dedup here without tightening
// sets by node membership, widening dedup here without tightening
// there would break the monotonicity invariant.
if target.iter().any(|o| o.node == new.node) {
return true;
@ -814,7 +814,7 @@ pub(crate) fn push_origin_bounded(
target.insert(pos, new);
true
} else {
// `new` itself is the worst drop it instead of the survivor.
// `new` itself is the worst, drop it instead of the survivor.
false
}
}
@ -829,7 +829,7 @@ pub(super) fn merge_origins(
a: &SmallVec<[TaintOrigin; 2]>,
b: &SmallVec<[TaintOrigin; 2]>,
) -> SmallVec<[TaintOrigin; 2]> {
// Seed the result with `a` but re-sort defensively in case the
// Seed the result with `a`, but re-sort defensively in case the
// caller constructed `a` through non-bounded paths. Historically
// every write goes through `push_origin_bounded` (or `merge_origins`
// itself), so this resort is a no-op on the steady state but costs
@ -911,7 +911,7 @@ pub(super) fn merge_join_ssa_predicates(
mod origin_cap_tests {
//! Tests for the deterministic, config-driven origin cap. These
//! cover the behavior at the `push_origin_bounded` / `merge_origins`
//! boundary the end-to-end engine-note signal is exercised in
//! boundary, the end-to-end engine-note signal is exercised in
//! `tests/engine_notes_tests.rs`.
use super::*;
@ -1037,7 +1037,7 @@ mod origin_cap_tests {
fn effective_cap_reads_runtime_config_when_override_zero() {
// Override takes priority; override=0 falls through to config.
// `current()` returns the default (32) when no runtime is
// installed which is the state the rest of the test suite runs
// installed, which is the state the rest of the test suite runs
// under. Guard that the fallback path reaches 32.
let _g = TEST_GUARD.lock().unwrap_or_else(|e| e.into_inner());
set_max_origins_override(0);
@ -1053,7 +1053,7 @@ mod origin_cap_tests {
#[cfg(test)]
mod field_taint_tests {
//! Pointer-Phase 3: tests for the heap-field taint cells on
//! Tests for the heap-field taint cells on
//! [`SsaTaintState`]. Cover get/add round-trip, lattice join
//! (cap union + origin merge), and `leq` convergence semantics.
use super::*;
@ -1202,7 +1202,7 @@ mod field_taint_tests {
assert!(cell.validated_must, "a.must AND b.must = true");
assert!(cell.validated_may);
// Now make `b`'s validated_must false must should drop to
// Now make `b`'s validated_must false, must should drop to
// false on the join, may stays at OR.
let mut c = SsaTaintState::initial();
c.add_field(k, taint(Cap::ENV_VAR), false, true);
@ -1213,7 +1213,7 @@ mod field_taint_tests {
}
/// W4 audit: `merge_join_field_taint` OR-unions `validated_may`
/// any path's may-validation contributes to the joined cell.
/// (any path's may-validation contributes to the joined cell).
#[test]
fn lattice_validated_may_unions_on_join() {
let k = key(1, 7);
@ -1275,7 +1275,7 @@ mod field_taint_tests {
a.leq(&b),
"must super-state and equal caps: a ≤ b should hold"
);
// Reverse: b.must=false, a.must=true for b ≤ a, we need
// Reverse: b.must=false, a.must=true, for b ≤ a, we need
// b.must ⊇ a.must which is false ⊇ true = false. So b ≤ a
// must fail.
assert!(!b.leq(&a), "b lacks the must invariant a holds");
@ -1289,7 +1289,7 @@ mod field_taint_tests {
assert!(!a2.leq(&b2), "a.may=true is NOT ⊆ b.may=false");
}
/// Pointer-Phase 3 / A8 audit: the field_taint lattice is monotone
/// The field_taint lattice is monotone
/// and converges under a deterministic enumeration of inputs.
/// Caps grow (OR), `uses_summary` grows (OR), origins grow modulo
/// the cap (merge_origins is bounded). Joins must:
@ -1409,7 +1409,7 @@ mod field_taint_tests {
/// `field_taint_leq` is the soundness gate for worklist
/// convergence: once `next ≤ acc`, the worklist halts. Pin that
/// `leq` is consistent with `join` i.e. `s.leq(s.join(t))` holds
/// `leq` is consistent with `join`, i.e. `s.leq(s.join(t))` holds
/// for any `s, t`. Without this, the worklist could loop
/// indefinitely on inputs whose join produces a state not
/// dominated by both inputs.

View file

@ -1,11 +1,11 @@
//! SSA function-summary and container-flow extraction.
//!
//! Extracted from the monolithic `ssa_transfer.rs`. Contains:
//! * [`extract_ssa_func_summary`] runs per-parameter taint probes and
//! * [`extract_ssa_func_summary`], runs per-parameter taint probes and
//! synthesises an [`crate::summary::ssa_summary::SsaFuncSummary`] with
//! source caps, return transforms, per-path transforms, and sink site
//! attribution.
//! * [`extract_container_flow_summary`] structural scan for
//! * [`extract_container_flow_summary`], structural scan for
//! `param_container_to_return` + `param_to_container_store` pairs.
//! * Private helpers for predicate-hash summarisation, abstract-transfer
//! derivation, callback source detection, and return-type inference.
@ -123,15 +123,15 @@ pub fn extract_ssa_func_summary_full(
.collect();
// Collect all param SSA values to exclude from return cap collection.
// Param values persist with their seeded taint throughout the function
// Param values persist with their seeded taint throughout the function;
// we only want caps on derived values (call results, assigns) at return.
let all_param_values: std::collections::HashSet<SsaValue> =
param_info.iter().map(|(_, _, v)| *v).collect();
// Per-return-block observation captured alongside the aggregate return
// caps. Each entry records one return block's exit state caps
// caps. Each entry records one return block's exit state, caps
// contributed on that path, path-predicate hash, known_true/false bits,
// and the return SSA value's abstract fact so the per-param loop can
// and the return SSA value's abstract fact, so the per-param loop can
// emit one [`ReturnPathTransform`] per distinct predicate gate.
struct ReturnBlockObs {
/// Caps at the return SSA value (or joined live values for
@ -141,7 +141,7 @@ pub fn extract_ssa_func_summary_full(
/// (passthrough fallback).
param_caps: Cap,
/// Deterministic hash of the predicate gate at this return.
/// `0` means "no predicate gate" an unguarded return.
/// `0` means "no predicate gate", an unguarded return.
predicate_hash: u64,
/// `PredicateSummary::known_true` bits intersected across all
/// tracked variables at this return. Encoded via
@ -268,7 +268,7 @@ pub fn extract_ssa_func_summary_full(
}
}
} else {
// Return(None): implicit return fall back to all live values.
// Return(None): implicit return, fall back to all live values.
for (val, taint) in &exit.values {
if all_param_values.contains(val) {
block_param_caps |= taint.caps;
@ -348,7 +348,7 @@ pub fn extract_ssa_func_summary_full(
// Per-return-path PathFact decomposition derived from the baseline
// probe (no seeded taint). Abstract facts on the return rv are
// independent of taint seeding they describe the function's
// independent of taint seeding, they describe the function's
// intrinsic narrowing, so the baseline run captures them without
// per-param noise.
//
@ -388,7 +388,7 @@ pub fn extract_ssa_func_summary_full(
let mut param_to_sink: Vec<(usize, SmallVec<[SinkSite; 1]>)> = Vec::new();
let mut param_to_sink_param = Vec::new();
// Per-param return-path decomposition. Populated only when the param
// has ≥2 distinct return-block predicate hashes a single-return-path
// has ≥2 distinct return-block predicate hashes, a single-return-path
// callee is already precise via `param_to_return`.
let mut param_return_paths: Vec<(
usize,
@ -417,7 +417,7 @@ pub fn extract_ssa_func_summary_full(
// expressions (e.g. `file._source.uri`) as their own
// [`SsaOp::Param`] ops with composite `var_name`s like
// `"file._source.uri"`. These phantom Params are the values
// actually used as call arguments not the formal-param SSA
// actually used as call arguments, not the formal-param SSA
// value the seed targets. Without this, the per-param probe
// misses cross-call sinks because the call's arg SSA value is
// a phantom Param with no seed entry, so `transfer_inst::Param`
@ -447,7 +447,7 @@ pub fn extract_ssa_func_summary_full(
let (return_caps, events, _, per_return_obs) = run_probe(seed);
// Subtract baseline source_caps we only want param-contributed caps
// Subtract baseline source_caps, we only want param-contributed caps
let param_return_caps = return_caps & !source_caps;
if !param_return_caps.is_empty() {
@ -464,7 +464,7 @@ pub fn extract_ssa_func_summary_full(
// observed return block, derive a `ReturnPathTransform` mirroring
// the aggregate logic (prefer derived caps, fall back to param
// caps, strip baseline source caps). Only emit when ≥2 distinct
// predicate hashes are present a single-hash summary adds no
// predicate hashes are present, a single-hash summary adds no
// signal over the aggregate `param_to_return`.
if per_return_obs.len() >= 2 {
let mut per_path: SmallVec<[crate::summary::ssa_summary::ReturnPathTransform; 2]> =
@ -477,7 +477,7 @@ pub fn extract_ssa_func_summary_full(
};
let block_contributed = block_return_caps & !source_caps;
let transform_kind = if block_contributed.is_empty() {
// No caps on this path param does not reach return
// No caps on this path, param does not reach return
// under this predicate. A `StripBits(all)` records
// "all bits cleared" so downstream join preserves the
// disparity with other paths.
@ -513,9 +513,31 @@ pub fn extract_ssa_func_summary_full(
}
}
// Collect sink caps + primary-location sites from events + per-arg-position detail
// Collect sink caps + primary-location sites from events + per-arg-position detail.
//
// Skip events flagged `all_validated`: every tainted SSA value
// that reached the sink was already proved validated by a
// dominating predicate (AllowlistCheck / TypeCheck /
// ValidationCall, including the indirect-validator branch
// narrowing for `validate*` / `is_valid*` callees). Those
// events would have been dropped by `ssa_events_to_findings` at
// the per-file finding step; carrying them into
// `param_to_sink` / `param_to_sink_param` re-publishes a sink
// attribution callers can no longer suppress, because the
// caller can't see the validator that lives inside the
// callee body.
//
// Strict-additive: `all_validated` is set only when every
// tainted operand at the sink has its `var_name` in
// `state.validated_may`, single-path single-validator helpers
// cleanly skip; mixed-tainted-with-some-unvalidated events
// still propagate. Closes the helper-summary precision gap
// surfaced by Novu CVE GHSA-4x48-cgf9-q33f.
let mut param_sites: SmallVec<[SinkSite; 1]> = SmallVec::new();
for event in &events {
if event.all_validated {
continue;
}
for pos in extract_sink_arg_positions(event, ssa) {
param_to_sink_param.push((idx, pos, event.sink_caps));
}
@ -601,14 +623,14 @@ pub fn extract_ssa_func_summary_full(
// Per-parameter abstract-domain transfers.
//
// Derived structurally from the SSA body no additional taint probes.
// Derived structurally from the SSA body, no additional taint probes.
// Three-step inference per parameter:
// 1. Identity: return SSA value at every return block traces back to
// this parameter (possibly through assigns / phi merges all feeding
// from the same param).
// 2. Callee-intrinsic bound: baseline `return_abstract` carries a
// concrete fact (bounded interval or known prefix) that holds
// regardless of caller input record it once per parameter as
// regardless of caller input, record it once per parameter as
// `Clamped` / `LiteralPrefix` so the caller sees the bound even
// when it has no abstract info on its own argument.
// 3. Top: default; the entry is omitted (empty transfer is meaningless).
@ -630,14 +652,14 @@ pub fn extract_ssa_func_summary_full(
param_return_paths,
return_path_facts,
points_to,
// Pointer-Phase 5 extension — empty until the field-granularity
// Extension field: empty until the field-granularity
// extractor is wired (`NYX_POINTER_ANALYSIS=1` only). Default
// path stays bit-identical to today.
field_points_to: crate::summary::points_to::FieldPointsToSummary::empty(),
// Populated post-extraction in
// `taint::lower_all_functions_from_bodies` once SSA optimisation
// has computed `opt.type_facts`. Empty here means the
// extractor itself doesn't carry receiver-type info the
// extractor itself doesn't carry receiver-type info, the
// caller patches it in.
typed_call_receivers: Vec::new(),
}
@ -699,14 +721,14 @@ pub(super) fn summarise_return_predicates(state: &SsaTaintState) -> (u64, u8, u8
///
/// `return_abstract` is the callee's intrinsic baseline (from the no-seed
/// probe). When present, it describes a fact that holds for the return
/// regardless of parameter input so it can be attached as a
/// regardless of parameter input, so it can be attached as a
/// `Clamped` / `LiteralPrefix` transform to every parameter that flows to
/// the return.
///
/// Identity detection is structural: walk the return values back through
/// [`SsaOp::Assign`] / [`SsaOp::Phi`] chains (bounded) and check whether
/// every leaf resolves to the same [`SsaOp::Param`]. The trace is cheap
/// and can only produce `Identity` for passthrough callees anything
/// and can only produce `Identity` for passthrough callees, anything
/// more complex degrades to the baseline fact or `Top`.
fn derive_abstract_transfer(
ssa: &SsaBody,
@ -780,7 +802,7 @@ fn derive_abstract_transfer(
}
// Derive a baseline-invariant transform from `return_abstract`. This is
// the "callee intrinsic" fact that always holds each parameter that
// the "callee intrinsic" fact that always holds, each parameter that
// flows to the return gets it attached as the conservative transfer.
let baseline_invariant: Option<AbstractTransfer> = return_abstract.map(|av| {
let interval = match (av.interval.lo, av.interval.hi) {
@ -805,7 +827,7 @@ fn derive_abstract_transfer(
} else if let Some(base) = baseline_invariant.as_ref() {
// Baseline intrinsic bound applies to every parameter that could
// reach the return. We conservatively attach it to all params
// at apply time the caller meets it with the real return
// here; at apply time the caller meets it with the real return
// abstract (also from this same summary), so double-counting
// would collapse to the tighter of the two.
transfer = base.clone();
@ -879,7 +901,7 @@ fn infer_summary_return_type(
lang: Lang,
) -> Option<crate::ssa::type_facts::TypeKind> {
// Find blocks with Return terminators, then look at the last defined value
// in those blocks if it's a Call with a known constructor, that's our type.
// in those blocks, if it's a Call with a known constructor, that's our type.
for block in &ssa.blocks {
if !matches!(block.terminator, Terminator::Return(_)) {
continue;
@ -965,7 +987,7 @@ pub(crate) fn extract_container_flow_summary(
// `trace_to_param` will happily return any `SsaOp::Param { index }`, but
// scoped lowering synthesises `Param` ops for external captures (module
// imports, free identifiers) at indices beyond the formal parameter count.
// Those must not enter the summary the key's arity only covers formal
// Those must not enter the summary, the key's arity only covers formal
// params, and an out-of-range index trips `ssa_summary_fits_arity`, forcing
// the reconciliation probe to generate a synthetic disambiguator that no
// caller will ever look up.
@ -1035,7 +1057,7 @@ pub(crate) fn extract_container_flow_summary(
};
// Trace container to positional param (SelfParam → None, so
// when the container is the receiver we skip the caller
// when the container is the receiver we skip; the caller
// tracks that via `receiver_to_container_store` if needed).
// Same arity filter as above: reject synthetic Param ops that
// were injected for free captures.

View file

@ -221,7 +221,7 @@ mod cross_file_tests {
mod inline_cache_epoch_tests {
//! Hooks for cross-file SCC joint fixed-point iteration.
//!
//! These do not exercise the full inline pipeline they lock down the
//! These do not exercise the full inline pipeline, they lock down the
//! semantic contract of [`inline_cache_clear_epoch`] and
//! [`inline_cache_fingerprint`] so the SCC orchestrator can rely on:
//!
@ -229,7 +229,7 @@ mod inline_cache_epoch_tests {
//! * `fingerprint` is deterministic across equivalent caches (same
//! keys → same bytes). Two caches with identical entries produce
//! identical fingerprints regardless of insertion order.
//! * `fingerprint` changes when return caps change the signal the
//! * `fingerprint` changes when return caps change, the signal the
//! orchestrator will use to detect inline-cache convergence.
use super::super::*;
@ -675,7 +675,7 @@ mod worklist_tests {
#[test]
fn dense_successors_no_duplicates() {
// Many successors, some repeated old O(n) contains() would be slow here
// Many successors, some repeated, old O(n) contains() would be slow here
let mut wl = VecDeque::new();
let mut in_wl = HashSet::new();
@ -735,8 +735,8 @@ mod primary_sink_location_tests {
//! [`SsaTaintEvent::primary_sink_site`] →
//! [`crate::taint::Finding::primary_location`].
//!
//! The test is deliberately low-level it wires up synthetic SSA and
//! drives the three emission stages directly so any future refactor
//! The test is deliberately low-level, it wires up synthetic SSA and
//! drives the three emission stages directly, so any future refactor
//! that drops the site on the floor between stages fails here rather
//! than only at the corpus/benchmark layer.
use super::super::*;
@ -841,7 +841,7 @@ mod primary_sink_location_tests {
/// If this fails, something on the summary→event→finding path
/// (`pick_primary_sink_sites`, `emit_ssa_taint_events`, or
/// `ssa_events_to_findings`) has silently stopped forwarding
/// coordinates. Fixing that path — not this test — is the right
/// coordinates. Fixing that path, not this test, is the right
/// response.
#[test]
fn ssa_summary_sinksite_surfaces_as_finding_primary_location() {
@ -863,7 +863,7 @@ mod primary_sink_location_tests {
};
// Drive the three emission stages with the summary's own
// `param_to_sink` that is what summary resolution feeds in the
// `param_to_sink`, that is what summary resolution feeds in the
// real pipeline.
let tainted: Vec<(SsaValue, Cap, SmallVec<[TaintOrigin; 2]>)> = vec![(
SsaValue(0),
@ -944,7 +944,7 @@ mod goto_succ_propagation_tests {
#[test]
fn goto_propagates_to_every_succ_on_three_way_collapse() {
// Build a block with Terminator::Goto(1) but succs = [1, 2, 3] the
// Build a block with Terminator::Goto(1) but succs = [1, 2, 3], the
// shape lowering emits for a 3-way fanout.
let block = SsaBlock {
id: BlockId(0),
@ -1001,7 +1001,7 @@ mod goto_succ_propagation_tests {
pointer_facts: None,
};
// A non-bottom exit state the test only cares that *every* succ
// A non-bottom exit state, the test only cares that *every* succ
// receives a clone of it, so any distinguishable state works.
let mut exit_state = SsaTaintState::initial();
exit_state.values.push((
@ -1259,7 +1259,7 @@ mod goto_succ_propagation_tests {
/// A `PathFact` with only the dotdot axis cleared must not count as path-safe.
fn is_path_safe_for_sink_unknown_axis_returns_false() {
use crate::abstract_interp::PathFact;
// Only dotdot is cleared  absolute stays Maybe → not path-safe.
// Only dotdot is cleared; absolute stays Maybe → not path-safe.
let half_fact = PathFact::default().with_dotdot_cleared();
assert!(!half_fact.is_path_safe());
}
@ -1328,9 +1328,9 @@ mod goto_succ_propagation_tests {
}
}
// ── Phase 4.2: receiver_candidates_for_type_lookup walks FieldProj ──────
// ── receiver_candidates_for_type_lookup walks FieldProj ──────
//
// After Phase 2 SSA decomposition, `c.client.send(req)` lowers to
// After SSA decomposition, `c.client.send(req)` lowers to
// v_c = Param("c", 0)
// v_client = FieldProj(v_c, "client")
// v_call = Call("send", receiver: v_client, args: [v_req])
@ -1430,7 +1430,7 @@ mod receiver_candidates_field_proj_tests {
fn field_proj_receiver_walks_to_typed_root_in_go() {
// Go is not Rust, so pre-Phase-4 the candidate walk would have
// returned ONLY the immediate receiver (v2 = FieldProj). With
// Phase 4 we walk through FieldProj.receiver to recover v0 (the
// the FieldProj walk, we follow FieldProj.receiver to recover v0 (the
// typed root `c`).
let body = body_with_field_proj_chain();
let cands =
@ -1516,7 +1516,7 @@ mod receiver_candidates_field_proj_tests {
}
}
// ── Phase 6 hierarchy fan-out: ResolvedSummary union semantics ──────────
// ── Hierarchy: ResolvedSummary union semantics ──────────
//
// `merge_resolved_summaries_fanout` is invoked at virtual-dispatch call
// sites where the receiver's static type has multiple concrete
@ -1553,7 +1553,7 @@ mod fanout_merge_tests {
}
}
/// B1 caps that grow taint signal (source/sink/receiver_to_sink)
/// B1, caps that grow taint signal (source/sink/receiver_to_sink)
/// are unioned. sanitizer_caps are intersected so only bits
/// stripped by EVERY implementer count as cleared at the call site.
#[test]
@ -1581,7 +1581,7 @@ mod fanout_merge_tests {
);
}
/// B2 propagates_taint is OR'd; propagating_params is the union
/// B2, propagates_taint is OR'd; propagating_params is the union
/// (any implementer's propagator counts).
#[test]
fn merge_propagation_unions() {
@ -1600,7 +1600,7 @@ mod fanout_merge_tests {
assert_eq!(params, vec![0, 1, 2]);
}
/// B3 param_to_sink merges per-parameter caps (OR). An impl
/// B3, param_to_sink merges per-parameter caps (OR). An impl
/// that adds a sink at param N composes with another impl that
/// adds a different cap at the same N.
#[test]
@ -1630,7 +1630,7 @@ mod fanout_merge_tests {
);
}
/// B4 param_to_sink_sites merges per-parameter site lists with
/// B4, param_to_sink_sites merges per-parameter site lists with
/// PartialEq dedup. The same site appearing in both impls (e.g.
/// inherited definition) must not be reported twice.
#[test]
@ -1675,7 +1675,7 @@ mod fanout_merge_tests {
assert!(sites.iter().any(|s| s == &unique_b));
}
/// B5 SSA-precision fields are dropped on disagreement. Two
/// B5, SSA-precision fields are dropped on disagreement. Two
/// summaries with different `return_type` collapse to None;
/// agreement is preserved.
#[test]
@ -1704,7 +1704,7 @@ mod fanout_merge_tests {
);
}
/// B6 abstract_transfer + param_return_paths drop on
/// B6, abstract_transfer + param_return_paths drop on
/// disagreement (precise predicate-path data is not safely
/// composable across distinct function bodies).
#[test]
@ -1737,7 +1737,7 @@ mod fanout_merge_tests {
);
}
/// B7 empty + empty = empty (no panic on degenerate inputs).
/// B7, empty + empty = empty (no panic on degenerate inputs).
#[test]
fn merge_empties_is_identity() {
let m = merge_resolved_summaries_fanout(empty(), empty());
@ -1748,7 +1748,7 @@ mod fanout_merge_tests {
}
}
// ── Pointer-Phase 3 / W1: synthetic field-WRITE round-trip ──────────────
// ── Synthetic field-WRITE round-trip ──────────────
//
// SSA lowering populates `SsaBody.field_writes` with entries that lift a
// synthetic base-update Assign (`obj.f = rhs`) into a structural field
@ -1918,8 +1918,8 @@ mod field_write_tests {
crate::pointer::analyse_body(body, crate::cfg::BodyId(7))
}
/// Reuse `make_cfg`'s nodes the body's instructions all reference
/// them so `transfer_inst` can index `cfg[cfg_node]`.
/// Reuse `make_cfg`'s nodes, the body's instructions all reference
/// them, so `transfer_inst` can index `cfg[cfg_node]`.
fn drive(body: &SsaBody, pf: &PointsToFacts) -> SsaTaintState {
// We need a CFG that contains the bodies' cfg_nodes.
let (cfg, _, _, _, _) = make_cfg();
@ -1998,7 +1998,7 @@ mod field_write_tests {
/// Pointer-disabled run (`pointer_facts: None`): no field cell is
/// recorded, no taint flows through the `obj.cache` projection. The
/// strict-additive contract pointer-disabled behaviour is the
/// strict-additive contract, pointer-disabled behaviour is the
/// pre-W1 baseline.
#[test]
fn pointer_disabled_run_produces_no_field_taint() {
@ -2047,8 +2047,8 @@ mod field_write_tests {
state.field_taint.is_empty(),
"pointer-disabled run must not populate field_taint",
);
// FieldProj reads still produce the receiver's existing taint
// none so no entry for SsaValue(3) either.
// FieldProj reads still produce the receiver's existing taint
// (none), so no entry for SsaValue(3) either.
assert!(state.get(SsaValue(3)).is_none());
let _ = cache_id;
}
@ -2059,7 +2059,7 @@ mod field_write_tests {
/// projected value's symbol-level `validated_must` from the cell.
///
/// This is the key invariant: validation flows *through* abstract
/// field identity the read recovers what the write recorded.
/// field identity, the read recovers what the write recorded.
#[test]
fn write_then_read_preserves_validated_must() {
let (body, cache_id) = make_body();
@ -2208,7 +2208,7 @@ mod field_write_tests {
},
};
let pf = crate::pointer::analyse_body(&body, crate::cfg::BodyId(0));
// v0 is Const → empty pt the hook should not insert anything.
// v0 is Const → empty pt, the hook should not insert anything.
assert!(
pf.pt(SsaValue(0)).is_empty(),
"Const value should have empty pt set",
@ -2259,7 +2259,7 @@ mod field_write_tests {
}
}
// ── Pointer-Phase 4 / W2: container ELEM write/read round-trip ──────────
// ── Container ELEM write/read round-trip ──────────
//
// Container methods like `arr.push(v)` / `arr.shift()` flow per-element
// taint through the `Field(_, ELEM)` cells on `SsaTaintState`. These
@ -2351,7 +2351,7 @@ mod container_elem_tests {
state
}
/// `arr.push(source()); arr.shift()` the read picks the source's
/// `arr.push(source()); arr.shift()`, the read picks the source's
/// caps up via the ELEM cell.
#[test]
fn container_write_then_read_round_trips_taint() {
@ -2456,7 +2456,7 @@ mod container_elem_tests {
);
// Drive the transfer. `e := arr.shift()` goes through the
// existing Call arm the W2 path is the *write* on `push`.
// existing Call arm, the W2 path is the *write* on `push`.
// The element-read side already exists on `analyse_body`; the
// taint engine doesn't yet read field cells through call-result
// paths (Call args are walked by Call's own argument-taint
@ -2482,7 +2482,7 @@ mod container_elem_tests {
}
}
/// W4: `arr.push(validate(src)); arr.shift()` the push records
/// W4: `arr.push(validate(src)); arr.shift()`, the push records
/// `validated_must = true` on the ELEM cell because the pushed
/// value's symbol carried `validated_must`. The shift call result
/// reads through the cell and seeds the result symbol's
@ -2761,7 +2761,7 @@ mod container_elem_tests {
}
}
// ── Pointer-Phase 5 / W3: cross-call field-points-to application ────────
// ── Cross-call field-points-to application ────────
//
// `apply_field_points_to_writes` is the resolver-side hook that turns
// callee-summary `field_points_to.param_field_writes` into caller-side
@ -2783,7 +2783,7 @@ mod cross_call_field_tests {
use smallvec::smallvec;
use std::collections::HashMap;
/// W3 / W4: shared empty interner these unit tests don't seed
/// W3 / W4: shared empty interner, these unit tests don't seed
/// validation bits, so a fresh interner is sufficient for the
/// `interner` parameter on `apply_field_points_to_writes`.
fn empty_interner() -> SymbolInterner {
@ -2861,23 +2861,23 @@ mod cross_call_field_tests {
state
}
/// Callee summary with `param_field_writes[(0, ["cache"])]`
/// Callee summary with `param_field_writes[(0, ["cache"])]`:
/// "callee writes cache field on parameter 0 (obj)".
/// Caller passes `(obj, source)` to this callee `arg 0 = obj`,
/// Caller passes `(obj, source)` to this callee, `arg 0 = obj`,
/// but the W3 hook resolves the *value at arg position 0* as the
/// receiver of the field write, populating its pt's cells.
///
/// We model the caller as `callee(obj, source)` with arg 0 = obj
/// (the receiver) and arg 1 = source (the value being written).
/// The callee's signature is `fn store(obj, value) { obj.cache = value; }`
/// so the field write on param 0 is keyed by `pt(obj)` and the
/// so the field write on param 0 is keyed by `pt(obj)` and the
/// taint comes from arg 1's caps. Our helper conservatively unions
/// every arg's taint into the cell which over-tints (for this
/// every arg's taint into the cell, which over-tints (for this
/// shape, arg 0's pt member becomes the loc, with arg 0's own taint
/// applied), but is sound.
///
/// To make the test precise, we model the simpler shape `fn store(obj)
/// { obj.cache = source(); }` callee writes a literal source into
/// { obj.cache = source(); }`, callee writes a literal source into
/// `obj.cache`, with no value parameter. Then the caller-side hook
/// only sees param 0's taint (zero), so the cell is empty and the
/// test fails.
@ -2886,7 +2886,7 @@ mod cross_call_field_tests {
/// at the call site arg 0 carries source taint. The hook then
/// records (pt(arg0_value), cache) ← arg0_value's taint. In a
/// real callee this corresponds to "callee writes its parameter
/// value into a self.cache field internally" but the spread we
/// value into a self.cache field internally", but the spread we
/// validate is just substitute-and-mirror.
#[test]
fn cross_call_writes_into_param_field_cell() {
@ -2947,7 +2947,7 @@ mod cross_call_field_tests {
fn cross_call_receiver_field_uses_max_sentinel() {
let (body, cache_id, pf) = caller_body();
let mut state = SsaTaintState::initial();
// Seed receiver with taint SsaValue(0) is the param/receiver.
// Seed receiver with taint, SsaValue(0) is the param/receiver.
state.set(
SsaValue(0),
VarTaint {
@ -3026,7 +3026,7 @@ mod cross_call_field_tests {
);
}
/// Field names the caller never interned are skipped silently
/// Field names the caller never interned are skipped silently;
/// no FieldProj read in the caller could observe such a cell.
#[test]
fn cross_call_unknown_field_name_skipped() {
@ -3062,7 +3062,7 @@ mod cross_call_field_tests {
);
}
/// Overflow summary is treated conservatively as no-op the
/// Overflow summary is treated conservatively as no-op, the
/// engine cannot soundly cell-flood, so it skips entirely.
#[test]
fn cross_call_overflow_summary_is_noop() {
@ -3117,7 +3117,7 @@ mod cross_call_field_tests {
//
// `SsaTaintState.add_field` already routes through `merge_origins`, but
// the FieldProj READ path used to walk the cell's origins inline,
// deduping by node only meaning a cell with N>cap origins surfaced
// deduping by node only, meaning a cell with N>cap origins surfaced
// all N to the projected SSA value. After A7, the read path uses
// `push_origin_bounded`, ensuring the cap-driven survivor selection
// applies on read too.
@ -3225,7 +3225,7 @@ mod field_taint_origin_cap_tests {
let (body, cache_id, cfg, _n_proj) = build_body();
let pf = crate::pointer::analyse_body(&body, crate::cfg::BodyId(0));
// Pre-populate the (Param, cache) cell with 4 origins
// Pre-populate the (Param, cache) cell with 4 origins,
// 2× the cap. The `add_field` path already truncates via
// `merge_origins`, so we go through it 4 times to grow.
let mut state = SsaTaintState::initial();
@ -3326,14 +3326,14 @@ mod field_taint_origin_cap_tests {
// the field_taint cells.
//
// Two scenarios:
// 1. `must_validated_flows_through_join` both predecessor blocks
// 1. `must_validated_flows_through_join`: both predecessor blocks
// write the cell with `validated_must = true`. After the join, the
// cell at the read site retains `validated_must = true` (AND
// intersection of two `true`s).
// 2. `early_exit_branch_drops_validated_must` only one predecessor
// 2. `early_exit_branch_drops_validated_must`: only one predecessor
// writes; the other reaches the read block via an empty branch.
// After the join, the cell has `validated_must = false`,
// `validated_may = true` W4's must/may intersection in action.
// `validated_may = true`, W4's must/may intersection in action.
#[cfg(test)]
mod pointer_lattice_worklist_tests {
use super::super::*;
@ -3425,7 +3425,7 @@ mod pointer_lattice_worklist_tests {
succs: smallvec![BlockId(1), BlockId(2)],
};
// Block 1: synth `obj.cache = src` field_writes[v2] = (v0, cache_id)
// Block 1: synth `obj.cache = src`, field_writes[v2] = (v0, cache_id)
let block1 = SsaBlock {
id: BlockId(1),
phis: vec![],
@ -3441,7 +3441,7 @@ mod pointer_lattice_worklist_tests {
succs: smallvec![BlockId(3)],
};
// Block 2: identical synth write keeps both branches
// Block 2: identical synth write, keeps both branches
// contributing the same cell so AND-intersection of must
// preserves true on the join.
let block2 = SsaBlock {
@ -3459,7 +3459,7 @@ mod pointer_lattice_worklist_tests {
succs: smallvec![BlockId(3)],
};
// Block 3: read FieldProj uses obj from a phi between B1 and B2.
// Block 3: read, FieldProj uses obj from a phi between B1 and B2.
let block3 = SsaBlock {
id: BlockId(3),
phis: vec![SsaInst {
@ -3634,7 +3634,7 @@ mod pointer_lattice_worklist_tests {
);
}
/// A2.b: early-exit branch only B1 writes, B2 reaches B3 via
/// A2.b: early-exit branch, only B1 writes, B2 reaches B3 via
/// an empty body. After the join, the cell exists (B1 wrote
/// it), but `validated_must` is `false` (B2 didn't write, the
/// orphan-side merge clears `must` per the W4 lattice rule);
@ -3642,7 +3642,7 @@ mod pointer_lattice_worklist_tests {
///
/// To exercise the validation channels we synthesise the cell
/// directly at the appropriate exit state, then run the
/// worklist's join via two `SsaTaintState::join()` calls the
/// worklist's join via two `SsaTaintState::join()` calls; the
/// body's worklist itself doesn't seed `validated_must` on the
/// rhs of an Assign, so we model the "writer recorded must=true"
/// scenario at the lattice level rather than driving it through