mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-21 20:18:06 +02:00
Python fp and docs updates (#58)
* refactor: Update comments for clarity and add expectations.json files for performance metrics * feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks * feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks * refactor: Simplify code formatting for better readability in multiple files * refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration * feat: Update Java and Python patterns to include new security rules * refactor: Improve comment clarity and consistency across multiple Rust files * refactor: Simplify code formatting for improved readability in integration tests and module files * refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
parent
4db0805de6
commit
a438886217
291 changed files with 9485 additions and 3851 deletions
|
|
@ -59,7 +59,7 @@ impl BaseAliasResult {
|
|||
///
|
||||
/// For each entry `(dst_val, src_val)` where copy prop replaced `dst` with
|
||||
/// `src`, looks up the original variable names. If both are plain identifiers
|
||||
/// (no dots — i.e. not field paths), they are registered as base aliases.
|
||||
/// (no dots, i.e. not field paths), they are registered as base aliases.
|
||||
/// Transitive closure is computed so `b = a; c = b` yields group `{a, b, c}`.
|
||||
pub fn compute_base_aliases(
|
||||
copy_map: &HashMap<SsaValue, SsaValue>,
|
||||
|
|
@ -103,7 +103,7 @@ pub fn compute_base_aliases(
|
|||
let ra = find(parent, a);
|
||||
let rb = find(parent, b);
|
||||
if ra != rb {
|
||||
// Arbitrary root choice — alphabetically smaller becomes root
|
||||
// Arbitrary root choice, alphabetically smaller becomes root
|
||||
// for determinism.
|
||||
if ra < rb {
|
||||
parent.insert(rb, ra);
|
||||
|
|
@ -130,7 +130,7 @@ pub fn compute_base_aliases(
|
|||
None => continue,
|
||||
};
|
||||
|
||||
// Only alias plain idents — dotted paths (field accesses) are tracked
|
||||
// Only alias plain idents; dotted paths (field accesses) are tracked
|
||||
// independently in SSA and handled by field-aware suppression.
|
||||
if dst_name.contains('.') || src_name.contains('.') {
|
||||
continue;
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ pub enum ConstLattice {
|
|||
Bool(bool),
|
||||
/// Null / nil / None.
|
||||
Null,
|
||||
/// Multiple possible values — not constant.
|
||||
/// Multiple possible values, not constant.
|
||||
Varying,
|
||||
}
|
||||
|
||||
|
|
@ -70,7 +70,7 @@ impl ConstLattice {
|
|||
return ConstLattice::Str(inner.to_string());
|
||||
}
|
||||
|
||||
// Bare string (no quotes) — treat as string constant
|
||||
// Bare string (no quotes), treat as string constant
|
||||
ConstLattice::Str(trimmed.to_string())
|
||||
}
|
||||
|
||||
|
|
@ -283,7 +283,7 @@ fn eval_inst(inst: &SsaInst, values: &HashMap<SsaValue, ConstLattice>) -> ConstL
|
|||
| SsaOp::SelfParam
|
||||
| SsaOp::CatchParam => ConstLattice::Varying,
|
||||
// FieldProj: projecting a field is dynamic with respect to the
|
||||
// const-propagation lattice — there is no general way to fold
|
||||
// const-propagation lattice, there is no general way to fold
|
||||
// `obj.field` to a known scalar at this phase. Returning Varying
|
||||
// matches Call: callers needing field-level constness will go
|
||||
// through the points-to / heap analysis.
|
||||
|
|
@ -452,7 +452,7 @@ fn mark_edge_executable(
|
|||
if executable_blocks.insert(to) {
|
||||
cfg_worklist.push_back(to);
|
||||
} else {
|
||||
// Block already executable but new edge — re-evaluate phis
|
||||
// Block already executable but new edge, re-evaluate phis
|
||||
cfg_worklist.push_back(to);
|
||||
}
|
||||
}
|
||||
|
|
@ -863,7 +863,7 @@ mod tests {
|
|||
|
||||
/// Const parsing must round-trip integer signs. i64::MIN/MAX must
|
||||
/// parse without overflow; arbitrary text falls back to a bare-string
|
||||
/// const (current contract — tested here so a future change is
|
||||
/// const (current contract, tested here so a future change is
|
||||
/// caught explicitly).
|
||||
#[test]
|
||||
fn const_parse_extremes_and_fallback() {
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ pub fn copy_propagate(body: &mut SsaBody, cfg: &Cfg) -> (usize, HashMap<SsaValue
|
|||
if uses.len() == 1 {
|
||||
let src = uses[0];
|
||||
let info = &cfg[inst.cfg_node];
|
||||
// Skip if the node has labels — sanitizers, sources, sinks
|
||||
// Skip if the node has labels: sanitizers, sources, sinks
|
||||
// have semantic meaning that must be preserved.
|
||||
if !info.taint.labels.is_empty() {
|
||||
continue;
|
||||
|
|
@ -244,7 +244,7 @@ mod tests {
|
|||
}
|
||||
|
||||
/// A four-deep copy chain v3 = v2 = v1 = v0 must collapse to v0
|
||||
/// in a single `copy_propagate` pass — the resolved replacement
|
||||
/// in a single `copy_propagate` pass, the resolved replacement
|
||||
/// map drives downstream alias recovery, so the *transitive*
|
||||
/// closure must be exposed, not just the immediate parent.
|
||||
#[test]
|
||||
|
|
@ -390,7 +390,7 @@ mod tests {
|
|||
}
|
||||
|
||||
/// Skip path 2: numeric-length reads (`arr.length`, `map.size`)
|
||||
/// have a different type from their source — propagating through
|
||||
/// have a different type from their source, propagating through
|
||||
/// would erase the Int type fact.
|
||||
#[test]
|
||||
fn copy_through_numeric_length_access_is_not_propagated() {
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ pub fn eliminate_dead_defs(body: &mut SsaBody, cfg: &Cfg) -> usize {
|
|||
/// condition variable. Without counting these, a value used solely by a
|
||||
/// terminator (the canonical case for short helpers like
|
||||
/// `def f(s): return s`) is judged dead, and DCE strips every instruction
|
||||
/// in the body — leaving empty blocks whose terminators reference
|
||||
/// in the body, leaving empty blocks whose terminators reference
|
||||
/// nonexistent SsaValues, breaking downstream analyses (per-return-path
|
||||
/// PathFact narrowing, inline-summary extraction, etc.).
|
||||
fn build_use_counts(body: &SsaBody) -> HashMap<SsaValue, usize> {
|
||||
|
|
@ -170,8 +170,8 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn dead_const_removed() {
|
||||
// v0 = const("42") — unused, should be removed
|
||||
// v1 = source() — must survive even if unused
|
||||
// v0 = const("42"), unused, should be removed
|
||||
// v1 = source(), must survive even if unused
|
||||
let mut cfg: Cfg = Graph::new();
|
||||
let n0 = cfg.add_node(make_cfg_node(StmtKind::Seq));
|
||||
let n1 = cfg.add_node(make_cfg_node(StmtKind::Seq));
|
||||
|
|
@ -228,7 +228,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn dead_sanitizer_label_preserved() {
|
||||
// v0 has a Sanitizer label on its CFG node — must survive even if unused
|
||||
// v0 has a Sanitizer label on its CFG node, must survive even if unused
|
||||
use crate::labels::{Cap, DataLabel};
|
||||
|
||||
let mut cfg: Cfg = Graph::new();
|
||||
|
|
@ -277,7 +277,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn dead_source_label_preserved() {
|
||||
// v0 has a Source label on its CFG node — must survive even if unused
|
||||
// v0 has a Source label on its CFG node, must survive even if unused
|
||||
use crate::labels::{Cap, DataLabel};
|
||||
|
||||
let mut cfg: Cfg = Graph::new();
|
||||
|
|
@ -541,7 +541,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn used_def_preserved() {
|
||||
// v0 = const("42"), v1 = assign(v0) — v0 is used, both survive
|
||||
// v0 = const("42"), v1 = assign(v0), v0 is used, both survive
|
||||
let mut cfg: Cfg = Graph::new();
|
||||
let n0 = cfg.add_node(make_cfg_node(StmtKind::Seq));
|
||||
let n1 = cfg.add_node(make_cfg_node(StmtKind::Seq));
|
||||
|
|
@ -597,7 +597,7 @@ mod tests {
|
|||
}
|
||||
|
||||
/// DCE must NEVER remove a Call instruction even when its result has
|
||||
/// zero uses — calls have side effects (I/O, throws, mutations) that
|
||||
/// zero uses, calls have side effects (I/O, throws, mutations) that
|
||||
/// cannot be modeled as SSA-value uses. This is the conservative
|
||||
/// invariant `is_dead()` enforces; regressing it would silently drop
|
||||
/// real-world code from analysis (sinks, sanitizers expressed as
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@
|
|||
//! Key design:
|
||||
//! - HeapObjectId is keyed by allocation-site SsaValue (deterministic, zero-cost)
|
||||
//! - PointsToSet is bounded to `analysis.engine.max_pointsto` entries
|
||||
//! (default 32, widening on overflow — see [`effective_max_pointsto`]).
|
||||
//! (default 32, widening on overflow, see [`effective_max_pointsto`]).
|
||||
//! Overflow drops emit an [`crate::engine_notes::EngineNote::PointsToTruncated`]
|
||||
//! note and increment [`POINTSTO_TRUNCATION_COUNT`] so operators can
|
||||
//! tell when the cap is firing on their corpus.
|
||||
|
|
@ -16,7 +16,7 @@
|
|||
//! - HeapSlot::Index(u64) for constant-index container access (proven by const propagation)
|
||||
//! - HeapSlot::Elements for coarse element access (push/pop, dynamic index, overflow)
|
||||
//! - Intraprocedural: constant-index sensitivity is guaranteed when const propagation proves it
|
||||
//! - Interprocedural: best-effort — relies on correct const_values threading (already handled)
|
||||
//! - Interprocedural: best-effort, relies on correct const_values threading (already handled)
|
||||
//! - Unknown/unproven indices fall back to Elements (conservative)
|
||||
//! - Analysis runs as a pre-pass in optimize_ssa(), like type_facts
|
||||
|
||||
|
|
@ -32,7 +32,7 @@ use serde::{Deserialize, Serialize};
|
|||
use smallvec::SmallVec;
|
||||
use std::collections::HashMap;
|
||||
|
||||
// Heap origin cap used to be `const MAX_HEAP_ORIGINS: usize = 4` — now
|
||||
// Heap origin cap used to be `const MAX_HEAP_ORIGINS: usize = 4`, now
|
||||
// governed by the shared `analysis.engine.max_origins` knob through
|
||||
// `crate::taint::ssa_transfer::push_origin_bounded`. Unifying the two
|
||||
// lattices behind a single tunable means operators raise *one* value to
|
||||
|
|
@ -47,7 +47,7 @@ static MAX_POINTSTO_OVERRIDE: std::sync::atomic::AtomicUsize =
|
|||
/// Total heap-object members dropped by [`PointsToSet`] truncation since
|
||||
/// the last reset. Captured from `insert`/`union` so tests (and
|
||||
/// operators inspecting scan output) can detect truncation events that
|
||||
/// don't propagate to a finding — e.g. when the cap is tight enough
|
||||
/// don't propagate to a finding, e.g. when the cap is tight enough
|
||||
/// that no taint flow survives to emit a sink event.
|
||||
pub(crate) static POINTSTO_TRUNCATION_COUNT: std::sync::atomic::AtomicUsize =
|
||||
std::sync::atomic::AtomicUsize::new(0);
|
||||
|
|
@ -114,7 +114,7 @@ pub const MAX_TRACKED_INDICES: usize = 8;
|
|||
|
||||
/// Distinguishes constant-index container access from coarse element access.
|
||||
///
|
||||
/// `Elements` is the conservative default — all container elements merge into
|
||||
/// `Elements` is the conservative default, all container elements merge into
|
||||
/// a single taint. `Index(n)` provides per-index precision when the index is
|
||||
/// provably a non-negative integer constant (via the function's own const
|
||||
/// propagation pass).
|
||||
|
|
@ -302,10 +302,10 @@ impl HeapTaint {
|
|||
/// union of per-slot taint), matching the `SsaTaintState` pattern.
|
||||
///
|
||||
/// Load semantics:
|
||||
/// - `load(id, Index(n))`: union of `(id, Index(n))` and `(id, Elements)` —
|
||||
/// - `load(id, Index(n))`: union of `(id, Index(n))` and `(id, Elements)`;
|
||||
/// indexed reads also see taint from dynamic/push operations.
|
||||
/// - `load(id, Elements)`: union of `(id, Elements)` and ALL `(id, Index(*))`
|
||||
/// entries — dynamic reads conservatively see all indexed taint.
|
||||
/// entries, dynamic reads conservatively see all indexed taint.
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub struct HeapState {
|
||||
entries: SmallVec<[((HeapObjectId, HeapSlot), HeapTaint); 4]>,
|
||||
|
|
@ -927,7 +927,7 @@ mod tests {
|
|||
set_max_pointsto_override(4);
|
||||
reset_points_to_observability();
|
||||
|
||||
// a = {0,1,2,3}, b = {4,5,6} — union wants 7 members; cap is 4
|
||||
// a = {0,1,2,3}, b = {4,5,6}, union wants 7 members; cap is 4
|
||||
// so 3 members are dropped. Deterministic order: smallest
|
||||
// ids survive.
|
||||
let mut a = PointsToSet::empty();
|
||||
|
|
@ -1215,7 +1215,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn heap_elements_load_unions_all_indices() {
|
||||
// Store to Index(0) and Index(2) — Elements load should see both
|
||||
// Store to Index(0) and Index(2), Elements load should see both
|
||||
let mut h = HeapState::empty();
|
||||
let id = HeapObjectId(SsaValue(0));
|
||||
h.store(id, HeapSlot::Index(0), Cap::HTML_ESCAPE, &[origin(0)]);
|
||||
|
|
|
|||
|
|
@ -20,33 +20,33 @@
|
|||
//!
|
||||
//! Invariants are split into two groups:
|
||||
//!
|
||||
//! **Group A — SSA integrity (must hold unconditionally):**
|
||||
//! **Group A, SSA integrity (must hold unconditionally):**
|
||||
//!
|
||||
//! 1. `BlockId` indexing — `blocks[i].id == BlockId(i)`
|
||||
//! 1. `BlockId` indexing, `blocks[i].id == BlockId(i)`
|
||||
//! 2. Entry block has no predecessors
|
||||
//! 3. Pred/succ symmetry — `B.succs.contains(S)` ⇔ `S.preds.contains(B)`
|
||||
//! 4. Phi placement — every phi appears in `block.phis` (never in body)
|
||||
//! 5. Phi operand arity — ≤ `block.preds.len()`
|
||||
//! 6. Phi operand sources — every `(pred_bid, _)` operand has
|
||||
//! 3. Pred/succ symmetry, `B.succs.contains(S)` ⇔ `S.preds.contains(B)`
|
||||
//! 4. Phi placement, every phi appears in `block.phis` (never in body)
|
||||
//! 5. Phi operand arity, ≤ `block.preds.len()`
|
||||
//! 6. Phi operand sources, every `(pred_bid, _)` operand has
|
||||
//! `block.preds.contains(pred_bid)`
|
||||
//! 7. Unique SSA definitions — every `SsaValue` is defined at most once
|
||||
//! 7. Unique SSA definitions, every `SsaValue` is defined at most once
|
||||
//! across all phi + body instructions
|
||||
//! 8. `value_defs` coverage — every defined `SsaValue.0` is a valid index
|
||||
//! 8. `value_defs` coverage, every defined `SsaValue.0` is a valid index
|
||||
//! into `value_defs`, and `value_defs[v.0].block` matches the block
|
||||
//! containing the defining instruction
|
||||
//! 9. `cfg_node_map` consistency — every `(node, SsaValue)` pair points
|
||||
//! 9. `cfg_node_map` consistency, every `(node, SsaValue)` pair points
|
||||
//! to an instruction whose `cfg_node == node`
|
||||
//!
|
||||
//! **Group B — terminator and reachability (loose, reflecting lowering):**
|
||||
//! **Group B, terminator and reachability (loose, reflecting lowering):**
|
||||
//!
|
||||
//! 10. Terminator/succs agreement *subset* form:
|
||||
//! * `Goto(t)` → `succs.contains(t)` — extras tolerated
|
||||
//! * `Goto(t)` → `succs.contains(t)`, extras tolerated
|
||||
//! (3-successor collapse fallback)
|
||||
//! * `Branch{t, f, …}` → `succs` contains both `t` and `f`
|
||||
//! * `Return`/`Unreachable` → no constraint on `succs` (CFG may carry
|
||||
//! finally/cleanup continuation edges that downstream analysis
|
||||
//! propagates through)
|
||||
//! 11. Reachability from entry — tolerated exceptions:
|
||||
//! 11. Reachability from entry, tolerated exceptions:
|
||||
//! * blocks that appear as the `catch` side of an exception edge
|
||||
//!
|
||||
//! Group B is deliberately permissive: the SSA body's `succs` field is the
|
||||
|
|
@ -61,8 +61,8 @@ use super::ir::*;
|
|||
|
||||
/// Errors returned by targeted invariant checks.
|
||||
///
|
||||
/// Wraps a list of human-readable violation messages — one per offending
|
||||
/// block — so callers can include every failure in a single panic /
|
||||
/// Wraps a list of human-readable violation messages, one per offending
|
||||
/// block, so callers can include every failure in a single panic /
|
||||
/// warning.
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct InvariantError {
|
||||
|
|
@ -106,12 +106,12 @@ pub fn check_structural_invariants(body: &SsaBody) -> Vec<String> {
|
|||
errors
|
||||
}
|
||||
|
||||
/// Every block carrying an [`SsaOp::CatchParam`] — an exception-handler
|
||||
/// entry — must be reachable from either the function entry (via normal
|
||||
/// Every block carrying an [`SsaOp::CatchParam`], an exception-handler
|
||||
/// entry, must be reachable from either the function entry (via normal
|
||||
/// flow) or from at least one entry in [`SsaBody::exception_edges`].
|
||||
///
|
||||
/// When this fails, the CFG builder has produced an orphan catch block
|
||||
/// that should have been wired up as an exception successor but was not —
|
||||
/// that should have been wired up as an exception successor but was not,
|
||||
/// a real construction bug that otherwise manifests as silent false
|
||||
/// negatives in resource-cleanup / exception-flow findings.
|
||||
pub fn check_catch_block_reachability(body: &SsaBody) -> Result<(), InvariantError> {
|
||||
|
|
@ -252,7 +252,7 @@ fn check_pred_succ_symmetry(body: &SsaBody, errors: &mut Vec<String>) {
|
|||
}
|
||||
|
||||
fn check_terminator_succ_agreement(body: &SsaBody, errors: &mut Vec<String>) {
|
||||
// Group B — loose agreement. See module docs for rationale.
|
||||
// Group B, loose agreement. See module docs for rationale.
|
||||
for block in &body.blocks {
|
||||
match &block.terminator {
|
||||
Terminator::Goto(target) => {
|
||||
|
|
@ -301,7 +301,7 @@ fn check_terminator_succ_agreement(body: &SsaBody, errors: &mut Vec<String>) {
|
|||
}
|
||||
}
|
||||
Terminator::Return(_) | Terminator::Unreachable => {
|
||||
// Loose by design — cleanup/finally continuation edges in
|
||||
// Loose by design, cleanup/finally continuation edges in
|
||||
// `succs` are expected. Downstream consumers (taint
|
||||
// `compute_succ_states`, SCCP `process_terminator`) treat
|
||||
// `succs` as authoritative and propagate across these edges,
|
||||
|
|
@ -443,7 +443,7 @@ fn check_reachability(body: &SsaBody, errors: &mut Vec<String>) {
|
|||
// Multi-root BFS: start from the entry *and* from every catch target
|
||||
// recorded in `exception_edges`. Exception-handler blocks are reached
|
||||
// via stripped exception edges, so from the SSA body's perspective they
|
||||
// look like roots — as does anything transitively reachable from them
|
||||
// look like roots, as does anything transitively reachable from them
|
||||
// (e.g. a `finally` block chained after a `catch`).
|
||||
let mut visited = vec![false; n];
|
||||
let mut stack: Vec<BlockId> = Vec::new();
|
||||
|
|
@ -487,7 +487,7 @@ fn check_reachability(body: &SsaBody, errors: &mut Vec<String>) {
|
|||
/// fingerprint have the same block structure, terminator shape, per-block
|
||||
/// phi/body instruction counts and op-kind sequences. SsaValue numbers are
|
||||
/// not part of the fingerprint, so renumbering between runs does not cause
|
||||
/// spurious diffs — only shape changes do.
|
||||
/// spurious diffs, only shape changes do.
|
||||
///
|
||||
/// Phis are emitted in their natural (insertion) order. Lowering now drives
|
||||
/// phi placement through a `BTreeSet`, so that order is deterministic
|
||||
|
|
|
|||
|
|
@ -24,21 +24,14 @@ pub struct BlockId(pub u32);
|
|||
pub struct FieldId(pub u32);
|
||||
|
||||
impl FieldId {
|
||||
/// Pointer-Phase 4 sentinel for the abstract "any element of a
|
||||
/// container" field. Steensgaard-grade precision: every numeric
|
||||
/// or dynamic index access (`arr[i]`, `arr.shift()`, `map[k]`)
|
||||
/// projects through the same `Field(pt(container), ELEM)` cell so
|
||||
/// per-element taint propagation is independent of the SSA value
|
||||
/// referencing the container.
|
||||
///
|
||||
/// `u32::MAX` is reserved by convention; the per-body
|
||||
/// [`FieldInterner`] never assigns it because interning is
|
||||
/// monotone-ascending from `0` and bodies don't approach 4 billion
|
||||
/// fields. Consumers should compare with `==` rather than reach
|
||||
/// into the wrapped `u32`.
|
||||
/// Sentinel for the abstract "any element of a container" field.
|
||||
/// Every numeric or dynamic index access (`arr[i]`, `arr.shift()`,
|
||||
/// `map[k]`) projects through the same `Field(pt(container), ELEM)`
|
||||
/// cell. `u32::MAX` is reserved; the per-body interner never
|
||||
/// assigns it.
|
||||
pub const ELEM: FieldId = FieldId(u32::MAX);
|
||||
|
||||
/// "Tainted at every field" wildcard sentinel — distinct from
|
||||
/// "Tainted at every field" wildcard sentinel, distinct from
|
||||
/// [`Self::ELEM`] (which is container-element semantics: every
|
||||
/// numeric/dynamic index access projects through it).
|
||||
/// `ANY_FIELD` represents the case where a writeback-shaped sink
|
||||
|
|
@ -91,17 +84,14 @@ impl FieldInterner {
|
|||
/// Read-only lookup: returns the [`FieldId`] for `name` if it has
|
||||
/// already been interned, or `None` otherwise.
|
||||
///
|
||||
/// Used by cross-call resolvers (Pointer-Phase 5 / W3) to avoid
|
||||
/// growing the caller's interner with field names introduced
|
||||
/// solely by the callee summary — such IDs would never be referenced
|
||||
/// by any other instruction in the caller's body, so the cells
|
||||
/// would be write-only and consume space without contributing
|
||||
/// to taint flow.
|
||||
/// Used by cross-call resolvers to avoid growing the caller's
|
||||
/// interner with field names introduced solely by callee summaries
|
||||
/// because such cells would be write-only.
|
||||
pub fn lookup(&self, name: &str) -> Option<FieldId> {
|
||||
// Walk `names` directly so we don't require the post-deserialise
|
||||
// `ensure_lookup()` rebuild before this method is callable.
|
||||
// Callers usually own `&SsaBody` — interning was either done at
|
||||
// lowering time or via `ensure_lookup` post-deserialise — so the
|
||||
// Callers usually own `&SsaBody`, interning was either done at
|
||||
// lowering time or via `ensure_lookup` post-deserialise, so the
|
||||
// hot path goes through the `lookup` table; the linear walk is
|
||||
// a fallback for the (small) deserialised-but-not-rebuilt case.
|
||||
if let Some(&id) = self.lookup.get(name) {
|
||||
|
|
@ -168,7 +158,7 @@ pub enum SsaOp {
|
|||
Call {
|
||||
callee: String,
|
||||
/// Original textual full path when SSA decomposed a chained receiver.
|
||||
/// `None` when the callee was not rewritten — `callee` already holds
|
||||
/// `None` when the callee was not rewritten, `callee` already holds
|
||||
/// the source-level textual form.
|
||||
///
|
||||
/// **Debug / display only.** Analysis code must walk the SSA receiver
|
||||
|
|
@ -188,7 +178,7 @@ pub enum SsaOp {
|
|||
/// Models member-access expressions (`obj.field`) as a first-class SSA
|
||||
/// op. Lowering walks the receiver tree so chained accesses like
|
||||
/// `c.writer.header` produce a chain of `FieldProj` ops with explicit
|
||||
/// per-step receivers — eliminating the textual-prefix parsing that
|
||||
/// per-step receivers, eliminating the textual-prefix parsing that
|
||||
/// previously misclassified deep receivers (the gin/context.go FP).
|
||||
///
|
||||
/// `field` is interned in the owning [`SsaBody`]'s [`FieldInterner`].
|
||||
|
|
@ -223,7 +213,7 @@ pub enum SsaOp {
|
|||
///
|
||||
/// Emitted by SSA lowering as a synthesized instruction in the entry
|
||||
/// block and referenced from phi operands whose incoming edge does
|
||||
/// not carry a definition of the phi's variable — e.g. a try/catch
|
||||
/// not carry a definition of the phi's variable, e.g. a try/catch
|
||||
/// rejoin where a variable is only defined on the normal path, or
|
||||
/// an early-return branch on a later-defined variable.
|
||||
///
|
||||
|
|
@ -269,7 +259,7 @@ pub enum Terminator {
|
|||
/// `targets` lists the per-case successor blocks (order matches the
|
||||
/// source-order of cases in the switch); `default` is the fallback
|
||||
/// branch taken when no case matches. Block `succs` remain the
|
||||
/// authoritative flow set — the terminator is a structured summary.
|
||||
/// authoritative flow set, the terminator is a structured summary.
|
||||
///
|
||||
/// Emitted only for switch-like dispatch whose semantics are
|
||||
/// guaranteed-exclusive across cases (e.g. Go `switch`, Java
|
||||
|
|
@ -285,11 +275,11 @@ pub enum Terminator {
|
|||
///
|
||||
/// `Some(c)` records the constant value the scrutinee must equal for
|
||||
/// the corresponding target to be taken. `None` means the literal is
|
||||
/// unknown — emitted for synthetic ≥3-way CFG fanouts or for case
|
||||
/// unknown, emitted for synthetic ≥3-way CFG fanouts or for case
|
||||
/// patterns that aren't plain literals (OR-patterns, ranges, guards).
|
||||
///
|
||||
/// When omitted/empty (length zero), all targets behave as "unknown
|
||||
/// literal" — preserves backward compatibility with consumers that
|
||||
/// literal"; this preserves backward compatibility with consumers that
|
||||
/// only inspect `targets`/`default`.
|
||||
#[serde(default)]
|
||||
case_values: SmallVec<[Option<ConstValue>; 4]>,
|
||||
|
|
@ -342,19 +332,17 @@ pub struct SsaBody {
|
|||
pub exception_edges: Vec<(BlockId, BlockId)>,
|
||||
/// Per-body interner for [`SsaOp::FieldProj`] field names.
|
||||
///
|
||||
/// Empty until the lowering phase emits FieldProj ops (Phase 2 of the
|
||||
/// field-projections rollout). Cross-body callers (cross-file
|
||||
/// summaries, debug serialization) MUST resolve interned ids through
|
||||
/// this interner before transporting field references to other bodies.
|
||||
/// Empty until lowering emits FieldProj ops. Cross-body callers
|
||||
/// (cross-file summaries, debug serialization) MUST resolve interned
|
||||
/// ids through this interner before transporting field references
|
||||
/// to other bodies.
|
||||
#[serde(default)]
|
||||
pub field_interner: FieldInterner,
|
||||
/// Pointer-Phase 3 / W1: side-table mapping a synthetic base-update
|
||||
/// [`SsaOp::Assign`]'s defined value back to the `(receiver, field)`
|
||||
/// pair it represents. Populated by SSA lowering at the
|
||||
/// `obj.f = rhs` synthesis point so the taint engine can recognise
|
||||
/// the synthetic assign as a structural field WRITE — the assigned
|
||||
/// value is the new "obj" value, the use is the rhs, and the side-
|
||||
/// table records `(prior_obj_value, FieldId("f"))`.
|
||||
/// Side-table mapping a synthetic base-update [`SsaOp::Assign`]'s
|
||||
/// defined value back to the `(receiver, field)` pair it
|
||||
/// represents. Populated by lowering at the `obj.f = rhs` synthesis
|
||||
/// point so the taint engine can treat the synthetic assign as a
|
||||
/// structural field WRITE.
|
||||
///
|
||||
/// Empty by default; only synthetic assigns whose enclosing source
|
||||
/// statement was a dotted-path assignment (`a.b.c = …`) appear here.
|
||||
|
|
@ -505,10 +493,10 @@ mod tests {
|
|||
assert_eq!(uses, vec![SsaValue(1)]);
|
||||
}
|
||||
|
||||
/// Pointer-Phase 4 / A6 audit: the [`FieldId::ELEM`] sentinel is
|
||||
/// The [`FieldId::ELEM`] sentinel is
|
||||
/// reserved for "any element of a container". The interner assigns
|
||||
/// IDs monotonically from `0`, so the sentinel `u32::MAX` can only
|
||||
/// collide if the body declares ~4 billion fields — a corner case
|
||||
/// collide if the body declares ~4 billion fields, a corner case
|
||||
/// no realistic codebase reaches. Pin the contract with a stress
|
||||
/// loop so future implementation drift can't silently shift IDs to
|
||||
/// the sentinel value.
|
||||
|
|
@ -526,7 +514,7 @@ mod tests {
|
|||
// Lookup of the sentinel name (used by W3 to round-trip
|
||||
// container-element flow through summary) must NOT match a
|
||||
// real interned name even when the same name is interned.
|
||||
// The wire-format keeps `<elem>` as a *string marker* — it
|
||||
// The wire-format keeps `<elem>` as a *string marker*, it
|
||||
// never goes through `intern`. Instead, callers compare
|
||||
// explicitly against `FieldId::ELEM`.
|
||||
assert_ne!(interner.intern("<elem>"), FieldId::ELEM);
|
||||
|
|
|
|||
165
src/ssa/lower.rs
165
src/ssa/lower.rs
|
|
@ -29,16 +29,16 @@ use super::ir::*;
|
|||
/// - Construct the `Call` op with `callee = bare_method_name`,
|
||||
/// `callee_text = Some(original_callee.to_string())`,
|
||||
/// `receiver = Some(final_receiver_value)`.
|
||||
/// - Use the returned receiver as the implicit method receiver — do NOT
|
||||
/// - Use the returned receiver as the implicit method receiver, do NOT
|
||||
/// add the chain root or any intermediate field name to `args`.
|
||||
///
|
||||
/// **Decomposition rules** (Phase 2 of the field-projections rollout):
|
||||
/// **Decomposition rules**:
|
||||
/// - Skip when the callee contains zero `.` characters (no member access)
|
||||
/// or only one `.` (single-dot case is handled by the existing
|
||||
/// `info.call.receiver` channel without needing a `FieldProj` op).
|
||||
/// - Bail when any "complex" token appears in the callee — `(`, `)`,
|
||||
/// - Bail when any "complex" token appears in the callee, `(`, `)`,
|
||||
/// `[`, `]`, `::`, `->`, `?`, `<`, `>`, `*`, `&`, `:` (other than `::`
|
||||
/// already filtered), or whitespace — signaling the callee text isn't
|
||||
/// already filtered), or whitespace, signaling the callee text isn't
|
||||
/// a clean `<ident>.<ident>...` chain we can safely split on `.`.
|
||||
/// - The first segment must be a known SSA variable in `var_stacks`;
|
||||
/// otherwise the chain root is unresolvable and we bail.
|
||||
|
|
@ -221,7 +221,7 @@ fn lower_to_ssa_inner(
|
|||
// 4b. For per-function scope: identify external variables (used but not defined)
|
||||
// and inject synthetic Param defs at entry block so rename can find them.
|
||||
// When formal_params is supplied, reorder so formal params come first in
|
||||
// declaration order — this makes Param indices correspond to call-site positions.
|
||||
// declaration order, this makes Param indices correspond to call-site positions.
|
||||
//
|
||||
let external_vars = if scope.is_some() && !scope_all && !scope_nop {
|
||||
let raw = identify_external_uses(cfg, &blocks_nodes, &var_defs);
|
||||
|
|
@ -277,7 +277,7 @@ fn lower_to_ssa_inner(
|
|||
}
|
||||
|
||||
// 7b. Debug assertions: verify structural invariants.
|
||||
// The helper body is `debug_assert!` only, so it's a no-op in release —
|
||||
// The helper body is `debug_assert!` only, so it's a no-op in release;
|
||||
// call unconditionally to avoid a dead_code warning when the lib is
|
||||
// built without `--tests`.
|
||||
debug_assert_bfs_ordering(&block_preds);
|
||||
|
|
@ -451,10 +451,10 @@ fn collect_reachable(
|
|||
/// Form basic blocks from filtered CFG nodes.
|
||||
///
|
||||
/// Returns:
|
||||
/// - blocks_nodes: Vec<Vec<NodeIndex>> — nodes per block (in order)
|
||||
/// - block_of_node: HashMap<NodeIndex, usize> — node → block index
|
||||
/// - block_succs: Vec<Vec<usize>> — successors per block
|
||||
/// - block_preds: Vec<Vec<usize>> — predecessors per block
|
||||
/// - blocks_nodes: Vec<Vec<NodeIndex>>, nodes per block (in order)
|
||||
/// - block_of_node: HashMap<NodeIndex, usize>, node → block index
|
||||
/// - block_succs: Vec<Vec<usize>>, successors per block
|
||||
/// - block_preds: Vec<Vec<usize>>, predecessors per block
|
||||
fn form_blocks(
|
||||
cfg: &Cfg,
|
||||
entry: NodeIndex,
|
||||
|
|
@ -537,7 +537,7 @@ fn form_blocks(
|
|||
// Discover leaders in BFS order over `cfg`, but skip edges whose
|
||||
// source is a terminating (Return / Throw) node. Walking the raw
|
||||
// `cfg` directly here would re-introduce the bookkeeping
|
||||
// Return/Throw → fn_exit edges we just stripped — fn_exit (or any
|
||||
// Return/Throw → fn_exit edges we just stripped, fn_exit (or any
|
||||
// post-return join) would be discovered through them and assigned a
|
||||
// block ID before its true block-level predecessors, breaking the
|
||||
// BFS-forward-pred invariant (`debug_assert_bfs_ordering`).
|
||||
|
|
@ -546,7 +546,7 @@ fn form_blocks(
|
|||
// exception edges entirely (collect_reachable strips them and records
|
||||
// them separately in `exception_edges`). Catch-block nodes are still
|
||||
// in `reachable` and must be discoverable as leaders via the
|
||||
// try-body → catch path — only the terminating-source bookkeeping
|
||||
// try-body → catch path, only the terminating-source bookkeeping
|
||||
// edges are bogus.
|
||||
{
|
||||
let mut bfs_queue: VecDeque<NodeIndex> = VecDeque::new();
|
||||
|
|
@ -572,7 +572,7 @@ fn form_blocks(
|
|||
// Belt-and-braces: any leader still unvisited gets appended in
|
||||
// CFG-node-index order so block-ID assignment remains
|
||||
// deterministic. We do NOT include the synthetic function-exit
|
||||
// node when it is unreachable through filtered edges — that
|
||||
// node when it is unreachable through filtered edges, that
|
||||
// happens whenever every path in the body terminates explicitly
|
||||
// (e.g. a function whose only return is `return buf.toString()`
|
||||
// at the tail). Including it would emit an orphan SSA block
|
||||
|
|
@ -760,19 +760,19 @@ pub(crate) fn is_receiver_name(name: &str) -> bool {
|
|||
/// on to emit one [`SsaOp::SelfParam`] (for the leading receiver slot, when
|
||||
/// present) followed by a contiguous run of [`SsaOp::Param { index }`] values
|
||||
/// whose indices 0..N correspond exactly to positional call-site argument
|
||||
/// positions — no receiver offset required anywhere downstream.
|
||||
/// positions, no receiver offset required anywhere downstream.
|
||||
///
|
||||
/// W1.b: every formal parameter gets a Param op even when the body never
|
||||
/// references it directly. Without this, the *first* `obj.f = rhs` on a
|
||||
/// formal `obj` whose body never reads `obj` produces no W1
|
||||
/// `field_writes` entry — `var_stacks["obj"]` is empty when the synth
|
||||
/// `field_writes` entry, `var_stacks["obj"]` is empty when the synth
|
||||
/// Assign runs because no external-use path interned `obj`. Subsequent
|
||||
/// writes work because the synth Assign itself defines `obj`, so the
|
||||
/// gap is exactly the FIRST write. Always emitting a formal Param at
|
||||
/// block 0 closes that gap.
|
||||
fn reorder_external_vars(external: Vec<String>, formal_params: &[String]) -> Vec<String> {
|
||||
if formal_params.is_empty() {
|
||||
return external; // no reordering — preserve existing alphabetical sort
|
||||
return external; // no reordering, preserve existing alphabetical sort
|
||||
}
|
||||
let ext_set: HashSet<&str> = external.iter().map(|s| s.as_str()).collect();
|
||||
let formal_set: HashSet<&str> = formal_params.iter().map(|s| s.as_str()).collect();
|
||||
|
|
@ -789,7 +789,7 @@ fn reorder_external_vars(external: Vec<String>, formal_params: &[String]) -> Vec
|
|||
}
|
||||
// Formal positional params next (declaration order), skipping any
|
||||
// receiver that was already emitted above. W1.b: include EVERY
|
||||
// formal regardless of whether the body uses it externally — an
|
||||
// formal regardless of whether the body uses it externally, an
|
||||
// unused formal that gets field-written via `obj.cache = rhs` still
|
||||
// needs a Param op so the synth Assign loop sees its prior reaching
|
||||
// def in `var_stacks`.
|
||||
|
|
@ -865,7 +865,7 @@ fn collect_var_defs(
|
|||
/// Returns a `BTreeSet<String>` per block so downstream consumers that iterate
|
||||
/// the set (notably `rename_variables`) observe a deterministic, alphabetical
|
||||
/// order regardless of the underlying hasher state. The Cytron algorithm
|
||||
/// itself is order-independent — only its observers are.
|
||||
/// itself is order-independent, only its observers are.
|
||||
fn insert_phis(
|
||||
var_defs: &BTreeMap<String, HashSet<usize>>,
|
||||
dom_frontiers: &[HashSet<usize>],
|
||||
|
|
@ -882,7 +882,7 @@ fn insert_phis(
|
|||
for &f in &dom_frontiers[b] {
|
||||
if has_phi.insert(f) {
|
||||
phi_placements[f].insert(var.clone());
|
||||
// Phi is a new definition — add to worklist
|
||||
// Phi is a new definition, add to worklist
|
||||
if !def_blocks.contains(&f) {
|
||||
worklist.push_back(f);
|
||||
}
|
||||
|
|
@ -945,7 +945,7 @@ fn rename_variables(
|
|||
// empty otherwise so existing per-statement Call lowering is
|
||||
// bit-for-bit unchanged.
|
||||
let mut field_interner = crate::ssa::ir::FieldInterner::new();
|
||||
// Pointer-Phase 3 / W1: side-table mapping each synthetic base-update
|
||||
// Side-table mapping each synthetic base-update
|
||||
// [`SsaOp::Assign`]'s defined value to its `(receiver, field)` pair.
|
||||
// Populated below at the synthetic-Assign emission site. Read by
|
||||
// the taint engine to lift the assign into a structural field WRITE.
|
||||
|
|
@ -968,7 +968,7 @@ fn rename_variables(
|
|||
|
||||
// `BTreeMap` guarantees a deterministic (alphabetical) iteration order when
|
||||
// pushing phi values onto `var_stacks` and when filling operands on
|
||||
// successor phis — both sites are observable in SSA numbering if they
|
||||
// successor phis, both sites are observable in SSA numbering if they
|
||||
// reordered between runs.
|
||||
let mut phi_values: Vec<BTreeMap<String, SsaValue>> = vec![BTreeMap::new(); num_blocks];
|
||||
|
||||
|
|
@ -1118,14 +1118,14 @@ fn rename_variables(
|
|||
.any(|l| matches!(l, crate::labels::DataLabel::Source(_)))
|
||||
&& info.call.callee.is_none()
|
||||
{
|
||||
// Pure source (e.g. $_GET, env var) — no callee, so no args to track.
|
||||
// Pure source (e.g. $_GET, env var), no callee, so no args to track.
|
||||
// Source-labeled calls (e.g. file_get_contents) fall through to Call
|
||||
// so argument taint and sink detection still work.
|
||||
SsaOp::Source
|
||||
} else if info.call.callee.is_some() {
|
||||
let callee = info.call.callee.as_deref().unwrap_or("").to_string();
|
||||
let (mut args, mut receiver) = build_call_args(info, var_stacks);
|
||||
// Phase 2: try decomposing chained-receiver method calls
|
||||
// Try decomposing chained-receiver method calls
|
||||
// (`a.b.c()`) into a FieldProj chain plus a bare-method Call
|
||||
// so downstream consumers can read the receiver structure
|
||||
// without re-parsing the callee text. Bails to None on any
|
||||
|
|
@ -1145,7 +1145,7 @@ fn rename_variables(
|
|||
Some((recv_v, bare_method)) => {
|
||||
receiver = Some(recv_v);
|
||||
// Strip any positional arg group that exactly matches the
|
||||
// chain root identifier — it has been replaced by the
|
||||
// chain root identifier, it has been replaced by the
|
||||
// FieldProj chain receiver, and re-listing it as an
|
||||
// argument would inflate arity / double-taint.
|
||||
if let Some(base_ident) = callee.split('.').next() {
|
||||
|
|
@ -1175,7 +1175,7 @@ fn rename_variables(
|
|||
// Reassignment kill: a node that defines a variable but has no
|
||||
// uses (operands) and is not a source is a constant/literal
|
||||
// assignment. SSA rename allocates a fresh SsaValue, so
|
||||
// downstream references see this new (untainted) value — the
|
||||
// downstream references see this new (untainted) value, the
|
||||
// prior tainted definition is implicitly dead.
|
||||
SsaOp::Const(info.taint.const_text.clone())
|
||||
} else if info.taint.defines.is_some() {
|
||||
|
|
@ -1217,12 +1217,12 @@ fn rename_variables(
|
|||
// `Assign(uses)` so the SSA carries an explicit pass-through
|
||||
// for the returned/thrown value. Without this, the Return
|
||||
// node was lowered as a `Nop` and the terminator-setup
|
||||
// "last non-Nop body inst" search returned None — producing
|
||||
// "last non-Nop body inst" search returned None, producing
|
||||
// `Terminator::Return(None)` for a function that visibly
|
||||
// returns an identifier. That broke per-return-path
|
||||
// PathFact narrowing for non-Rust languages where the
|
||||
// returned identifier wasn't computed in the same block
|
||||
// (e.g. Python `def f(s): return s` — `s` is a Param in
|
||||
// (e.g. Python `def f(s): return s`, `s` is a Param in
|
||||
// block 0, the Return block itself has no body insts).
|
||||
let uses: SmallVec<[SsaValue; 4]> = info
|
||||
.taint
|
||||
|
|
@ -1250,8 +1250,8 @@ fn rename_variables(
|
|||
} else if info.call.callee.is_some() {
|
||||
let callee = info.call.callee.as_deref().unwrap_or("").to_string();
|
||||
let (mut args, mut receiver) = build_call_args(info, var_stacks);
|
||||
// Phase 2: same FieldProj-chain decomposition as the primary
|
||||
// Call branch above — kept in sync because this fallback
|
||||
// Same FieldProj-chain decomposition as the primary
|
||||
// Call branch above, kept in sync because this fallback
|
||||
// path also constructs SSA Call ops (used for control-flow
|
||||
// wrapper calls that landed past the earlier match arms).
|
||||
let (final_callee, callee_text) = match try_lower_field_proj_chain(
|
||||
|
|
@ -1342,9 +1342,9 @@ fn rename_variables(
|
|||
// overwrites properly kill taint: if obj.data is re-assigned to a
|
||||
// constant, the base `obj` no longer carries that field's taint.
|
||||
//
|
||||
// Pointer-Phase 3 / W1: each synthetic Assign also records its
|
||||
// structural identity into `field_writes` — `(receiver_old_value,
|
||||
// FieldId(field_name))` — so the taint engine can recognise the
|
||||
// Each synthetic Assign also records its
|
||||
// structural identity into `field_writes`, `(receiver_old_value,
|
||||
// FieldId(field_name))`, so the taint engine can recognise the
|
||||
// synthetic assign as a field WRITE and mirror the rhs taint
|
||||
// into the matching `(loc, field)` cell on `SsaTaintState`.
|
||||
// The "old" parent value is the reaching def of `parent` BEFORE
|
||||
|
|
@ -1427,9 +1427,9 @@ fn rename_variables(
|
|||
|
||||
ssa_blocks[block_idx].terminator = if succs.is_empty() {
|
||||
// A block with no successors at the block level is one of:
|
||||
// (1) a block containing a Throw — terminates with an
|
||||
// (1) a block containing a Throw, terminates with an
|
||||
// exception; no normal fall-through.
|
||||
// (2) a block containing a Return — terminates with a value
|
||||
// (2) a block containing a Return, terminates with a value
|
||||
// (or void). After form_blocks strips the bookkeeping
|
||||
// Seq edge from Return → fn_exit, every explicit-return
|
||||
// block lands here, including `if cond { return X; }`
|
||||
|
|
@ -1458,7 +1458,7 @@ fn rename_variables(
|
|||
let return_info = &cfg[rn];
|
||||
// Return-value resolution. Mirror the legacy
|
||||
// `has_const_return` path so callers see exactly the same
|
||||
// SSA shape they did before the merged-return fix — only
|
||||
// SSA shape they did before the merged-return fix, only
|
||||
// the *terminator* changes (Goto(exit) → Return(_)), not
|
||||
// the value selection.
|
||||
//
|
||||
|
|
@ -1468,7 +1468,7 @@ fn rename_variables(
|
|||
// Emit a synthetic Const inst so taint never leaks
|
||||
// from an unrelated inst earlier in the same block
|
||||
// (regression guard: C-1 inline-return precision).
|
||||
// (b) Computed / passthrough return — last non-Nop body
|
||||
// (b) Computed / passthrough return, last non-Nop body
|
||||
// inst. Covers `return foo()` (Call sits before the
|
||||
// Return Nop), `return x + y` (Assign), and the
|
||||
// implicit tail expression collapsed into a single
|
||||
|
|
@ -1476,9 +1476,9 @@ fn rename_variables(
|
|||
// Return carries identifier uses (`return req`,
|
||||
// `return { req.session, ... }`), the SSA defs for
|
||||
// those identifiers are already on the body as
|
||||
// Param / Assign / Source insts — picking the last
|
||||
// Param / Assign / Source insts, picking the last
|
||||
// one matches pre-fix behaviour exactly.
|
||||
// (c) Void / unresolved — `Return(None)`.
|
||||
// (c) Void / unresolved, `Return(None)`.
|
||||
if return_info.taint.uses.is_empty() {
|
||||
let const_text = return_info.taint.const_text.clone();
|
||||
let const_v = SsaValue(*next_value);
|
||||
|
|
@ -1507,7 +1507,7 @@ fn rename_variables(
|
|||
Terminator::Return(from_body)
|
||||
}
|
||||
} else {
|
||||
// (3) fn_exit / true fall-off — no Return CFG node in this
|
||||
// (3) fn_exit / true fall-off, no Return CFG node in this
|
||||
// block. Use the last non-Nop body instruction as the
|
||||
// implicit return value (e.g. the function's tail-position
|
||||
// expression in Rust).
|
||||
|
|
@ -1575,7 +1575,7 @@ fn rename_variables(
|
|||
condition,
|
||||
}
|
||||
} else {
|
||||
// More than 2 successors — model as a multi-way Switch.
|
||||
// More than 2 successors, model as a multi-way Switch.
|
||||
//
|
||||
// This replaces the previous `Goto(first)` collapse: the
|
||||
// structured terminator now enumerates every target instead
|
||||
|
|
@ -1594,7 +1594,7 @@ fn rename_variables(
|
|||
//
|
||||
// Scrutinee: use the primary SSA value defined at the last
|
||||
// node in this block when one exists; fall back to
|
||||
// `SsaValue(0)` (a valid index — SSA numbering is 1-based
|
||||
// `SsaValue(0)` (a valid index, SSA numbering is 1-based
|
||||
// only conceptually, and value 0 is always present in a
|
||||
// non-empty body) when no value is defined. Downstream
|
||||
// consumers that care about the scrutinee (abstract interp,
|
||||
|
|
@ -1604,7 +1604,7 @@ fn rename_variables(
|
|||
let targets: SmallVec<[BlockId; 4]> =
|
||||
succs.iter().skip(1).map(|&s| BlockId(s as u32)).collect();
|
||||
let default = BlockId(succs[0] as u32);
|
||||
// Synthetic ≥3-way fanouts have no per-case literal metadata —
|
||||
// Synthetic ≥3-way fanouts have no per-case literal metadata;
|
||||
// every entry is None (unknown), so the executor falls back to
|
||||
// first-reachable behavior on this terminator.
|
||||
let case_values: SmallVec<[Option<crate::constraint::domain::ConstValue>; 4]> =
|
||||
|
|
@ -1815,7 +1815,7 @@ fn debug_assert_bfs_ordering(block_preds: &[Vec<usize>]) {
|
|||
/// predecessor of the block.
|
||||
///
|
||||
/// Runs in release builds because phi-operand mismatches are
|
||||
/// load-bearing for soundness — downstream taint, const, and abstract
|
||||
/// load-bearing for soundness, downstream taint, const, and abstract
|
||||
/// analyses iterate phi operands by `(pred_blk, value)` pairs, and
|
||||
/// either a missing operand (silent "no contribution" on that edge)
|
||||
/// or a phantom operand (garbage into the join) corrupts analysis
|
||||
|
|
@ -1824,7 +1824,7 @@ fn debug_assert_bfs_ordering(block_preds: &[Vec<usize>]) {
|
|||
/// The invariant is strict equality. Predecessors that carry no
|
||||
/// reaching definition for the phi's variable are filled with the
|
||||
/// [`SsaOp::Undef`] sentinel in `fill_undef_phi_operands`, rather than
|
||||
/// being dropped — so consumers that look up by `(pred_blk, value)`
|
||||
/// being dropped, so consumers that look up by `(pred_blk, value)`
|
||||
/// see a real operand for every control-flow edge.
|
||||
fn assert_phi_operand_counts(ssa_blocks: &[SsaBlock], block_preds: &[Vec<usize>]) {
|
||||
use std::collections::HashSet;
|
||||
|
|
@ -1887,7 +1887,7 @@ fn assert_phi_operand_counts(ssa_blocks: &[SsaBlock], block_preds: &[Vec<usize>]
|
|||
/// single shared sentinel instruction ([`SsaOp::Undef`]) synthesized
|
||||
/// at the end of block 0's body. Consumers iterate phi operands by
|
||||
/// `(pred_blk, value)` and therefore see a real operand on every
|
||||
/// control-flow edge — no implicit "missing = empty" semantics.
|
||||
/// control-flow edge, no implicit "missing = empty" semantics.
|
||||
///
|
||||
/// The Undef instruction is created lazily (only when at least one phi
|
||||
/// has a gap) so functions with fully-dominating definitions pay zero
|
||||
|
|
@ -1931,7 +1931,7 @@ fn fill_undef_phi_operands(
|
|||
block: BlockId(0),
|
||||
});
|
||||
// Place the Undef instruction at the end of block 0's body so it
|
||||
// appears after any synthetic Param / SelfParam emissions — its
|
||||
// appears after any synthetic Param / SelfParam emissions, its
|
||||
// only role is to anchor the SsaValue; ordering relative to other
|
||||
// body instructions is cosmetic (no consumer depends on its
|
||||
// position, only on the value lookup).
|
||||
|
|
@ -2181,7 +2181,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn bfs_ordering_holds_for_linear_cfg() {
|
||||
// Entry → A → B → Exit — all blocks should satisfy BFS ordering
|
||||
// Entry → A → B → Exit, all blocks should satisfy BFS ordering
|
||||
let mut cfg: Cfg = Graph::new();
|
||||
let entry = cfg.add_node(make_node(StmtKind::Entry));
|
||||
let a = cfg.add_node(NodeInfo {
|
||||
|
|
@ -2409,7 +2409,7 @@ mod tests {
|
|||
/// predecessor and a normal control-flow predecessor must lower to a
|
||||
/// consistent phi. For variables defined before the try (live on
|
||||
/// *both* edges), the phi at the catch block has exactly two operands
|
||||
/// — one per predecessor — and the release assertion accepts it.
|
||||
/// (one per predecessor), and the release assertion accepts it.
|
||||
#[test]
|
||||
fn catch_block_join_phi_has_operand_per_live_predecessor() {
|
||||
// Entry → defines `x` → Try → (Seq) → Join ← (Exception via body) Catch
|
||||
|
|
@ -2456,7 +2456,7 @@ mod tests {
|
|||
cfg.add_edge(catch, join, EdgeKind::Seq);
|
||||
cfg.add_edge(join, exit, EdgeKind::Seq);
|
||||
|
||||
// Lowering must succeed — the assertion is active in release.
|
||||
// Lowering must succeed, the assertion is active in release.
|
||||
let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap();
|
||||
|
||||
// Locate the block containing a phi for `x`; it must be the join
|
||||
|
|
@ -2498,7 +2498,7 @@ mod tests {
|
|||
/// Regression guard for the Undef fill pass. When a variable is
|
||||
/// only defined on one branch of a join (e.g. a catch-only binding
|
||||
/// rejoining the normal path), the lowering must still emit one
|
||||
/// phi operand per predecessor — the missing edge becoming a
|
||||
/// phi operand per predecessor, the missing edge becoming a
|
||||
/// reference to the synthesized `SsaOp::Undef` sentinel rather
|
||||
/// than being dropped.
|
||||
#[test]
|
||||
|
|
@ -2633,7 +2633,7 @@ mod tests {
|
|||
#[should_panic(expected = "SSA phi operand count does not match predecessor count")]
|
||||
fn phi_assertion_helper_rejects_more_operands_than_preds() {
|
||||
// A phi with MORE operands than preds references a nonexistent
|
||||
// predecessor — unsound because downstream consumers either
|
||||
// predecessor, unsound because downstream consumers either
|
||||
// panic on the lookup or silently feed garbage taint into the
|
||||
// join. Strict-equality invariant catches this.
|
||||
let dummy_node = NodeIndex::new(0);
|
||||
|
|
@ -2859,7 +2859,7 @@ mod tests {
|
|||
/// to a synthetic exit block. Previously, the bookkeeping
|
||||
/// `Return → fn_exit` `Seq` edge made early-return blocks fall into
|
||||
/// the single-successor `Goto` arm, and the fall-through tail
|
||||
/// expression's body got merged into the shared exit block — every
|
||||
/// expression's body got merged into the shared exit block, every
|
||||
/// early-return path therefore appeared to also execute the tail.
|
||||
/// Mirrors the `if cond { return X; } Y` shape that motivated the fix.
|
||||
#[test]
|
||||
|
|
@ -2876,7 +2876,7 @@ mod tests {
|
|||
});
|
||||
// True branch: return constant. uses=[] + const_text=Some triggers
|
||||
// the literal-return path, ensuring the block emits a synthetic
|
||||
// Const + Return(Some(_)) — the same shape `return None` /
|
||||
// Const + Return(Some(_)), the same shape `return None` /
|
||||
// `return String::new()` produces in real Rust code.
|
||||
let early_ret = cfg.add_node(NodeInfo {
|
||||
taint: TaintMeta {
|
||||
|
|
@ -2901,7 +2901,7 @@ mod tests {
|
|||
cfg.add_edge(if_node, early_ret, EdgeKind::True);
|
||||
cfg.add_edge(if_node, tail, EdgeKind::False);
|
||||
// Bookkeeping wire-up the real CFG construction performs in
|
||||
// `build_cfg` — Return / Throw → fn_exit via Seq — so the SSA
|
||||
// `build_cfg`, Return / Throw → fn_exit via Seq, so the SSA
|
||||
// lowering has to handle it.
|
||||
cfg.add_edge(early_ret, exit, EdgeKind::Seq);
|
||||
cfg.add_edge(tail, exit, EdgeKind::Seq);
|
||||
|
|
@ -2909,7 +2909,7 @@ mod tests {
|
|||
let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap();
|
||||
|
||||
// Locate the block containing the early-return CFG node and
|
||||
// assert it terminates with Return — not Goto(_) into the
|
||||
// assert it terminates with Return, not Goto(_) into the
|
||||
// shared exit block.
|
||||
let early_block = ssa
|
||||
.blocks
|
||||
|
|
@ -2936,7 +2936,7 @@ mod tests {
|
|||
// The fall-through (tail) block must NOT have the early-return
|
||||
// block as a predecessor. Pre-fix, both the early-return path
|
||||
// and the tail path merged into the shared fn_exit block, so the
|
||||
// tail's body was reachable from the early-return path — that's
|
||||
// tail's body was reachable from the early-return path, that's
|
||||
// the merged-return defect.
|
||||
let tail_block = ssa
|
||||
.blocks
|
||||
|
|
@ -2963,7 +2963,7 @@ mod tests {
|
|||
/// `if a || b || c { return X; } Y` must have its rejection body emit a
|
||||
/// `Terminator::Return(_)` and have `succs.is_empty()`. Pre-fix the
|
||||
/// rejection body's String::new() Call shared a block whose only
|
||||
/// successor was the merged tail — losing the early-return semantics
|
||||
/// successor was the merged tail, losing the early-return semantics
|
||||
/// entirely and diluting per-return-path PathFact narrowing.
|
||||
#[test]
|
||||
fn or_chain_rejection_block_terminates_with_return() {
|
||||
|
|
@ -3093,7 +3093,7 @@ mod tests {
|
|||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────
|
||||
// Phase 2: FieldProj chain lowering tests
|
||||
// FieldProj chain lowering tests
|
||||
// ─────────────────────────────────────────────────────────────────
|
||||
//
|
||||
// These tests pin the contract that `try_lower_field_proj_chain`
|
||||
|
|
@ -3426,7 +3426,7 @@ mod tests {
|
|||
assert!(blocks[0].body.is_empty());
|
||||
}
|
||||
|
||||
// ── End-to-end Phase 2 tests via real tree-sitter parsing ──────────
|
||||
// ── End-to-end SSA decomposition tests via real tree-sitter parsing ──────────
|
||||
//
|
||||
// These exercise the integration between CFG construction (which sets
|
||||
// `info.call.callee = "c.mu.Lock"`) and SSA lowering. We assert that
|
||||
|
|
@ -3451,7 +3451,7 @@ mod tests {
|
|||
};
|
||||
// Mirror the production lowering path: function bodies use
|
||||
// lower_to_ssa_with_params so formal parameters get synthetic
|
||||
// Param/SelfParam injections at block 0 — without them, the
|
||||
// Param/SelfParam injections at block 0, without them, the
|
||||
// FieldProj chain helper has no SSA root to anchor to.
|
||||
if body.meta.name.is_some() {
|
||||
let func_name = body.meta.name.clone().unwrap_or_default();
|
||||
|
|
@ -3506,7 +3506,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn phase2_e2e_go_chained_receiver_emits_field_proj() {
|
||||
// Go: `c.writer.header.set(k, v)` — 3-segment receiver, 2 FieldProjs.
|
||||
// Go: `c.writer.header.set(k, v)`, 3-segment receiver, 2 FieldProjs.
|
||||
// Chain root `c` is a function parameter so it is resolvable.
|
||||
let src = b"package p\nfunc f(c *T, k string, v string) { c.writer.header.set(k, v) }\n";
|
||||
let body = parse_to_first_body(
|
||||
|
|
@ -3549,7 +3549,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn phase2_e2e_python_chained_receiver_emits_field_proj() {
|
||||
// Python: `obj.client.session.send(p)` — 3-segment receiver.
|
||||
// Python: `obj.client.session.send(p)`, 3-segment receiver.
|
||||
let src = b"def f(obj, p):\n obj.client.session.send(p)\n";
|
||||
let body = parse_to_first_body(
|
||||
src,
|
||||
|
|
@ -3574,7 +3574,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn phase2_e2e_javascript_chained_receiver_emits_field_proj() {
|
||||
// JS: `obj.foo.bar.baz()` — 3-segment receiver.
|
||||
// JS: `obj.foo.bar.baz()`, 3-segment receiver.
|
||||
let src = b"function f(obj) { obj.foo.bar.baz(); }";
|
||||
let body = parse_to_first_body(
|
||||
src,
|
||||
|
|
@ -3592,10 +3592,10 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn phase2_e2e_java_chained_receiver_emits_field_proj() {
|
||||
// Java: `obj.config.handler.run()` — 3-segment receiver chain through
|
||||
// Java: `obj.config.handler.run()`, 3-segment receiver chain through
|
||||
// a parameter `obj`. We avoid `this.…` because `this` is a Java
|
||||
// keyword (not an identifier_node) so it isn't extracted as an
|
||||
// external use — outside Phase 2's scope.
|
||||
// external use, outside SSA decomposition's scope.
|
||||
let src = b"class C { void f(Object obj) { obj.config.handler.run(); } }";
|
||||
let body = parse_to_first_body(
|
||||
src,
|
||||
|
|
@ -3620,7 +3620,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn phase2_e2e_simple_receiver_no_field_proj() {
|
||||
// REGRESSION: `obj.foo()` — single-dot receiver. Phase 2 must NOT
|
||||
// REGRESSION: `obj.foo()`, single-dot receiver. SSA lowering must NOT
|
||||
// decompose this into a FieldProj chain (existing receiver channel
|
||||
// already covers it). Verify the body has zero FieldProj ops and
|
||||
// the Call's callee_text stays None.
|
||||
|
|
@ -3664,7 +3664,7 @@ mod tests {
|
|||
fn phase2_e2e_global_root_chain_still_emits_field_proj() {
|
||||
// REGRESSION-NEGATIVE: when the chain root is a global identifier
|
||||
// (`Math.foo.bar()`), the lowerer's external-var synthesis makes
|
||||
// `Math` available as a synthetic Param — the chain still
|
||||
// `Math` available as a synthetic Param, the chain still
|
||||
// decomposes, treating `Math` as the SSA receiver. This is the
|
||||
// semantically correct outcome even for global-rooted chains: the
|
||||
// FieldProj op precisely captures the field-access structure.
|
||||
|
|
@ -3685,7 +3685,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn phase2_e2e_rust_method_call_through_field_emits_field_proj() {
|
||||
// Rust: `c.mu.lock()` — `c` is a function parameter, `mu` is a field,
|
||||
// Rust: `c.mu.lock()`, `c` is a function parameter, `mu` is a field,
|
||||
// `lock` is the method. Verifies we generate FieldProj for `mu`.
|
||||
// (Rust paths like `std::env::var` use `::` and are excluded by
|
||||
// the helper's complex-token check.)
|
||||
|
|
@ -3782,16 +3782,11 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
/// Pointer-Phase 3 / W1 end-to-end: lowering an `obj.f = rhs`
|
||||
/// statement populates `SsaBody.field_writes` with the synthetic
|
||||
/// base-update Assign's `(receiver, FieldId)` mapping.
|
||||
///
|
||||
/// W1.b: a SINGLE-write shape — `function f(obj) { obj.cache = 42 }`
|
||||
/// — also populates `field_writes` because every formal gets a
|
||||
/// Param op at block 0 regardless of whether it's read by the
|
||||
/// body. Pre-W1.b this required two writes (the second's prior
|
||||
/// reaching def came from the first synth Assign); now the first
|
||||
/// write already finds the formal's Param in `var_stacks`.
|
||||
/// End-to-end: lowering an `obj.f = rhs` statement populates
|
||||
/// `SsaBody.field_writes` with the synthetic base-update Assign's
|
||||
/// `(receiver, FieldId)` mapping. A single-write shape suffices:
|
||||
/// every formal gets a Param op at block 0 so the first write
|
||||
/// finds the formal in `var_stacks`.
|
||||
#[test]
|
||||
fn w1_end_to_end_field_write_records_side_table_when_parent_has_prior_def() {
|
||||
// Single write to `obj.cache`: the formal `obj` provides the
|
||||
|
|
@ -3816,7 +3811,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// W1.b: Python — single `obj.cache = 42` on a formal also
|
||||
/// W1.b: Python, single `obj.cache = 42` on a formal also
|
||||
/// populates `field_writes` thanks to the formal Param op.
|
||||
#[test]
|
||||
fn w1b_single_write_records_field_write_python() {
|
||||
|
|
@ -3835,7 +3830,7 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
/// W1.b: Rust — single `obj.cache = 42` on a method-style formal
|
||||
/// W1.b: Rust, single `obj.cache = 42` on a method-style formal
|
||||
/// (`fn f(obj: &mut O)`) also populates `field_writes`.
|
||||
#[test]
|
||||
fn w1b_single_write_records_field_write_rust() {
|
||||
|
|
@ -3880,11 +3875,11 @@ mod tests {
|
|||
// ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Loop induction variable: `x = x + 1` inside a loop is the
|
||||
/// canonical SSA challenge — the body uses `x` then redefines it,
|
||||
/// canonical SSA challenge, the body uses `x` then redefines it,
|
||||
/// and the join with the entry definition must produce a phi that
|
||||
/// distinguishes the entry value from the body's redefinition.
|
||||
/// Phase 5.2 (induction var pruning) depends on this shape being
|
||||
/// lowered correctly.
|
||||
/// Induction-var pruning depends on this shape being lowered
|
||||
/// correctly.
|
||||
#[test]
|
||||
fn loop_self_assignment_induction_phi_is_distinct() {
|
||||
// Entry → x=0 → Loop header → [Body: use x; x = x_new] → Loop
|
||||
|
|
@ -4101,7 +4096,7 @@ mod tests {
|
|||
|
||||
/// Variable defined ONLY in one branch of a conditional must be
|
||||
/// undef on the other path. The phi at the join should include an
|
||||
/// undef sentinel for the missing arm — guards against the
|
||||
/// undef sentinel for the missing arm, guards against the
|
||||
/// renamer silently dropping the missing operand.
|
||||
#[test]
|
||||
fn conditional_define_only_one_arm_phi_has_undef_operand() {
|
||||
|
|
@ -4137,7 +4132,7 @@ mod tests {
|
|||
|
||||
// Find a phi for x and verify it has 2 operands. The "undef"
|
||||
// operand can manifest as a Nop-defined SsaValue or a sentinel
|
||||
// — both are acceptable; the invariant is that arity == preds.
|
||||
// (both are acceptable); the invariant is that arity == preds.
|
||||
let x_phi_ops = ssa
|
||||
.blocks
|
||||
.iter()
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
#[allow(dead_code)] // IR types — fields used by Display impl, tests, and downstream analyses
|
||||
#[allow(dead_code)] // IR types, fields used by Display impl, tests, and downstream analyses
|
||||
pub mod alias;
|
||||
pub mod const_prop;
|
||||
pub mod copy_prop;
|
||||
|
|
|
|||
|
|
@ -6,13 +6,13 @@
|
|||
//! 1. **Param → Param field writes.** An `obj.field = val` where `obj`
|
||||
//! traces back to parameter `b` and `val` traces back to parameter `a`
|
||||
//! emits a `Param(a) → Param(b)` `MayAlias` edge. This captures the
|
||||
//! `mutating_helper` pattern — the callee mutates a shared heap cell
|
||||
//! `mutating_helper` pattern, the callee mutates a shared heap cell
|
||||
//! through one parameter and the caller observes the mutation through
|
||||
//! its argument for that parameter.
|
||||
//!
|
||||
//! 2. **Param → Return aliases.** `Terminator::Return(v)` where `v`
|
||||
//! traces back to a parameter emits a `Param(i) → Return` edge. This
|
||||
//! captures the `returned_alias` pattern — the callee returns its
|
||||
//! captures the `returned_alias` pattern, the callee returns its
|
||||
//! argument unchanged and the caller treats the result as aliasing the
|
||||
//! input.
|
||||
//!
|
||||
|
|
@ -25,7 +25,7 @@
|
|||
//!
|
||||
//! The analysis is **flow-insensitive** and **bounded**: it does not
|
||||
//! reason about path feasibility, and it stops adding edges once the
|
||||
//! summary's [`MAX_ALIAS_EDGES`] cap is reached — the overflow flag is
|
||||
//! summary's [`MAX_ALIAS_EDGES`] cap is reached, the overflow flag is
|
||||
//! the conservative fallback that callers honour.
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
|
@ -39,7 +39,7 @@ use super::ir::{SsaBody, SsaOp, SsaValue, Terminator};
|
|||
|
||||
/// Map an SSA value back to its defining instruction's op.
|
||||
///
|
||||
/// Local to this module — the taint engine has its own `build_inst_map`
|
||||
/// Local to this module, the taint engine has its own `build_inst_map`
|
||||
/// that also carries receiver info we do not need, and duplicating it
|
||||
/// keeps this analysis independent of that private helper's shape.
|
||||
fn build_op_map(ssa: &SsaBody) -> HashMap<SsaValue, SsaOp> {
|
||||
|
|
@ -73,7 +73,7 @@ struct ParamHit {
|
|||
/// The `SsaOp::Param` index as lowered.
|
||||
ssa_index: usize,
|
||||
/// The parameter's variable name (from [`SsaInst::var_name`]). Used
|
||||
/// to map back to the formal-declaration position — the caller's
|
||||
/// to map back to the formal-declaration position, the caller's
|
||||
/// `args[i]` slot is keyed by declaration position, not by SSA
|
||||
/// index, and the two can disagree when a formal parameter is
|
||||
/// skipped from SSA lowering (e.g., pure-output params).
|
||||
|
|
@ -83,7 +83,7 @@ struct ParamHit {
|
|||
/// Walk Assign/Phi chains to find a backing `Param { index }` SSA op.
|
||||
///
|
||||
/// Returns the `SsaOp::Param`'s index *and* its var_name so callers can
|
||||
/// resolve the formal-positional index via the name lookup table — the
|
||||
/// resolve the formal-positional index via the name lookup table, the
|
||||
/// two indices can disagree when SSA lowering skips a formal parameter
|
||||
/// (never used as a read), shifting subsequent param indices down.
|
||||
fn trace_to_param_hit(
|
||||
|
|
@ -144,7 +144,7 @@ fn param_hit_to_formal_index(hit: &ParamHit, params_by_name: &HashMap<String, us
|
|||
/// * `"obj.list[2].name"` → `"obj"`
|
||||
///
|
||||
/// Used to decide whether a field-style Assign's LHS base names a
|
||||
/// parameter variable — we strip everything after the first separator
|
||||
/// parameter variable, we strip everything after the first separator
|
||||
/// and compare the remainder to the recorded param names.
|
||||
fn base_of_path(name: &str) -> &str {
|
||||
let dot = name.find('.');
|
||||
|
|
@ -170,7 +170,7 @@ fn is_receiver_name_local(name: &str) -> bool {
|
|||
/// Returns `true` the first time a qualifying allocation is found.
|
||||
/// Parameter-terminated paths, `Call` ops that are not container
|
||||
/// constructors, and constants that are not container literals all
|
||||
/// return `false` — soundly under-approximating, since the caller will
|
||||
/// return `false`, soundly under-approximating, since the caller will
|
||||
/// simply fall back to the existing `Param(i) → Return` / store-into-
|
||||
/// heap channels when the flag is absent.
|
||||
fn trace_to_fresh_alloc(
|
||||
|
|
@ -225,7 +225,7 @@ fn returns_fresh_allocation(
|
|||
///
|
||||
/// `param_info` carries one `(param_index, param_name, param_ssa_value)`
|
||||
/// tuple per formal parameter that was emitted as [`SsaOp::Param`] in the
|
||||
/// lowered body. The receiver is intentionally excluded — this table
|
||||
/// lowered body. The receiver is intentionally excluded, this table
|
||||
/// captures positional parameters only.
|
||||
///
|
||||
/// `formal_param_names`, when supplied, is the authoritative list of
|
||||
|
|
@ -261,7 +261,7 @@ pub fn analyse_param_points_to(
|
|||
// container constructor for `lang` (`ArrayList`, `dict`, …).
|
||||
//
|
||||
// When at least one return path matches, the callee produces a
|
||||
// caller-visible fresh heap identity on that path — callers
|
||||
// caller-visible fresh heap identity on that path, callers
|
||||
// synthesise a `HeapObjectId` keyed on the call result so later
|
||||
// container operations have a stable heap cell. Traces that reach a
|
||||
// parameter are handled by the edge-based `Param(i) → Return` channel
|
||||
|
|
@ -278,7 +278,7 @@ pub fn analyse_param_points_to(
|
|||
return summary;
|
||||
}
|
||||
// Build the name→positional-index map. Summary param indices are
|
||||
// *positional* — they match the call-site `args[i]` position, which
|
||||
// *positional*, they match the call-site `args[i]` position, which
|
||||
// excludes the receiver (`self`/`this`). When `formal_param_names`
|
||||
// contains a leading receiver, skip it so the remaining names align
|
||||
// with the SSA `SsaOp::Param { index }` convention.
|
||||
|
|
@ -344,7 +344,7 @@ pub fn analyse_param_points_to(
|
|||
continue;
|
||||
}
|
||||
if src_idx == target_idx {
|
||||
// Self-alias is uninformative — the caller's
|
||||
// Self-alias is uninformative, the caller's
|
||||
// arg-to-itself propagation is already covered by
|
||||
// `param_to_return`/`param_to_sink`.
|
||||
continue;
|
||||
|
|
@ -532,7 +532,7 @@ mod tests {
|
|||
(5usize, "capture".to_string(), SsaValue(0)),
|
||||
(1usize, "b".to_string(), SsaValue(1)),
|
||||
];
|
||||
// formal_param_count = 2 — index 5 is out of range.
|
||||
// formal_param_count = 2, index 5 is out of range.
|
||||
let s = analyse_param_points_to(&body, &pinfo, 2, None, None);
|
||||
assert!(
|
||||
s.is_empty(),
|
||||
|
|
@ -570,7 +570,7 @@ mod tests {
|
|||
.map(|i| (i, format!("p{i}"), SsaValue(i as u32)))
|
||||
.collect();
|
||||
// Only the first traced param is emitted (trace_to_param short-
|
||||
// circuits on first match), so overflow is not expected — we
|
||||
// circuits on first match), so overflow is not expected, we
|
||||
// instead verify the bounded behaviour: a single edge.
|
||||
let s = analyse_param_points_to(&body, &pinfo, n as usize, None, None);
|
||||
assert!(!s.overflow);
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ use smallvec::SmallVec;
|
|||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub enum ContainerOp {
|
||||
/// Taint flows from the listed argument positions into the receiver
|
||||
/// container (e.g. `arr.push(val)` — val taint merges into arr).
|
||||
/// container (e.g. `arr.push(val)`, val taint merges into arr).
|
||||
///
|
||||
/// `index_arg`: when `Some(pos)`, the argument at that logical position
|
||||
/// is the container index/key. If constant-propagation proves it a
|
||||
|
|
@ -27,11 +27,11 @@ pub enum ContainerOp {
|
|||
/// Taint flows from the receiver container to the call's return value
|
||||
/// (e.g. `arr.pop()`, `items.join('')`).
|
||||
///
|
||||
/// `index_arg`: same semantics as `Store::index_arg` — when present and
|
||||
/// `index_arg`: same semantics as `Store::index_arg`, when present and
|
||||
/// provably constant, loads from `HeapSlot::Index(n)`.
|
||||
Load { index_arg: Option<usize> },
|
||||
/// Taint flows from the receiver container into the argument at
|
||||
/// `dest_arg` — i.e. the "writeback" pattern where a method writes its
|
||||
/// `dest_arg`, i.e. the "writeback" pattern where a method writes its
|
||||
/// decoded/loaded value into a caller-supplied destination rather than
|
||||
/// returning it. Used for the Go `*.Decode(&dest)` family
|
||||
/// (`json.Decoder.Decode`, `xml.Decoder.Decode`, `gob.Decoder.Decode`),
|
||||
|
|
@ -121,16 +121,16 @@ fn classify_js(method: &str) -> Option<ContainerOp> {
|
|||
match method {
|
||||
// Array store
|
||||
"push" | "unshift" => store(0),
|
||||
// Map/Set store: map.set(key, value) — key at 0, value at 1
|
||||
// Map/Set store: map.set(key, value), key at 0, value at 1
|
||||
"set" => store_indexed(1, 0),
|
||||
"add" => store(0), // set.add(value)
|
||||
// Array/Map load
|
||||
"pop" | "shift" => load(),
|
||||
"join" | "flat" | "concat" | "slice" | "toString" => load(),
|
||||
// map.get(key) — key at 0
|
||||
// map.get(key), key at 0
|
||||
"get" => load_indexed(0),
|
||||
"values" | "keys" | "entries" => load(),
|
||||
// Pointer-Phase 6 / W5: synthetic callees emitted by CFG
|
||||
// Synthetic callees emitted by CFG
|
||||
// lowering for subscript reads/writes (`arr[i]`, `arr[i] = v`).
|
||||
"__index_get__" => load_indexed(0),
|
||||
"__index_set__" => store_indexed(1, 0),
|
||||
|
|
@ -142,7 +142,7 @@ fn classify_python(method: &str) -> Option<ContainerOp> {
|
|||
match method {
|
||||
// List store
|
||||
"append" | "extend" => store(0),
|
||||
"insert" => store_indexed(1, 0), // list.insert(index, value) — index at 0, value at 1
|
||||
"insert" => store_indexed(1, 0), // list.insert(index, value), index at 0, value at 1
|
||||
// Set store
|
||||
"add" => store(0),
|
||||
// Dict store
|
||||
|
|
@ -150,10 +150,10 @@ fn classify_python(method: &str) -> Option<ContainerOp> {
|
|||
"setdefault" => store2(0, 1), // dict.setdefault(key, default)
|
||||
// List/Dict load
|
||||
"pop" => load(),
|
||||
"get" => load_indexed(0), // dict.get(key) / list index — key/index at 0
|
||||
"get" => load_indexed(0), // dict.get(key) / list index, key/index at 0
|
||||
"items" | "values" | "keys" => load(),
|
||||
"join" => load(),
|
||||
// Pointer-Phase 6 / W5: synthetic callees emitted by CFG
|
||||
// Synthetic callees emitted by CFG
|
||||
// lowering for subscript reads/writes (`arr[i]`, `arr[i] = v`).
|
||||
"__index_get__" => load_indexed(0),
|
||||
"__index_set__" => store_indexed(1, 0),
|
||||
|
|
@ -165,11 +165,11 @@ fn classify_java(method: &str) -> Option<ContainerOp> {
|
|||
match method {
|
||||
// Collection store
|
||||
"add" | "addAll" | "putAll" | "offer" | "push" => store(0),
|
||||
// ArrayList.set(index, value) — index at 0, value at 1
|
||||
// ArrayList.set(index, value), index at 0, value at 1
|
||||
"set" => store_indexed(1, 0),
|
||||
// Map.put(key, value) — key at 0, value at 1
|
||||
// Map.put(key, value), key at 0, value at 1
|
||||
"put" => store_indexed(1, 0),
|
||||
// Collection load: ArrayList.get(index) — index at 0
|
||||
// Collection load: ArrayList.get(index), index at 0
|
||||
"get" => load_indexed(0),
|
||||
"poll" | "peek" | "remove" | "pop" => load(),
|
||||
"stream" | "toArray" | "iterator" => load(),
|
||||
|
|
@ -203,7 +203,7 @@ fn classify_go(method: &str, callee: &str) -> Option<ContainerOp> {
|
|||
// method-call form has the bytes carried via the receiver, not arg 0,
|
||||
// so it lines up with the writeback contract just like `Decode`.
|
||||
"Decode" | "Unmarshal" => Some(ContainerOp::Writeback { dest_arg: 0 }),
|
||||
// Pointer-Phase 6 / W5: synthetic callees emitted by CFG
|
||||
// Synthetic callees emitted by CFG
|
||||
// lowering for Go index_expression reads/writes (`arr[i]`,
|
||||
// `m[k] = v`).
|
||||
"__index_get__" => load_indexed(0),
|
||||
|
|
@ -222,7 +222,7 @@ fn classify_ruby(method: &str) -> Option<ContainerOp> {
|
|||
|
||||
fn classify_php(method: &str) -> Option<ContainerOp> {
|
||||
match method {
|
||||
"array_push" => store(1), // array_push(&$arr, $val) — arr is arg 0, val is arg 1
|
||||
"array_push" => store(1), // array_push(&$arr, $val), arr is arg 0, val is arg 1
|
||||
"array_pop" | "array_shift" | "current" | "next" | "reset" => load(),
|
||||
_ => None,
|
||||
}
|
||||
|
|
@ -232,11 +232,11 @@ fn classify_cpp(method: &str) -> Option<ContainerOp> {
|
|||
match method {
|
||||
// Mutating container operations.
|
||||
// `assign` overwrites the container's contents with the argument
|
||||
// sequence — modeled as Store so the receiver inherits the argument
|
||||
// sequence, modeled as Store so the receiver inherits the argument
|
||||
// taint, matching the runtime "the values now live inside this
|
||||
// container" semantics shared with `push_back`/`emplace_back`.
|
||||
"push_back" | "emplace_back" | "insert" | "emplace" | "push" | "assign" => store(0),
|
||||
// Map/unordered_map insertion: `m.insert_or_assign(k, v)` — value at 1.
|
||||
// Map/unordered_map insertion: `m.insert_or_assign(k, v)`, value at 1.
|
||||
"insert_or_assign" => store_indexed(1, 0),
|
||||
// Read-only container observers. `find`/`count` return iterators or
|
||||
// counts that carry the container's value taint when queried with a
|
||||
|
|
@ -255,7 +255,7 @@ fn classify_rust(method: &str) -> Option<ContainerOp> {
|
|||
match method {
|
||||
"push" | "insert" | "extend" => store(0),
|
||||
"pop" | "first" | "last" | "iter" | "remove" => load(),
|
||||
// vec.get(index) — index at 0
|
||||
// vec.get(index), index at 0
|
||||
"get" => load_indexed(0),
|
||||
_ => None,
|
||||
}
|
||||
|
|
@ -304,7 +304,7 @@ mod tests {
|
|||
}
|
||||
|
||||
// CVE Hunt Session 2 (Owncast CVE-2023-3188 / CVE-2024-31450 family):
|
||||
// Go `*.Decode(&dest)` is the canonical streaming-decoder writeback —
|
||||
// Go `*.Decode(&dest)` is the canonical streaming-decoder writeback:
|
||||
// `json.NewDecoder(r.Body).Decode(&dest)`, `xml.NewDecoder(r).Decode(&out)`,
|
||||
// `gob.NewDecoder(buf).Decode(&v)`. The decoder receiver carries the
|
||||
// source taint and the destination is arg 0; the writeback rule is the
|
||||
|
|
@ -394,7 +394,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
// ── C++ Phase 1 additions ──────────────────────────────────────
|
||||
// ── C++ extras ──────────────────────────────────────
|
||||
|
||||
#[test]
|
||||
fn cpp_push_back_is_store() {
|
||||
|
|
@ -413,7 +413,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn cpp_assign_is_store() {
|
||||
// vector::assign(args) overwrites the container's contents — the
|
||||
// vector::assign(args) overwrites the container's contents, the
|
||||
// receiver inherits argument taint just like push_back.
|
||||
let op = classify_container_op("v.assign", Lang::Cpp);
|
||||
assert!(matches!(op, Some(ContainerOp::Store { .. })));
|
||||
|
|
@ -421,7 +421,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn cpp_insert_or_assign_indexes_value() {
|
||||
// map::insert_or_assign(key, value) — value is at arg 1, key at arg 0.
|
||||
// map::insert_or_assign(key, value), value is at arg 1, key at arg 0.
|
||||
match classify_container_op("m.insert_or_assign", Lang::Cpp) {
|
||||
Some(ContainerOp::Store {
|
||||
value_args,
|
||||
|
|
@ -456,7 +456,7 @@ mod tests {
|
|||
}
|
||||
|
||||
/// W5: synthetic `__index_get__` is recognised as an indexed load
|
||||
/// in JS/TS, Python, and Go — driving the index_arg=0 path so a
|
||||
/// in JS/TS, Python, and Go, driving the index_arg=0 path so a
|
||||
/// constant-key subscript read flows through `HeapSlot::Index(n)`.
|
||||
#[test]
|
||||
fn synth_index_get_classified_as_indexed_load_js_py_go() {
|
||||
|
|
@ -471,7 +471,7 @@ mod tests {
|
|||
}
|
||||
|
||||
/// W5: synthetic `__index_set__` is recognised as an indexed store
|
||||
/// in JS/TS, Python, and Go — value at arg 1, index at arg 0.
|
||||
/// in JS/TS, Python, and Go, value at arg 1, index at arg 0.
|
||||
#[test]
|
||||
fn synth_index_set_classified_as_indexed_store_js_py_go() {
|
||||
for lang in [Lang::JavaScript, Lang::TypeScript, Lang::Python, Lang::Go] {
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@
|
|||
//! where every insert's *value* slot is a syntactic string literal and the
|
||||
//! final lookup is dereffed via a literal fallback (`.unwrap_or(LIT)`). The
|
||||
//! result `cmd` is then provably bounded to the finite set
|
||||
//! `{V1, V2, …, "safe"}`, regardless of what `k` carries — taint-flavour or
|
||||
//! `{V1, V2, …, "safe"}`, regardless of what `k` carries, taint-flavour or
|
||||
//! otherwise. Downstream sink suppression consumes this finite set to
|
||||
//! clear SHELL/FILE/SQL injection findings whose payload is proved to be
|
||||
//! metacharacter-free.
|
||||
|
|
@ -24,7 +24,7 @@
|
|||
//! (e.g. `"table.get(key).copied().unwrap_or"` for `table.get(key).copied()
|
||||
//! .unwrap_or("safe")`) and whose `receiver` is the root identifier's SSA
|
||||
//! value. We therefore do not need to walk SSA `.copied()` / `.unwrap_or`
|
||||
//! instructions as separate hops — pattern-matching on the callee text is
|
||||
//! instructions as separate hops, pattern-matching on the callee text is
|
||||
//! the source of truth. String-literal arguments that the callee text
|
||||
//! elides (e.g. the fallback `"safe"`) are read from the CFG node's
|
||||
//! `arg_string_literals`, populated during CFG construction.
|
||||
|
|
@ -33,7 +33,7 @@
|
|||
//! literal-valued inserts, no escape beyond recognised mutate/read methods.
|
||||
//! Any deviation (dynamic insert, callee not in the allow-list, map used as
|
||||
//! a plain argument, map returned, map joined across a phi) invalidates the
|
||||
//! candidate. Missed detection is safe — it just falls through to existing
|
||||
//! candidate. Missed detection is safe, it just falls through to existing
|
||||
//! behaviour.
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
|
@ -73,15 +73,15 @@ fn is_rust_map_constructor(callee: &str) -> bool {
|
|||
/// Classification of a Call whose receiver is a candidate map.
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
enum MapUse {
|
||||
/// `{var}.insert(K, V)` — value contributes to the finite domain.
|
||||
/// `{var}.insert(K, V)`, value contributes to the finite domain.
|
||||
Insert,
|
||||
/// `{var}.get(K)[.copied()|.cloned()|.as_deref()|.as_ref()]*.unwrap_or`
|
||||
/// — lookup result is bounded by the inserted values plus the fallback
|
||||
/// lookup; the result is bounded by the inserted values plus the fallback
|
||||
/// literal on the CFG node.
|
||||
StaticLookup,
|
||||
/// Whitelisted read-only method (no reference leak).
|
||||
ReadOnly,
|
||||
/// Anything else — invalidates the map candidate.
|
||||
/// Anything else, invalidates the map candidate.
|
||||
Escape,
|
||||
}
|
||||
|
||||
|
|
@ -138,7 +138,7 @@ fn scan_past_balanced_parens(s: &str) -> Option<&str> {
|
|||
/// Return `true` when `s` is a sequence of zero or more identity chain
|
||||
/// methods (`.copied()`, `.cloned()`, `.as_deref()`, `.as_ref()`) followed
|
||||
/// by `.unwrap_or` (and nothing else). The trailing arg list of
|
||||
/// `.unwrap_or` is elided in the callee text — it appears in the CFG node's
|
||||
/// `.unwrap_or` is elided in the callee text, it appears in the CFG node's
|
||||
/// `arg_string_literals` instead.
|
||||
fn is_identity_chain_ending_in_unwrap_or(mut s: &str) -> bool {
|
||||
const IDENTS: &[&str] = &[".copied()", ".cloned()", ".as_deref()", ".as_ref()"];
|
||||
|
|
@ -171,7 +171,7 @@ fn resolve_alias(v: SsaValue, aliases: &HashMap<SsaValue, SsaValue>) -> SsaValue
|
|||
cur
|
||||
}
|
||||
|
||||
/// Run the analysis. Bails out immediately for non-Rust bodies — the current
|
||||
/// Run the analysis. Bails out immediately for non-Rust bodies, the current
|
||||
/// pattern set only models Rust `std::collections::HashMap`.
|
||||
pub fn analyze(
|
||||
body: &SsaBody,
|
||||
|
|
@ -382,7 +382,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn classify_static_lookup_without_identity_chain() {
|
||||
// `.unwrap_or` directly after `.get(...)` also qualifies — Rust
|
||||
// `.unwrap_or` directly after `.get(...)` also qualifies, Rust
|
||||
// `HashMap::get` returns `Option<&V>`, so `.unwrap_or(&"safe")` is
|
||||
// syntactically valid and equally bounded.
|
||||
assert_eq!(
|
||||
|
|
@ -401,7 +401,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn classify_rejects_unknown_terminator() {
|
||||
// `.unwrap_or_else(|| …)` is not modelled — closure can return anything.
|
||||
// `.unwrap_or_else(|| …)` is not modelled, closure can return anything.
|
||||
assert_eq!(
|
||||
classify_map_use("t.get(k).copied().unwrap_or_else", "t"),
|
||||
MapUse::Escape
|
||||
|
|
@ -414,7 +414,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn classify_rejects_other_receiver() {
|
||||
// `other.insert` does not belong to `table` — receiver mismatch.
|
||||
// `other.insert` does not belong to `table`, receiver mismatch.
|
||||
assert_eq!(classify_map_use("other.insert", "table"), MapUse::Escape);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,23 +25,21 @@ pub enum TypeKind {
|
|||
FileHandle,
|
||||
Url,
|
||||
HttpClient,
|
||||
/// A local, in-memory collection (HashMap, HashSet, Vec,
|
||||
/// BTreeMap, …). Consumed by the auth analysis sink gate so method
|
||||
/// calls on variables of this type (`map.insert(...)`) are treated
|
||||
/// as in-memory bookkeeping rather than cross-tenant sinks. Has no
|
||||
/// `label_prefix` — it never participates in label-based callee
|
||||
/// A local, in-memory collection (HashMap, HashSet, Vec, etc.).
|
||||
/// The auth sink gate uses this so calls like `map.insert(...)`
|
||||
/// are treated as bookkeeping rather than cross-tenant sinks. No
|
||||
/// `label_prefix`, never participates in label-based callee
|
||||
/// resolution.
|
||||
LocalCollection,
|
||||
/// Phase 6: a framework-injected DTO body whose field types are
|
||||
/// known. Populated only when a parameter is recognised as a typed
|
||||
/// extractor by a Phase 1-2 matcher AND the DTO class / struct /
|
||||
/// Pydantic model is resolvable in the current scan scope.
|
||||
/// Strictly additive — when no DTO definition is found, callers
|
||||
/// fall through to today's pre-Phase-6 behaviour.
|
||||
/// A framework-injected DTO body whose field types are known.
|
||||
/// Populated when a parameter is recognised as a typed extractor and
|
||||
/// the DTO class / struct / Pydantic model is resolvable in scope.
|
||||
/// Strictly additive, without a DTO definition, callers fall back
|
||||
/// to name-only resolution.
|
||||
Dto(DtoFields),
|
||||
}
|
||||
|
||||
/// Phase 6: structural carrier for a recognised DTO type. Maps
|
||||
/// Structural carrier for a recognised DTO type. Maps
|
||||
/// declared field names to their inferred [`TypeKind`]. Nested DTOs
|
||||
/// use [`TypeKind::Dto`] recursively.
|
||||
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
|
||||
|
|
@ -82,19 +80,11 @@ impl TypeKind {
|
|||
}
|
||||
}
|
||||
|
||||
/// Container name used by the typed call-graph devirtualisation
|
||||
/// (`docs/typed-call-graph-prompt.md`, Phase 2).
|
||||
///
|
||||
/// Returns the class / impl / module string under which an SSA
|
||||
/// receiver value of this type would be looked up in
|
||||
/// [`crate::callgraph::ClassMethodIndex`]. Mirrors
|
||||
/// [`Self::label_prefix`] for the security-relevant abstract
|
||||
/// types (HttpClient → `"HttpClient"`, DatabaseConnection →
|
||||
/// `"DatabaseConnection"`, etc.) and additionally returns the DTO
|
||||
/// class name for [`TypeKind::Dto`] receivers.
|
||||
///
|
||||
/// Scalar / unknown types return `None` — they have no defining
|
||||
/// container and would not narrow a method-call edge meaningfully.
|
||||
/// Container name used by typed call-graph devirtualisation:
|
||||
/// the class / impl / module under which a receiver of this type
|
||||
/// would be looked up. Returns the DTO class name for `Dto`
|
||||
/// receivers, label prefixes for known abstract types, `None` for
|
||||
/// scalars.
|
||||
pub fn container_name(&self) -> Option<String> {
|
||||
if let Some(prefix) = self.label_prefix() {
|
||||
return Some(prefix.to_string());
|
||||
|
|
@ -105,7 +95,7 @@ impl TypeKind {
|
|||
None
|
||||
}
|
||||
|
||||
/// Phase 6: convenience accessor for the inner `DtoFields` if this
|
||||
/// Convenience accessor for the inner `DtoFields` if this
|
||||
/// type is a recognised DTO.
|
||||
pub fn as_dto(&self) -> Option<&DtoFields> {
|
||||
match self {
|
||||
|
|
@ -146,7 +136,7 @@ impl TypeFact {
|
|||
TypeFact { kind, nullable }
|
||||
}
|
||||
|
||||
/// Phase 6: factory used by the field-access propagation rule.
|
||||
/// Factory used by the field-access propagation rule.
|
||||
pub(crate) fn from_dto_field(receiver: &TypeKind, field: &str) -> Option<Self> {
|
||||
let dto = receiver.as_dto()?;
|
||||
let kind = dto.get(field)?.clone();
|
||||
|
|
@ -190,10 +180,10 @@ impl TypeFactResult {
|
|||
///
|
||||
/// Suppression policy:
|
||||
/// * [`TypeKind::Int`] (and float, treated as numeric): suppresses
|
||||
/// `SQL_QUERY`, `FILE_IO`, `SHELL_ESCAPE`, `HTML_ESCAPE`, `SSRF` —
|
||||
/// `SQL_QUERY`, `FILE_IO`, `SHELL_ESCAPE`, `HTML_ESCAPE`, `SSRF`;
|
||||
/// numeric values cannot carry the metacharacters required to drive
|
||||
/// any of these injection classes.
|
||||
/// * [`TypeKind::Bool`]: suppresses every type-suppressible bit —
|
||||
/// * [`TypeKind::Bool`]: suppresses every type-suppressible bit;
|
||||
/// `true`/`false` cannot carry a payload of any kind.
|
||||
pub fn is_type_safe_for_sink(
|
||||
values: &[SsaValue],
|
||||
|
|
@ -245,6 +235,18 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
|
|||
Lang::JavaScript | Lang::TypeScript => match suffix {
|
||||
"URL" => Some(TypeKind::Url),
|
||||
"Request" | "XMLHttpRequest" => Some(TypeKind::HttpClient),
|
||||
// JS built-in collection constructors. `new Map()` / `new Set()`
|
||||
// / `new WeakMap()` / `new WeakSet()` / `new Array()` produce
|
||||
// in-memory collections; downstream `m.get(k)` / `m.set(k, v)`
|
||||
// / `s.add(x)` / `s.has(x)` / `arr.find(p)` are container ops,
|
||||
// not data-layer reads. Without this mapping the bare verb
|
||||
// dispatch in `auth_analysis::config::classify_sink_class`
|
||||
// matches the `get` / `find` / `add` read/mutation indicators
|
||||
// and over-fires `js.auth.missing_ownership_check` on every
|
||||
// Map lookup in pure data-manipulation code (excalidraw's
|
||||
// `elementsMap.get(id)`, `origIdToDuplicateId.get(...)`,
|
||||
// `groupIdMapForOperation.set(...)` shapes).
|
||||
"Map" | "Set" | "WeakMap" | "WeakSet" | "Array" => Some(TypeKind::LocalCollection),
|
||||
_ => None,
|
||||
},
|
||||
Lang::Python => {
|
||||
|
|
@ -334,10 +336,9 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
|
|||
Some(TypeKind::DatabaseConnection)
|
||||
} else if is_rust_local_collection_constructor(base) {
|
||||
// Rust std/indexmap/smallvec/dashmap collection
|
||||
// constructors map to a generic "local collection" type so
|
||||
// the auth analysis sink gate can recognise
|
||||
// `let x = factory_fn(); x.insert(..)` even when the RHS
|
||||
// isn't a syntactic constructor call.
|
||||
// constructors map to a generic "local collection" type
|
||||
// so the auth sink gate recognises
|
||||
// `let x = factory_fn(); x.insert(..)`.
|
||||
Some(TypeKind::LocalCollection)
|
||||
} else {
|
||||
None
|
||||
|
|
@ -421,6 +422,15 @@ fn is_rust_local_collection_constructor(base: &str) -> bool {
|
|||
"FxHashSet",
|
||||
"DashMap",
|
||||
"DashSet",
|
||||
// `roaring` crate, RoaringBitmap / RoaringTreemap are
|
||||
// in-memory bitset / bitmap containers (set-of-u32 /
|
||||
// set-of-u64). Used heavily by indexing systems
|
||||
// (meilisearch's index-scheduler) for `task_ids`,
|
||||
// `docids`, and similar local-collection bookkeeping.
|
||||
// Mutations (`insert` / `remove` / `clear`) are container
|
||||
// ops, not data-layer writes.
|
||||
"RoaringBitmap",
|
||||
"RoaringTreemap",
|
||||
];
|
||||
const VERBS: &[&str] = &[
|
||||
"new",
|
||||
|
|
@ -460,11 +470,73 @@ pub fn is_int_producing_callee(callee: &str) -> bool {
|
|||
| "Atoi" | "ParseInt" | "ParseFloat" // Go
|
||||
| "intval" | "floatval" // PHP
|
||||
| "to_i" | "to_f" // Ruby
|
||||
| "parse" // Rust: `.parse::<N>()` / `.parse().unwrap()` — conservative
|
||||
| "parse" // Rust: `.parse::<N>()` / `.parse().unwrap()`, conservative
|
||||
// (most Rust .parse() calls target numeric types)
|
||||
)
|
||||
}
|
||||
|
||||
/// Polarity hint for a generic input-validator callee.
|
||||
///
|
||||
/// Most validation idioms route attacker-controlled input through a
|
||||
/// helper whose result the caller branches on:
|
||||
|
||||
///
|
||||
/// ```text
|
||||
/// const err = validateUrlSsrf(child.webhookUrl); // ErrorReturning
|
||||
/// if (err) throw new Error(err); // false branch → success
|
||||
///
|
||||
/// if (isValid(input)) { use(input); } // BooleanTrueIsValid
|
||||
/// // true branch → success
|
||||
/// ```
|
||||
///
|
||||
/// Without modeling this pattern, a one-statement rewrite of a
|
||||
/// `validate(x); if(x) ...` guard hides the semantic equivalence to
|
||||
/// `if (validate(x)) ...` (already classified as ValidationCall). The
|
||||
/// classifier discriminates only on the textual head of the bare call
|
||||
/// and is strictly additive: callees that don't match any pattern return
|
||||
/// `None` and the engine falls through to its existing behaviour.
|
||||
///
|
||||
/// Motivated by Novu CVE GHSA-4x48-cgf9-q33f
|
||||
/// (`const ssrfError = await validateUrlSsrf(child.webhookUrl); if (ssrfError) throw`).
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
||||
pub enum InputValidatorPolarity {
|
||||
/// Returns boolean; truthy means "valid".
|
||||
BooleanTrueIsValid,
|
||||
/// Returns null/undefined on success, error/message on failure;
|
||||
/// truthy means "rejected".
|
||||
ErrorReturning,
|
||||
}
|
||||
|
||||
pub fn classify_input_validator_callee(callee: &str) -> Option<InputValidatorPolarity> {
|
||||
let base = peel_identity_suffix(callee);
|
||||
let suffix = base.rsplit(['.', ':']).next().unwrap_or(&base);
|
||||
let lower = suffix.to_ascii_lowercase();
|
||||
|
||||
// Boolean returners, name typically reads as a predicate
|
||||
// (`isValid…`, `is_valid_…`, `is_safe…`, `has_valid…`). Truthy
|
||||
// result → input is valid → TRUE branch carries the validation.
|
||||
if lower.starts_with("isvalid")
|
||||
|| lower.starts_with("is_valid")
|
||||
|| lower.starts_with("issafe")
|
||||
|| lower.starts_with("is_safe")
|
||||
|| lower.starts_with("hasvalid")
|
||||
|| lower.starts_with("has_valid")
|
||||
{
|
||||
return Some(InputValidatorPolarity::BooleanTrueIsValid);
|
||||
}
|
||||
|
||||
// Error-returning validators, name reads as a verb whose return
|
||||
// value carries the error description. `validateXxx`, `verifyXxx`
|
||||
// are the dominant idioms; we deliberately do NOT match `check…`
|
||||
// here because a name like `checkPermissions` overlaps with auth
|
||||
// checks (different semantic) and the suppression payoff isn't
|
||||
// worth the precision risk.
|
||||
if lower.starts_with("validate") || lower.starts_with("verify") {
|
||||
return Some(InputValidatorPolarity::ErrorReturning);
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
/// Analyze types for all SSA values.
|
||||
///
|
||||
/// Uses constant propagation results to seed types from known constants,
|
||||
|
|
@ -571,7 +643,7 @@ pub fn analyze_types_with_param_types(
|
|||
| BinOp::Gt
|
||||
| BinOp::GtEq,
|
||||
) => TypeFact::from_kind(TypeKind::Int),
|
||||
// Add could be string concatenation — defer to operand types
|
||||
// Add could be string concatenation, defer to operand types
|
||||
_ => TypeFact::unknown(),
|
||||
}
|
||||
}
|
||||
|
|
@ -587,7 +659,7 @@ pub fn analyze_types_with_param_types(
|
|||
Some(tk) => TypeFact::from_kind(tk.clone()),
|
||||
None => TypeFact::unknown(),
|
||||
},
|
||||
// Undef contributes no type information — phi joins
|
||||
// Undef contributes no type information, phi joins
|
||||
// pick up the type from the other (defined) operand.
|
||||
SsaOp::Undef => TypeFact::unknown(),
|
||||
};
|
||||
|
|
@ -603,7 +675,7 @@ pub fn analyze_types_with_param_types(
|
|||
|
||||
for block in &body.blocks {
|
||||
// Identity-preserving method calls: pass through receiver's type.
|
||||
// E.g. `Connection::open(p).unwrap()` — the `.unwrap()` call's type
|
||||
// E.g. `Connection::open(p).unwrap()`, the `.unwrap()` call's type
|
||||
// fact should mirror the receiver (Result<Connection>). Only applies
|
||||
// when the current fact is still Unknown so explicit constructor
|
||||
// mappings win.
|
||||
|
|
@ -618,7 +690,7 @@ pub fn analyze_types_with_param_types(
|
|||
continue;
|
||||
}
|
||||
// A numeric-length accessor pinned by the first pass is
|
||||
// load-bearing for sink suppression — do not let identity-
|
||||
// load-bearing for sink suppression; do not let identity-
|
||||
// method receiver propagation overwrite the Int fact.
|
||||
if cfg
|
||||
.node_weight(inst.cfg_node)
|
||||
|
|
@ -644,7 +716,7 @@ pub fn analyze_types_with_param_types(
|
|||
}
|
||||
}
|
||||
|
||||
// Phase 6.3: FieldProj receiver-driven type narrowing. When
|
||||
// FieldProj receiver-driven type narrowing. When
|
||||
// SSA lowering decomposed `a.b.c()` into a FieldProj chain,
|
||||
// intermediate FieldProj insts default to `projected_type =
|
||||
// None`. If the receiver value carries a Dto fact and the
|
||||
|
|
@ -701,7 +773,7 @@ pub fn analyze_types_with_param_types(
|
|||
// Copy assignments and binary arithmetic
|
||||
for inst in &block.body {
|
||||
// Preserve the Int fact pinned by the numeric-length-access
|
||||
// detector in the first pass — copy propagation would replace
|
||||
// detector in the first pass; copy propagation would replace
|
||||
// it with the receiver's (usually Unknown) type and defeat the
|
||||
// whole point of the accessor rule.
|
||||
if cfg
|
||||
|
|
@ -712,11 +784,11 @@ pub fn analyze_types_with_param_types(
|
|||
}
|
||||
if let SsaOp::Assign(uses) = &inst.op {
|
||||
if uses.len() == 1 {
|
||||
// Phase 6.3: when the RHS is a single member-access
|
||||
// When the RHS is a single member-access
|
||||
// expression and the receiver value carries a
|
||||
// `TypeKind::Dto(fields)` fact, route the assignment's
|
||||
// type to the field's declared `TypeKind`. Strictly
|
||||
// additive — falls through to copy-prop when the
|
||||
// additive, falls through to copy-prop when the
|
||||
// receiver isn't a DTO or the field isn't recorded.
|
||||
let dto_field_fact = cfg
|
||||
.node_weight(inst.cfg_node)
|
||||
|
|
@ -777,7 +849,7 @@ pub fn analyze_types_with_param_types(
|
|||
/// Used for `instanceof` resolution and type-qualified method dispatch.
|
||||
pub struct TypeHierarchy;
|
||||
|
||||
/// (subtype, &[supertypes]) — sink-relevant framework types only.
|
||||
/// (subtype, &[supertypes]), sink-relevant framework types only.
|
||||
static JAVA_HIERARCHY: &[(&str, &[&str])] = &[
|
||||
("HttpServletResponse", &["ServletResponse"]),
|
||||
("HttpServletRequest", &["ServletRequest"]),
|
||||
|
|
@ -853,7 +925,7 @@ impl TypeHierarchy {
|
|||
///
|
||||
/// Conservative: unknown interfaces → `true` (could satisfy).
|
||||
/// Only [`definitely_not`](GoInterfaceTable::definitely_not) is used for
|
||||
/// suppression — it returns `true` only when the type provably cannot
|
||||
/// suppression; it returns `true` only when the type provably cannot
|
||||
/// implement the interface.
|
||||
pub struct GoInterfaceTable;
|
||||
|
||||
|
|
@ -1147,8 +1219,8 @@ mod tests {
|
|||
assert_eq!(result.get_type(SsaValue(99)), None);
|
||||
}
|
||||
|
||||
/// Phase 4: Int-typed values must suppress every type-suppressible
|
||||
/// cap — including the freshly-added `SSRF` bit. Numeric IDs
|
||||
/// Int-typed values must suppress every type-suppressible
|
||||
/// cap, including the freshly-added `SSRF` bit. Numeric IDs
|
||||
/// cannot rewrite a URL host, cannot form path traversal sequences,
|
||||
/// cannot carry SQL/HTML/shell metacharacters.
|
||||
#[test]
|
||||
|
|
@ -1183,7 +1255,7 @@ mod tests {
|
|||
));
|
||||
}
|
||||
|
||||
/// Phase 4: Bool-typed values are even safer than ints — `true` /
|
||||
/// Bool-typed values are even safer than ints: `true` /
|
||||
/// `false` cannot carry any payload and must suppress every
|
||||
/// type-suppressible cap.
|
||||
#[test]
|
||||
|
|
@ -1207,7 +1279,7 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// String-typed values must NOT trigger suppression — they are the
|
||||
/// String-typed values must NOT trigger suppression; they are the
|
||||
/// canonical injection carrier. Regression guard so a future
|
||||
/// change to `is_type_safe_for_sink` does not silently silence
|
||||
/// real String-payload findings.
|
||||
|
|
@ -1349,8 +1421,8 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// Audit A3 (companion): mixed-type operand list — only one Int
|
||||
/// among operands of unknown type — must NOT suppress. The
|
||||
/// Audit A3 (companion): mixed-type operand list, only one Int
|
||||
/// among operands of unknown type, must NOT suppress. The
|
||||
/// suppression rule requires every operand to be payload-incompatible.
|
||||
#[test]
|
||||
fn mixed_type_operands_do_not_suppress() {
|
||||
|
|
@ -1366,7 +1438,7 @@ mod tests {
|
|||
));
|
||||
}
|
||||
|
||||
/// Phase 3: Param values seeded from `param_types` must surface
|
||||
/// Param values seeded from `param_types` must surface
|
||||
/// the right TypeKind for downstream sink suppression. An out-of-
|
||||
/// range index falls back to Unknown (the pre-Phase-3 default).
|
||||
#[test]
|
||||
|
|
@ -1590,6 +1662,47 @@ mod tests {
|
|||
assert_eq!(constructor_type(Lang::Cpp, "printf"), None);
|
||||
}
|
||||
|
||||
#[test]
fn constructor_type_javascript_typescript_local_collections() {
    // The built-in container constructors (`new Map()`, `new Set()`,
    // `new WeakMap()`, `new WeakSet()`, `new Array()`) yield in-memory
    // collections. Knowing the receiver type is what suppresses the
    // `elementsMap.get(id)` shape seen in Excalidraw, which dominates the
    // `js.auth.missing_ownership_check` cluster on JS data-manipulation
    // libraries.
    const LOCAL_COLLECTION_CTORS: [&str; 5] = ["Map", "Set", "WeakMap", "WeakSet", "Array"];
    // Unrelated identifiers that must not resolve to any type.
    const UNMAPPED_NAMES: [&str; 3] = ["Object", "Promise", "Foo"];

    for lang in [Lang::JavaScript, Lang::TypeScript] {
        for ctor in LOCAL_COLLECTION_CTORS {
            assert_eq!(constructor_type(lang, ctor), Some(TypeKind::LocalCollection));
        }

        // Mappings that existed before the local-collection entries were
        // added must keep resolving.
        let url_kind = constructor_type(lang, "URL");
        assert_eq!(url_kind, Some(TypeKind::Url));
        let xhr_kind = constructor_type(lang, "XMLHttpRequest");
        assert_eq!(xhr_kind, Some(TypeKind::HttpClient));

        // Negative cases stay `None`.
        for name in UNMAPPED_NAMES {
            assert_eq!(constructor_type(lang, name), None);
        }
    }
}
|
||||
|
||||
#[test]
|
||||
fn constructor_type_ruby() {
|
||||
// HttpClient
|
||||
|
|
@ -1680,7 +1793,7 @@ mod tests {
|
|||
constructor_type(Lang::Rust, "diesel::SqliteConnection::establish"),
|
||||
Some(TypeKind::DatabaseConnection)
|
||||
);
|
||||
// Bare `Connection::open` is accepted — Rust idiom
|
||||
// Bare `Connection::open` is accepted: the Rust idiom
|
||||
// `use rusqlite::Connection; Connection::open(…)` is common, and the
|
||||
// scanner sees the unqualified callee text after import resolution.
|
||||
// Accepting this matches the benchmark fixture `rs-sqli-001`.
|
||||
|
|
@ -1938,9 +2051,9 @@ mod tests {
|
|||
);
|
||||
}
|
||||
|
||||
// ── Phase 6 DTO field-level taint ─────────────────────────────────────
|
||||
// ── DTO field-level taint ─────────────────────────────────────────────
|
||||
|
||||
/// Phase 6: `TypeFact::from_dto_field` returns `Some(field_kind)`
|
||||
/// `TypeFact::from_dto_field` returns `Some(field_kind)`
|
||||
/// for a DTO receiver whose `fields` map contains the requested
|
||||
/// field, and `None` otherwise.
|
||||
#[test]
|
||||
|
|
@ -1956,7 +2069,7 @@ mod tests {
|
|||
assert!(TypeFact::from_dto_field(&recv, "missing").is_none());
|
||||
}
|
||||
|
||||
/// Phase 6: a non-DTO receiver kind never produces a field fact —
|
||||
/// A non-DTO receiver kind never produces a field fact;
|
||||
/// `from_dto_field` falls through to the legacy copy-prop path.
|
||||
#[test]
|
||||
fn dto_field_lookup_on_non_dto_returns_none() {
|
||||
|
|
@ -1974,10 +2087,9 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
/// Phase 6: nested DTO — the parent DTO's field type is
|
||||
/// `TypeKind::Dto`, and `from_dto_field` returns that nested DTO
|
||||
/// fact directly. Phase 6.3 callers can recurse into the inner
|
||||
/// fields by following the returned receiver's `as_dto()` chain.
|
||||
/// Nested DTO: the parent DTO's field type is `TypeKind::Dto`,
|
||||
/// and `from_dto_field` returns that nested DTO fact directly.
|
||||
/// Callers can recurse via `as_dto()`.
|
||||
#[test]
|
||||
fn dto_field_lookup_supports_nested_dto() {
|
||||
let mut inner = DtoFields::new("Address");
|
||||
|
|
@ -1990,7 +2102,7 @@ mod tests {
|
|||
assert_eq!(addr.kind, TypeKind::Dto(inner));
|
||||
}
|
||||
|
||||
/// Phase 6: an empty DTO (class declared but with no inferred
|
||||
/// An empty DTO (class declared but with no inferred
|
||||
/// fields) never resolves field reads. Documents the safe-fallback
|
||||
/// invariant so the legacy path runs when class fields couldn't be
|
||||
/// classified.
|
||||
|
|
@ -2000,9 +2112,8 @@ mod tests {
|
|||
assert!(TypeFact::from_dto_field(&recv, "anything").is_none());
|
||||
}
|
||||
|
||||
/// Phase 6: an `Int`-typed field in a DTO survives the
|
||||
/// type-suppression matrix exactly the same way a freestanding
|
||||
/// `Int` does — sanity-check the bridge between Phase 6 and Phase 4.
|
||||
/// An `Int`-typed DTO field survives the type-suppression matrix
|
||||
/// the same way a freestanding `Int` does.
|
||||
#[test]
|
||||
fn dto_int_field_suppresses_sql_query_via_matrix() {
|
||||
use crate::labels::Cap;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue