Python FP and docs updates (#58)

* refactor: Update comments for clarity and add expectations.json files for performance metrics

* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks

* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks

* refactor: Simplify code formatting for better readability in multiple files

* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration

* feat: Update Java and Python patterns to include new security rules

* refactor: Improve comment clarity and consistency across multiple Rust files

* refactor: Simplify code formatting for improved readability in integration tests and module files

* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
Eli Peter 2026-04-29 19:53:34 -04:00 committed by GitHub
parent 4db0805de6
commit a438886217
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
291 changed files with 9485 additions and 3851 deletions

View file

@ -59,7 +59,7 @@ impl BaseAliasResult {
///
/// For each entry `(dst_val, src_val)` where copy prop replaced `dst` with
/// `src`, looks up the original variable names. If both are plain identifiers
/// (no dots i.e. not field paths), they are registered as base aliases.
/// (no dots, i.e. not field paths), they are registered as base aliases.
/// Transitive closure is computed so `b = a; c = b` yields group `{a, b, c}`.
pub fn compute_base_aliases(
copy_map: &HashMap<SsaValue, SsaValue>,
@ -103,7 +103,7 @@ pub fn compute_base_aliases(
let ra = find(parent, a);
let rb = find(parent, b);
if ra != rb {
// Arbitrary root choice alphabetically smaller becomes root
// Arbitrary root choice, alphabetically smaller becomes root
// for determinism.
if ra < rb {
parent.insert(rb, ra);
@ -130,7 +130,7 @@ pub fn compute_base_aliases(
None => continue,
};
// Only alias plain idents dotted paths (field accesses) are tracked
// Only alias plain idents, dotted paths (field accesses) are tracked
// independently in SSA and handled by field-aware suppression.
if dst_name.contains('.') || src_name.contains('.') {
continue;

View file

@ -17,7 +17,7 @@ pub enum ConstLattice {
Bool(bool),
/// Null / nil / None.
Null,
/// Multiple possible values not constant.
/// Multiple possible values, not constant.
Varying,
}
@ -70,7 +70,7 @@ impl ConstLattice {
return ConstLattice::Str(inner.to_string());
}
// Bare string (no quotes) treat as string constant
// Bare string (no quotes), treat as string constant
ConstLattice::Str(trimmed.to_string())
}
@ -283,7 +283,7 @@ fn eval_inst(inst: &SsaInst, values: &HashMap<SsaValue, ConstLattice>) -> ConstL
| SsaOp::SelfParam
| SsaOp::CatchParam => ConstLattice::Varying,
// FieldProj: projecting a field is dynamic with respect to the
// const-propagation lattice there is no general way to fold
// const-propagation lattice, there is no general way to fold
// `obj.field` to a known scalar at this phase. Returning Varying
// matches Call: callers needing field-level constness will go
// through the points-to / heap analysis.
@ -452,7 +452,7 @@ fn mark_edge_executable(
if executable_blocks.insert(to) {
cfg_worklist.push_back(to);
} else {
// Block already executable but new edge re-evaluate phis
// Block already executable but new edge, re-evaluate phis
cfg_worklist.push_back(to);
}
}
@ -863,7 +863,7 @@ mod tests {
/// Const parsing must round-trip integer signs. i64::MIN/MAX must
/// parse without overflow; arbitrary text falls back to a bare-string
/// const (current contract tested here so a future change is
/// const (current contract, tested here so a future change is
/// caught explicitly).
#[test]
fn const_parse_extremes_and_fallback() {

View file

@ -25,7 +25,7 @@ pub fn copy_propagate(body: &mut SsaBody, cfg: &Cfg) -> (usize, HashMap<SsaValue
if uses.len() == 1 {
let src = uses[0];
let info = &cfg[inst.cfg_node];
// Skip if the node has labels sanitizers, sources, sinks
// Skip if the node has labels, sanitizers, sources, sinks
// have semantic meaning that must be preserved.
if !info.taint.labels.is_empty() {
continue;
@ -244,7 +244,7 @@ mod tests {
}
/// A four-deep copy chain v3 = v2 = v1 = v0 must collapse to v0
/// in a single `copy_propagate` pass the resolved replacement
/// in a single `copy_propagate` pass, the resolved replacement
/// map drives downstream alias recovery, so the *transitive*
/// closure must be exposed, not just the immediate parent.
#[test]
@ -390,7 +390,7 @@ mod tests {
}
/// Skip path 2: numeric-length reads (`arr.length`, `map.size`)
/// have a different type from their source propagating through
/// have a different type from their source, propagating through
/// would erase the Int type fact.
#[test]
fn copy_through_numeric_length_access_is_not_propagated() {

View file

@ -51,7 +51,7 @@ pub fn eliminate_dead_defs(body: &mut SsaBody, cfg: &Cfg) -> usize {
/// condition variable. Without counting these, a value used solely by a
/// terminator (the canonical case for short helpers like
/// `def f(s): return s`) is judged dead, and DCE strips every instruction
/// in the body leaving empty blocks whose terminators reference
/// in the body, leaving empty blocks whose terminators reference
/// nonexistent SsaValues, breaking downstream analyses (per-return-path
/// PathFact narrowing, inline-summary extraction, etc.).
fn build_use_counts(body: &SsaBody) -> HashMap<SsaValue, usize> {
@ -170,8 +170,8 @@ mod tests {
#[test]
fn dead_const_removed() {
// v0 = const("42") unused, should be removed
// v1 = source() must survive even if unused
// v0 = const("42"), unused, should be removed
// v1 = source(), must survive even if unused
let mut cfg: Cfg = Graph::new();
let n0 = cfg.add_node(make_cfg_node(StmtKind::Seq));
let n1 = cfg.add_node(make_cfg_node(StmtKind::Seq));
@ -228,7 +228,7 @@ mod tests {
#[test]
fn dead_sanitizer_label_preserved() {
// v0 has a Sanitizer label on its CFG node must survive even if unused
// v0 has a Sanitizer label on its CFG node, must survive even if unused
use crate::labels::{Cap, DataLabel};
let mut cfg: Cfg = Graph::new();
@ -277,7 +277,7 @@ mod tests {
#[test]
fn dead_source_label_preserved() {
// v0 has a Source label on its CFG node must survive even if unused
// v0 has a Source label on its CFG node, must survive even if unused
use crate::labels::{Cap, DataLabel};
let mut cfg: Cfg = Graph::new();
@ -541,7 +541,7 @@ mod tests {
#[test]
fn used_def_preserved() {
// v0 = const("42"), v1 = assign(v0) v0 is used, both survive
// v0 = const("42"), v1 = assign(v0), v0 is used, both survive
let mut cfg: Cfg = Graph::new();
let n0 = cfg.add_node(make_cfg_node(StmtKind::Seq));
let n1 = cfg.add_node(make_cfg_node(StmtKind::Seq));
@ -597,7 +597,7 @@ mod tests {
}
/// DCE must NEVER remove a Call instruction even when its result has
/// zero uses calls have side effects (I/O, throws, mutations) that
/// zero uses, calls have side effects (I/O, throws, mutations) that
/// cannot be modeled as SSA-value uses. This is the conservative
/// invariant `is_dead()` enforces; regressing it would silently drop
/// real-world code from analysis (sinks, sanitizers expressed as

View file

@ -8,7 +8,7 @@
//! Key design:
//! - HeapObjectId is keyed by allocation-site SsaValue (deterministic, zero-cost)
//! - PointsToSet is bounded to `analysis.engine.max_pointsto` entries
//! (default 32, widening on overflow see [`effective_max_pointsto`]).
//! (default 32, widening on overflow, see [`effective_max_pointsto`]).
//! Overflow drops emit an [`crate::engine_notes::EngineNote::PointsToTruncated`]
//! note and increment [`POINTSTO_TRUNCATION_COUNT`] so operators can
//! tell when the cap is firing on their corpus.
@ -16,7 +16,7 @@
//! - HeapSlot::Index(u64) for constant-index container access (proven by const propagation)
//! - HeapSlot::Elements for coarse element access (push/pop, dynamic index, overflow)
//! - Intraprocedural: constant-index sensitivity is guaranteed when const propagation proves it
//! - Interprocedural: best-effort relies on correct const_values threading (already handled)
//! - Interprocedural: best-effort, relies on correct const_values threading (already handled)
//! - Unknown/unproven indices fall back to Elements (conservative)
//! - Analysis runs as a pre-pass in optimize_ssa(), like type_facts
@ -32,7 +32,7 @@ use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use std::collections::HashMap;
// Heap origin cap used to be `const MAX_HEAP_ORIGINS: usize = 4` now
// Heap origin cap used to be `const MAX_HEAP_ORIGINS: usize = 4`, now
// governed by the shared `analysis.engine.max_origins` knob through
// `crate::taint::ssa_transfer::push_origin_bounded`. Unifying the two
// lattices behind a single tunable means operators raise *one* value to
@ -47,7 +47,7 @@ static MAX_POINTSTO_OVERRIDE: std::sync::atomic::AtomicUsize =
/// Total heap-object members dropped by [`PointsToSet`] truncation since
/// the last reset. Captured from `insert`/`union` so tests (and
/// operators inspecting scan output) can detect truncation events that
/// don't propagate to a finding e.g. when the cap is tight enough
/// don't propagate to a finding, e.g. when the cap is tight enough
/// that no taint flow survives to emit a sink event.
pub(crate) static POINTSTO_TRUNCATION_COUNT: std::sync::atomic::AtomicUsize =
std::sync::atomic::AtomicUsize::new(0);
@ -114,7 +114,7 @@ pub const MAX_TRACKED_INDICES: usize = 8;
/// Distinguishes constant-index container access from coarse element access.
///
/// `Elements` is the conservative default all container elements merge into
/// `Elements` is the conservative default, all container elements merge into
/// a single taint. `Index(n)` provides per-index precision when the index is
/// provably a non-negative integer constant (via the function's own const
/// propagation pass).
@ -302,10 +302,10 @@ impl HeapTaint {
/// union of per-slot taint), matching the `SsaTaintState` pattern.
///
/// Load semantics:
/// - `load(id, Index(n))`: union of `(id, Index(n))` and `(id, Elements)`
/// - `load(id, Index(n))`: union of `(id, Index(n))` and `(id, Elements)`;
/// indexed reads also see taint from dynamic/push operations.
/// - `load(id, Elements)`: union of `(id, Elements)` and ALL `(id, Index(*))`
/// entries dynamic reads conservatively see all indexed taint.
/// entries, dynamic reads conservatively see all indexed taint.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct HeapState {
entries: SmallVec<[((HeapObjectId, HeapSlot), HeapTaint); 4]>,
@ -927,7 +927,7 @@ mod tests {
set_max_pointsto_override(4);
reset_points_to_observability();
// a = {0,1,2,3}, b = {4,5,6} union wants 7 members; cap is 4
// a = {0,1,2,3}, b = {4,5,6}, union wants 7 members; cap is 4
// so 3 members are dropped. Deterministic order: smallest
// ids survive.
let mut a = PointsToSet::empty();
@ -1215,7 +1215,7 @@ mod tests {
#[test]
fn heap_elements_load_unions_all_indices() {
// Store to Index(0) and Index(2) Elements load should see both
// Store to Index(0) and Index(2), Elements load should see both
let mut h = HeapState::empty();
let id = HeapObjectId(SsaValue(0));
h.store(id, HeapSlot::Index(0), Cap::HTML_ESCAPE, &[origin(0)]);

View file

@ -20,33 +20,33 @@
//!
//! Invariants are split into two groups:
//!
//! **Group A SSA integrity (must hold unconditionally):**
//! **Group A, SSA integrity (must hold unconditionally):**
//!
//! 1. `BlockId` indexing `blocks[i].id == BlockId(i)`
//! 1. `BlockId` indexing, `blocks[i].id == BlockId(i)`
//! 2. Entry block has no predecessors
//! 3. Pred/succ symmetry `B.succs.contains(S)` ⇔ `S.preds.contains(B)`
//! 4. Phi placement every phi appears in `block.phis` (never in body)
//! 5. Phi operand arity ≤ `block.preds.len()`
//! 6. Phi operand sources every `(pred_bid, _)` operand has
//! 3. Pred/succ symmetry, `B.succs.contains(S)` ⇔ `S.preds.contains(B)`
//! 4. Phi placement, every phi appears in `block.phis` (never in body)
//! 5. Phi operand arity, ≤ `block.preds.len()`
//! 6. Phi operand sources, every `(pred_bid, _)` operand has
//! `block.preds.contains(pred_bid)`
//! 7. Unique SSA definitions every `SsaValue` is defined at most once
//! 7. Unique SSA definitions, every `SsaValue` is defined at most once
//! across all phi + body instructions
//! 8. `value_defs` coverage every defined `SsaValue.0` is a valid index
//! 8. `value_defs` coverage, every defined `SsaValue.0` is a valid index
//! into `value_defs`, and `value_defs[v.0].block` matches the block
//! containing the defining instruction
//! 9. `cfg_node_map` consistency every `(node, SsaValue)` pair points
//! 9. `cfg_node_map` consistency, every `(node, SsaValue)` pair points
//! to an instruction whose `cfg_node == node`
//!
//! **Group B terminator and reachability (loose, reflecting lowering):**
//! **Group B, terminator and reachability (loose, reflecting lowering):**
//!
//! 10. Terminator/succs agreement *subset* form:
//! * `Goto(t)` → `succs.contains(t)` extras tolerated
//! * `Goto(t)` → `succs.contains(t)`, extras tolerated
//! (3-successor collapse fallback)
//! * `Branch{t, f, …}` → `succs` contains both `t` and `f`
//! * `Return`/`Unreachable` → no constraint on `succs` (CFG may carry
//! finally/cleanup continuation edges that downstream analysis
//! propagates through)
//! 11. Reachability from entry tolerated exceptions:
//! 11. Reachability from entry, tolerated exceptions:
//! * blocks that appear as the `catch` side of an exception edge
//!
//! Group B is deliberately permissive: the SSA body's `succs` field is the
@ -61,8 +61,8 @@ use super::ir::*;
/// Errors returned by targeted invariant checks.
///
/// Wraps a list of human-readable violation messages one per offending
/// block so callers can include every failure in a single panic /
/// Wraps a list of human-readable violation messages, one per offending
/// block, so callers can include every failure in a single panic /
/// warning.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct InvariantError {
@ -106,12 +106,12 @@ pub fn check_structural_invariants(body: &SsaBody) -> Vec<String> {
errors
}
/// Every block carrying an [`SsaOp::CatchParam`] an exception-handler
/// entry must be reachable from either the function entry (via normal
/// Every block carrying an [`SsaOp::CatchParam`], an exception-handler
/// entry, must be reachable from either the function entry (via normal
/// flow) or from at least one entry in [`SsaBody::exception_edges`].
///
/// When this fails, the CFG builder has produced an orphan catch block
/// that should have been wired up as an exception successor but was not
/// that should have been wired up as an exception successor but was not,
/// a real construction bug that otherwise manifests as silent false
/// negatives in resource-cleanup / exception-flow findings.
pub fn check_catch_block_reachability(body: &SsaBody) -> Result<(), InvariantError> {
@ -252,7 +252,7 @@ fn check_pred_succ_symmetry(body: &SsaBody, errors: &mut Vec<String>) {
}
fn check_terminator_succ_agreement(body: &SsaBody, errors: &mut Vec<String>) {
// Group B loose agreement. See module docs for rationale.
// Group B, loose agreement. See module docs for rationale.
for block in &body.blocks {
match &block.terminator {
Terminator::Goto(target) => {
@ -301,7 +301,7 @@ fn check_terminator_succ_agreement(body: &SsaBody, errors: &mut Vec<String>) {
}
}
Terminator::Return(_) | Terminator::Unreachable => {
// Loose by design cleanup/finally continuation edges in
// Loose by design, cleanup/finally continuation edges in
// `succs` are expected. Downstream consumers (taint
// `compute_succ_states`, SCCP `process_terminator`) treat
// `succs` as authoritative and propagate across these edges,
@ -443,7 +443,7 @@ fn check_reachability(body: &SsaBody, errors: &mut Vec<String>) {
// Multi-root BFS: start from the entry *and* from every catch target
// recorded in `exception_edges`. Exception-handler blocks are reached
// via stripped exception edges, so from the SSA body's perspective they
// look like roots as does anything transitively reachable from them
// look like roots, as does anything transitively reachable from them
// (e.g. a `finally` block chained after a `catch`).
let mut visited = vec![false; n];
let mut stack: Vec<BlockId> = Vec::new();
@ -487,7 +487,7 @@ fn check_reachability(body: &SsaBody, errors: &mut Vec<String>) {
/// fingerprint have the same block structure, terminator shape, per-block
/// phi/body instruction counts and op-kind sequences. SsaValue numbers are
/// not part of the fingerprint, so renumbering between runs does not cause
/// spurious diffs only shape changes do.
/// spurious diffs, only shape changes do.
///
/// Phis are emitted in their natural (insertion) order. Lowering now drives
/// phi placement through a `BTreeSet`, so that order is deterministic

View file

@ -24,21 +24,14 @@ pub struct BlockId(pub u32);
pub struct FieldId(pub u32);
impl FieldId {
/// Pointer-Phase 4 sentinel for the abstract "any element of a
/// container" field. Steensgaard-grade precision: every numeric
/// or dynamic index access (`arr[i]`, `arr.shift()`, `map[k]`)
/// projects through the same `Field(pt(container), ELEM)` cell so
/// per-element taint propagation is independent of the SSA value
/// referencing the container.
///
/// `u32::MAX` is reserved by convention; the per-body
/// [`FieldInterner`] never assigns it because interning is
/// monotone-ascending from `0` and bodies don't approach 4 billion
/// fields. Consumers should compare with `==` rather than reach
/// into the wrapped `u32`.
/// Sentinel for the abstract "any element of a container" field.
/// Every numeric or dynamic index access (`arr[i]`, `arr.shift()`,
/// `map[k]`) projects through the same `Field(pt(container), ELEM)`
/// cell. `u32::MAX` is reserved; the per-body interner never
/// assigns it.
pub const ELEM: FieldId = FieldId(u32::MAX);
/// "Tainted at every field" wildcard sentinel distinct from
/// "Tainted at every field" wildcard sentinel, distinct from
/// [`Self::ELEM`] (which is container-element semantics: every
/// numeric/dynamic index access projects through it).
/// `ANY_FIELD` represents the case where a writeback-shaped sink
@ -91,17 +84,14 @@ impl FieldInterner {
/// Read-only lookup: returns the [`FieldId`] for `name` if it has
/// already been interned, or `None` otherwise.
///
/// Used by cross-call resolvers (Pointer-Phase 5 / W3) to avoid
/// growing the caller's interner with field names introduced
/// solely by the callee summary — such IDs would never be referenced
/// by any other instruction in the caller's body, so the cells
/// would be write-only and consume space without contributing
/// to taint flow.
/// Used by cross-call resolvers to avoid growing the caller's
/// interner with field names introduced solely by callee summaries;
/// such cells would be write-only.
pub fn lookup(&self, name: &str) -> Option<FieldId> {
// Walk `names` directly so we don't require the post-deserialise
// `ensure_lookup()` rebuild before this method is callable.
// Callers usually own `&SsaBody` interning was either done at
// lowering time or via `ensure_lookup` post-deserialise so the
// Callers usually own `&SsaBody`, interning was either done at
// lowering time or via `ensure_lookup` post-deserialise, so the
// hot path goes through the `lookup` table; the linear walk is
// a fallback for the (small) deserialised-but-not-rebuilt case.
if let Some(&id) = self.lookup.get(name) {
@ -168,7 +158,7 @@ pub enum SsaOp {
Call {
callee: String,
/// Original textual full path when SSA decomposed a chained receiver.
/// `None` when the callee was not rewritten `callee` already holds
/// `None` when the callee was not rewritten, `callee` already holds
/// the source-level textual form.
///
/// **Debug / display only.** Analysis code must walk the SSA receiver
@ -188,7 +178,7 @@ pub enum SsaOp {
/// Models member-access expressions (`obj.field`) as a first-class SSA
/// op. Lowering walks the receiver tree so chained accesses like
/// `c.writer.header` produce a chain of `FieldProj` ops with explicit
/// per-step receivers eliminating the textual-prefix parsing that
/// per-step receivers, eliminating the textual-prefix parsing that
/// previously misclassified deep receivers (the gin/context.go FP).
///
/// `field` is interned in the owning [`SsaBody`]'s [`FieldInterner`].
@ -223,7 +213,7 @@ pub enum SsaOp {
///
/// Emitted by SSA lowering as a synthesized instruction in the entry
/// block and referenced from phi operands whose incoming edge does
/// not carry a definition of the phi's variable e.g. a try/catch
/// not carry a definition of the phi's variable, e.g. a try/catch
/// rejoin where a variable is only defined on the normal path, or
/// an early-return branch on a later-defined variable.
///
@ -269,7 +259,7 @@ pub enum Terminator {
/// `targets` lists the per-case successor blocks (order matches the
/// source-order of cases in the switch); `default` is the fallback
/// branch taken when no case matches. Block `succs` remain the
/// authoritative flow set the terminator is a structured summary.
/// authoritative flow set, the terminator is a structured summary.
///
/// Emitted only for switch-like dispatch whose semantics are
/// guaranteed-exclusive across cases (e.g. Go `switch`, Java
@ -285,11 +275,11 @@ pub enum Terminator {
///
/// `Some(c)` records the constant value the scrutinee must equal for
/// the corresponding target to be taken. `None` means the literal is
/// unknown emitted for synthetic ≥3-way CFG fanouts or for case
/// unknown, emitted for synthetic ≥3-way CFG fanouts or for case
/// patterns that aren't plain literals (OR-patterns, ranges, guards).
///
/// When omitted/empty (length zero), all targets behave as "unknown
/// literal" preserves backward compatibility with consumers that
/// literal", preserves backward compatibility with consumers that
/// only inspect `targets`/`default`.
#[serde(default)]
case_values: SmallVec<[Option<ConstValue>; 4]>,
@ -342,19 +332,17 @@ pub struct SsaBody {
pub exception_edges: Vec<(BlockId, BlockId)>,
/// Per-body interner for [`SsaOp::FieldProj`] field names.
///
/// Empty until the lowering phase emits FieldProj ops (Phase 2 of the
/// field-projections rollout). Cross-body callers (cross-file
/// summaries, debug serialization) MUST resolve interned ids through
/// this interner before transporting field references to other bodies.
/// Empty until lowering emits FieldProj ops. Cross-body callers
/// (cross-file summaries, debug serialization) MUST resolve interned
/// ids through this interner before transporting field references
/// to other bodies.
#[serde(default)]
pub field_interner: FieldInterner,
/// Pointer-Phase 3 / W1: side-table mapping a synthetic base-update
/// [`SsaOp::Assign`]'s defined value back to the `(receiver, field)`
/// pair it represents. Populated by SSA lowering at the
/// `obj.f = rhs` synthesis point so the taint engine can recognise
/// the synthetic assign as a structural field WRITE — the assigned
/// value is the new "obj" value, the use is the rhs, and the side-
/// table records `(prior_obj_value, FieldId("f"))`.
/// Side-table mapping a synthetic base-update [`SsaOp::Assign`]'s
/// defined value back to the `(receiver, field)` pair it
/// represents. Populated by lowering at the `obj.f = rhs` synthesis
/// point so the taint engine can treat the synthetic assign as a
/// structural field WRITE.
///
/// Empty by default; only synthetic assigns whose enclosing source
/// statement was a dotted-path assignment (`a.b.c = …`) appear here.
@ -505,10 +493,10 @@ mod tests {
assert_eq!(uses, vec![SsaValue(1)]);
}
/// Pointer-Phase 4 / A6 audit: the [`FieldId::ELEM`] sentinel is
/// The [`FieldId::ELEM`] sentinel is
/// reserved for "any element of a container". The interner assigns
/// IDs monotonically from `0`, so the sentinel `u32::MAX` can only
/// collide if the body declares ~4 billion fields a corner case
/// collide if the body declares ~4 billion fields, a corner case
/// no realistic codebase reaches. Pin the contract with a stress
/// loop so future implementation drift can't silently shift IDs to
/// the sentinel value.
@ -526,7 +514,7 @@ mod tests {
// Lookup of the sentinel name (used by W3 to round-trip
// container-element flow through summary) must NOT match a
// real interned name even when the same name is interned.
// The wire-format keeps `<elem>` as a *string marker* it
// The wire-format keeps `<elem>` as a *string marker*, it
// never goes through `intern`. Instead, callers compare
// explicitly against `FieldId::ELEM`.
assert_ne!(interner.intern("<elem>"), FieldId::ELEM);

View file

@ -29,16 +29,16 @@ use super::ir::*;
/// - Construct the `Call` op with `callee = bare_method_name`,
/// `callee_text = Some(original_callee.to_string())`,
/// `receiver = Some(final_receiver_value)`.
/// - Use the returned receiver as the implicit method receiver do NOT
/// - Use the returned receiver as the implicit method receiver, do NOT
/// add the chain root or any intermediate field name to `args`.
///
/// **Decomposition rules** (Phase 2 of the field-projections rollout):
/// **Decomposition rules**:
/// - Skip when the callee contains zero `.` characters (no member access)
/// or only one `.` (single-dot case is handled by the existing
/// `info.call.receiver` channel without needing a `FieldProj` op).
/// - Bail when any "complex" token appears in the callee `(`, `)`,
/// - Bail when any "complex" token appears in the callee, `(`, `)`,
/// `[`, `]`, `::`, `->`, `?`, `<`, `>`, `*`, `&`, `:` (other than `::`
/// already filtered), or whitespace signaling the callee text isn't
/// already filtered), or whitespace, signaling the callee text isn't
/// a clean `<ident>.<ident>...` chain we can safely split on `.`.
/// - The first segment must be a known SSA variable in `var_stacks`;
/// otherwise the chain root is unresolvable and we bail.
@ -221,7 +221,7 @@ fn lower_to_ssa_inner(
// 4b. For per-function scope: identify external variables (used but not defined)
// and inject synthetic Param defs at entry block so rename can find them.
// When formal_params is supplied, reorder so formal params come first in
// declaration order this makes Param indices correspond to call-site positions.
// declaration order, this makes Param indices correspond to call-site positions.
//
let external_vars = if scope.is_some() && !scope_all && !scope_nop {
let raw = identify_external_uses(cfg, &blocks_nodes, &var_defs);
@ -277,7 +277,7 @@ fn lower_to_ssa_inner(
}
// 7b. Debug assertions: verify structural invariants.
// The helper body is `debug_assert!` only, so it's a no-op in release
// The helper body is `debug_assert!` only, so it's a no-op in release;
// call unconditionally to avoid a dead_code warning when the lib is
// built without `--tests`.
debug_assert_bfs_ordering(&block_preds);
@ -451,10 +451,10 @@ fn collect_reachable(
/// Form basic blocks from filtered CFG nodes.
///
/// Returns:
/// - blocks_nodes: Vec<Vec<NodeIndex>> nodes per block (in order)
/// - block_of_node: HashMap<NodeIndex, usize> node → block index
/// - block_succs: Vec<Vec<usize>> successors per block
/// - block_preds: Vec<Vec<usize>> predecessors per block
/// - blocks_nodes: Vec<Vec<NodeIndex>>, nodes per block (in order)
/// - block_of_node: HashMap<NodeIndex, usize>, node → block index
/// - block_succs: Vec<Vec<usize>>, successors per block
/// - block_preds: Vec<Vec<usize>>, predecessors per block
fn form_blocks(
cfg: &Cfg,
entry: NodeIndex,
@ -537,7 +537,7 @@ fn form_blocks(
// Discover leaders in BFS order over `cfg`, but skip edges whose
// source is a terminating (Return / Throw) node. Walking the raw
// `cfg` directly here would re-introduce the bookkeeping
// Return/Throw → fn_exit edges we just stripped fn_exit (or any
// Return/Throw → fn_exit edges we just stripped, fn_exit (or any
// post-return join) would be discovered through them and assigned a
// block ID before its true block-level predecessors, breaking the
// BFS-forward-pred invariant (`debug_assert_bfs_ordering`).
@ -546,7 +546,7 @@ fn form_blocks(
// exception edges entirely (collect_reachable strips them and records
// them separately in `exception_edges`). Catch-block nodes are still
// in `reachable` and must be discoverable as leaders via the
// try-body → catch path only the terminating-source bookkeeping
// try-body → catch path, only the terminating-source bookkeeping
// edges are bogus.
{
let mut bfs_queue: VecDeque<NodeIndex> = VecDeque::new();
@ -572,7 +572,7 @@ fn form_blocks(
// Belt-and-braces: any leader still unvisited gets appended in
// CFG-node-index order so block-ID assignment remains
// deterministic. We do NOT include the synthetic function-exit
// node when it is unreachable through filtered edges that
// node when it is unreachable through filtered edges, that
// happens whenever every path in the body terminates explicitly
// (e.g. a function whose only return is `return buf.toString()`
// at the tail). Including it would emit an orphan SSA block
@ -760,19 +760,19 @@ pub(crate) fn is_receiver_name(name: &str) -> bool {
/// on to emit one [`SsaOp::SelfParam`] (for the leading receiver slot, when
/// present) followed by a contiguous run of [`SsaOp::Param { index }`] values
/// whose indices 0..N correspond exactly to positional call-site argument
/// positions no receiver offset required anywhere downstream.
/// positions, no receiver offset required anywhere downstream.
///
/// W1.b: every formal parameter gets a Param op even when the body never
/// references it directly. Without this, the *first* `obj.f = rhs` on a
/// formal `obj` whose body never reads `obj` produces no W1
/// `field_writes` entry `var_stacks["obj"]` is empty when the synth
/// `field_writes` entry, `var_stacks["obj"]` is empty when the synth
/// Assign runs because no external-use path interned `obj`. Subsequent
/// writes work because the synth Assign itself defines `obj`, so the
/// gap is exactly the FIRST write. Always emitting a formal Param at
/// block 0 closes that gap.
fn reorder_external_vars(external: Vec<String>, formal_params: &[String]) -> Vec<String> {
if formal_params.is_empty() {
return external; // no reordering preserve existing alphabetical sort
return external; // no reordering, preserve existing alphabetical sort
}
let ext_set: HashSet<&str> = external.iter().map(|s| s.as_str()).collect();
let formal_set: HashSet<&str> = formal_params.iter().map(|s| s.as_str()).collect();
@ -789,7 +789,7 @@ fn reorder_external_vars(external: Vec<String>, formal_params: &[String]) -> Vec
}
// Formal positional params next (declaration order), skipping any
// receiver that was already emitted above. W1.b: include EVERY
// formal regardless of whether the body uses it externally an
// formal regardless of whether the body uses it externally, an
// unused formal that gets field-written via `obj.cache = rhs` still
// needs a Param op so the synth Assign loop sees its prior reaching
// def in `var_stacks`.
@ -865,7 +865,7 @@ fn collect_var_defs(
/// Returns a `BTreeSet<String>` per block so downstream consumers that iterate
/// the set (notably `rename_variables`) observe a deterministic, alphabetical
/// order regardless of the underlying hasher state. The Cytron algorithm
/// itself is order-independent only its observers are.
/// itself is order-independent, only its observers are.
fn insert_phis(
var_defs: &BTreeMap<String, HashSet<usize>>,
dom_frontiers: &[HashSet<usize>],
@ -882,7 +882,7 @@ fn insert_phis(
for &f in &dom_frontiers[b] {
if has_phi.insert(f) {
phi_placements[f].insert(var.clone());
// Phi is a new definition add to worklist
// Phi is a new definition, add to worklist
if !def_blocks.contains(&f) {
worklist.push_back(f);
}
@ -945,7 +945,7 @@ fn rename_variables(
// empty otherwise so existing per-statement Call lowering is
// bit-for-bit unchanged.
let mut field_interner = crate::ssa::ir::FieldInterner::new();
// Pointer-Phase 3 / W1: side-table mapping each synthetic base-update
// Side-table mapping each synthetic base-update
// [`SsaOp::Assign`]'s defined value to its `(receiver, field)` pair.
// Populated below at the synthetic-Assign emission site. Read by
// the taint engine to lift the assign into a structural field WRITE.
@ -968,7 +968,7 @@ fn rename_variables(
// `BTreeMap` guarantees a deterministic (alphabetical) iteration order when
// pushing phi values onto `var_stacks` and when filling operands on
// successor phis both sites are observable in SSA numbering if they
// successor phis, both sites are observable in SSA numbering if they
// reordered between runs.
let mut phi_values: Vec<BTreeMap<String, SsaValue>> = vec![BTreeMap::new(); num_blocks];
@ -1118,14 +1118,14 @@ fn rename_variables(
.any(|l| matches!(l, crate::labels::DataLabel::Source(_)))
&& info.call.callee.is_none()
{
// Pure source (e.g. $_GET, env var) no callee, so no args to track.
// Pure source (e.g. $_GET, env var), no callee, so no args to track.
// Source-labeled calls (e.g. file_get_contents) fall through to Call
// so argument taint and sink detection still work.
SsaOp::Source
} else if info.call.callee.is_some() {
let callee = info.call.callee.as_deref().unwrap_or("").to_string();
let (mut args, mut receiver) = build_call_args(info, var_stacks);
// Phase 2: try decomposing chained-receiver method calls
// Try decomposing chained-receiver method calls
// (`a.b.c()`) into a FieldProj chain plus a bare-method Call
// so downstream consumers can read the receiver structure
// without re-parsing the callee text. Bails to None on any
@ -1145,7 +1145,7 @@ fn rename_variables(
Some((recv_v, bare_method)) => {
receiver = Some(recv_v);
// Strip any positional arg group that exactly matches the
// chain root identifier it has been replaced by the
// chain root identifier, it has been replaced by the
// FieldProj chain receiver, and re-listing it as an
// argument would inflate arity / double-taint.
if let Some(base_ident) = callee.split('.').next() {
@ -1175,7 +1175,7 @@ fn rename_variables(
// Reassignment kill: a node that defines a variable but has no
// uses (operands) and is not a source is a constant/literal
// assignment. SSA rename allocates a fresh SsaValue, so
// downstream references see this new (untainted) value the
// downstream references see this new (untainted) value, the
// prior tainted definition is implicitly dead.
SsaOp::Const(info.taint.const_text.clone())
} else if info.taint.defines.is_some() {
@ -1217,12 +1217,12 @@ fn rename_variables(
// `Assign(uses)` so the SSA carries an explicit pass-through
// for the returned/thrown value. Without this, the Return
// node was lowered as a `Nop` and the terminator-setup
// "last non-Nop body inst" search returned None producing
// "last non-Nop body inst" search returned None, producing
// `Terminator::Return(None)` for a function that visibly
// returns an identifier. That broke per-return-path
// PathFact narrowing for non-Rust languages where the
// returned identifier wasn't computed in the same block
// (e.g. Python `def f(s): return s` `s` is a Param in
// (e.g. Python `def f(s): return s`, `s` is a Param in
// block 0, the Return block itself has no body insts).
let uses: SmallVec<[SsaValue; 4]> = info
.taint
@ -1250,8 +1250,8 @@ fn rename_variables(
} else if info.call.callee.is_some() {
let callee = info.call.callee.as_deref().unwrap_or("").to_string();
let (mut args, mut receiver) = build_call_args(info, var_stacks);
// Phase 2: same FieldProj-chain decomposition as the primary
// Call branch above kept in sync because this fallback
// Same FieldProj-chain decomposition as the primary
// Call branch above, kept in sync because this fallback
// path also constructs SSA Call ops (used for control-flow
// wrapper calls that landed past the earlier match arms).
let (final_callee, callee_text) = match try_lower_field_proj_chain(
@ -1342,9 +1342,9 @@ fn rename_variables(
// overwrites properly kill taint: if obj.data is re-assigned to a
// constant, the base `obj` no longer carries that field's taint.
//
// Pointer-Phase 3 / W1: each synthetic Assign also records its
// structural identity into `field_writes` `(receiver_old_value,
// FieldId(field_name))` so the taint engine can recognise the
// Each synthetic Assign also records its
// structural identity into `field_writes`, `(receiver_old_value,
// FieldId(field_name))`, so the taint engine can recognise the
// synthetic assign as a field WRITE and mirror the rhs taint
// into the matching `(loc, field)` cell on `SsaTaintState`.
// The "old" parent value is the reaching def of `parent` BEFORE
@ -1427,9 +1427,9 @@ fn rename_variables(
ssa_blocks[block_idx].terminator = if succs.is_empty() {
// A block with no successors at the block level is one of:
// (1) a block containing a Throw terminates with an
// (1) a block containing a Throw: terminates with an
// exception; no normal fall-through.
// (2) a block containing a Return terminates with a value
// (2) a block containing a Return: terminates with a value
// (or void). After form_blocks strips the bookkeeping
// Seq edge from Return → fn_exit, every explicit-return
// block lands here, including `if cond { return X; }`
@ -1458,7 +1458,7 @@ fn rename_variables(
let return_info = &cfg[rn];
// Return-value resolution. Mirror the legacy
// `has_const_return` path so callers see exactly the same
// SSA shape they did before the merged-return fix only
// SSA shape they did before the merged-return fix, only
// the *terminator* changes (Goto(exit) → Return(_)), not
// the value selection.
//
@ -1468,7 +1468,7 @@ fn rename_variables(
// Emit a synthetic Const inst so taint never leaks
// from an unrelated inst earlier in the same block
// (regression guard: C-1 inline-return precision).
// (b) Computed / passthrough return last non-Nop body
// (b) Computed / passthrough return, last non-Nop body
// inst. Covers `return foo()` (Call sits before the
// Return Nop), `return x + y` (Assign), and the
// implicit tail expression collapsed into a single
@ -1476,9 +1476,9 @@ fn rename_variables(
// Return carries identifier uses (`return req`,
// `return { req.session, ... }`), the SSA defs for
// those identifiers are already on the body as
// Param / Assign / Source insts picking the last
// Param / Assign / Source insts, picking the last
// one matches pre-fix behaviour exactly.
// (c) Void / unresolved `Return(None)`.
// (c) Void / unresolved, `Return(None)`.
if return_info.taint.uses.is_empty() {
let const_text = return_info.taint.const_text.clone();
let const_v = SsaValue(*next_value);
@ -1507,7 +1507,7 @@ fn rename_variables(
Terminator::Return(from_body)
}
} else {
// (3) fn_exit / true fall-off no Return CFG node in this
// (3) fn_exit / true fall-off, no Return CFG node in this
// block. Use the last non-Nop body instruction as the
// implicit return value (e.g. the function's tail-position
// expression in Rust).
@ -1575,7 +1575,7 @@ fn rename_variables(
condition,
}
} else {
// More than 2 successors model as a multi-way Switch.
// More than 2 successors, model as a multi-way Switch.
//
// This replaces the previous `Goto(first)` collapse: the
// structured terminator now enumerates every target instead
@ -1594,7 +1594,7 @@ fn rename_variables(
//
// Scrutinee: use the primary SSA value defined at the last
// node in this block when one exists; fall back to
// `SsaValue(0)` (a valid index SSA numbering is 1-based
// `SsaValue(0)` (a valid index, SSA numbering is 1-based
// only conceptually, and value 0 is always present in a
// non-empty body) when no value is defined. Downstream
// consumers that care about the scrutinee (abstract interp,
@ -1604,7 +1604,7 @@ fn rename_variables(
let targets: SmallVec<[BlockId; 4]> =
succs.iter().skip(1).map(|&s| BlockId(s as u32)).collect();
let default = BlockId(succs[0] as u32);
// Synthetic ≥3-way fanouts have no per-case literal metadata
// Synthetic ≥3-way fanouts have no per-case literal metadata;
// every entry is None (unknown), so the executor falls back to
// first-reachable behavior on this terminator.
let case_values: SmallVec<[Option<crate::constraint::domain::ConstValue>; 4]> =
@ -1815,7 +1815,7 @@ fn debug_assert_bfs_ordering(block_preds: &[Vec<usize>]) {
/// predecessor of the block.
///
/// Runs in release builds because phi-operand mismatches are
/// load-bearing for soundness downstream taint, const, and abstract
/// load-bearing for soundness, downstream taint, const, and abstract
/// analyses iterate phi operands by `(pred_blk, value)` pairs, and
/// either a missing operand (silent "no contribution" on that edge)
/// or a phantom operand (garbage into the join) corrupts analysis
@ -1824,7 +1824,7 @@ fn debug_assert_bfs_ordering(block_preds: &[Vec<usize>]) {
/// The invariant is strict equality. Predecessors that carry no
/// reaching definition for the phi's variable are filled with the
/// [`SsaOp::Undef`] sentinel in `fill_undef_phi_operands`, rather than
/// being dropped so consumers that look up by `(pred_blk, value)`
/// being dropped, so consumers that look up by `(pred_blk, value)`
/// see a real operand for every control-flow edge.
fn assert_phi_operand_counts(ssa_blocks: &[SsaBlock], block_preds: &[Vec<usize>]) {
use std::collections::HashSet;
@ -1887,7 +1887,7 @@ fn assert_phi_operand_counts(ssa_blocks: &[SsaBlock], block_preds: &[Vec<usize>]
/// single shared sentinel instruction ([`SsaOp::Undef`]) synthesized
/// at the end of block 0's body. Consumers iterate phi operands by
/// `(pred_blk, value)` and therefore see a real operand on every
/// control-flow edge no implicit "missing = empty" semantics.
/// control-flow edge, no implicit "missing = empty" semantics.
///
/// The Undef instruction is created lazily (only when at least one phi
/// has a gap) so functions with fully-dominating definitions pay zero
@ -1931,7 +1931,7 @@ fn fill_undef_phi_operands(
block: BlockId(0),
});
// Place the Undef instruction at the end of block 0's body so it
// appears after any synthetic Param / SelfParam emissions its
// appears after any synthetic Param / SelfParam emissions, its
// only role is to anchor the SsaValue; ordering relative to other
// body instructions is cosmetic (no consumer depends on its
// position, only on the value lookup).
@ -2181,7 +2181,7 @@ mod tests {
#[test]
fn bfs_ordering_holds_for_linear_cfg() {
// Entry → A → B → Exit all blocks should satisfy BFS ordering
// Entry → A → B → Exit, all blocks should satisfy BFS ordering
let mut cfg: Cfg = Graph::new();
let entry = cfg.add_node(make_node(StmtKind::Entry));
let a = cfg.add_node(NodeInfo {
@ -2409,7 +2409,7 @@ mod tests {
/// predecessor and a normal control-flow predecessor must lower to a
/// consistent phi. For variables defined before the try (live on
/// *both* edges), the phi at the catch block has exactly two operands
/// — one per predecessor — and the release assertion accepts it.
/// (one per predecessor) and the release assertion accepts it.
#[test]
fn catch_block_join_phi_has_operand_per_live_predecessor() {
// Entry → defines `x` → Try → (Seq) → Join ← (Exception via body) Catch
@ -2456,7 +2456,7 @@ mod tests {
cfg.add_edge(catch, join, EdgeKind::Seq);
cfg.add_edge(join, exit, EdgeKind::Seq);
// Lowering must succeed the assertion is active in release.
// Lowering must succeed, the assertion is active in release.
let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap();
// Locate the block containing a phi for `x`; it must be the join
@ -2498,7 +2498,7 @@ mod tests {
/// Regression guard for the Undef fill pass. When a variable is
/// only defined on one branch of a join (e.g. a catch-only binding
/// rejoining the normal path), the lowering must still emit one
/// phi operand per predecessor the missing edge becoming a
/// phi operand per predecessor, the missing edge becoming a
/// reference to the synthesized `SsaOp::Undef` sentinel rather
/// than being dropped.
#[test]
@ -2633,7 +2633,7 @@ mod tests {
#[should_panic(expected = "SSA phi operand count does not match predecessor count")]
fn phi_assertion_helper_rejects_more_operands_than_preds() {
// A phi with MORE operands than preds references a nonexistent
// predecessor unsound because downstream consumers either
// predecessor, unsound because downstream consumers either
// panic on the lookup or silently feed garbage taint into the
// join. Strict-equality invariant catches this.
let dummy_node = NodeIndex::new(0);
@ -2859,7 +2859,7 @@ mod tests {
/// to a synthetic exit block. Previously, the bookkeeping
/// `Return → fn_exit` `Seq` edge made early-return blocks fall into
/// the single-successor `Goto` arm, and the fall-through tail
/// expression's body got merged into the shared exit block every
/// expression's body got merged into the shared exit block, every
/// early-return path therefore appeared to also execute the tail.
/// Mirrors the `if cond { return X; } Y` shape that motivated the fix.
#[test]
@ -2876,7 +2876,7 @@ mod tests {
});
// True branch: return constant. uses=[] + const_text=Some triggers
// the literal-return path, ensuring the block emits a synthetic
// Const + Return(Some(_)) the same shape `return None` /
// Const + Return(Some(_)), the same shape `return None` /
// `return String::new()` produces in real Rust code.
let early_ret = cfg.add_node(NodeInfo {
taint: TaintMeta {
@ -2901,7 +2901,7 @@ mod tests {
cfg.add_edge(if_node, early_ret, EdgeKind::True);
cfg.add_edge(if_node, tail, EdgeKind::False);
// Bookkeeping wire-up the real CFG construction performs in
// `build_cfg` — Return / Throw → fn_exit via Seq — so the SSA
// `build_cfg`, Return / Throw → fn_exit via Seq, so the SSA
// lowering has to handle it.
cfg.add_edge(early_ret, exit, EdgeKind::Seq);
cfg.add_edge(tail, exit, EdgeKind::Seq);
@ -2909,7 +2909,7 @@ mod tests {
let ssa = lower_to_ssa(&cfg, entry, None, true).unwrap();
// Locate the block containing the early-return CFG node and
// assert it terminates with Return not Goto(_) into the
// assert it terminates with Return, not Goto(_) into the
// shared exit block.
let early_block = ssa
.blocks
@ -2936,7 +2936,7 @@ mod tests {
// The fall-through (tail) block must NOT have the early-return
// block as a predecessor. Pre-fix, both the early-return path
// and the tail path merged into the shared fn_exit block, so the
// tail's body was reachable from the early-return path that's
// tail's body was reachable from the early-return path, that's
// the merged-return defect.
let tail_block = ssa
.blocks
@ -2963,7 +2963,7 @@ mod tests {
/// `if a || b || c { return X; } Y` must have its rejection body emit a
/// `Terminator::Return(_)` and have `succs.is_empty()`. Pre-fix the
/// rejection body's String::new() Call shared a block whose only
/// successor was the merged tail losing the early-return semantics
/// successor was the merged tail, losing the early-return semantics
/// entirely and diluting per-return-path PathFact narrowing.
#[test]
fn or_chain_rejection_block_terminates_with_return() {
@ -3093,7 +3093,7 @@ mod tests {
}
// ─────────────────────────────────────────────────────────────────
// Phase 2: FieldProj chain lowering tests
// FieldProj chain lowering tests
// ─────────────────────────────────────────────────────────────────
//
// These tests pin the contract that `try_lower_field_proj_chain`
@ -3426,7 +3426,7 @@ mod tests {
assert!(blocks[0].body.is_empty());
}
// ── End-to-end Phase 2 tests via real tree-sitter parsing ──────────
// ── End-to-end SSA decomposition tests via real tree-sitter parsing ──────────
//
// These exercise the integration between CFG construction (which sets
// `info.call.callee = "c.mu.Lock"`) and SSA lowering. We assert that
@ -3451,7 +3451,7 @@ mod tests {
};
// Mirror the production lowering path: function bodies use
// lower_to_ssa_with_params so formal parameters get synthetic
// Param/SelfParam injections at block 0 without them, the
// Param/SelfParam injections at block 0, without them, the
// FieldProj chain helper has no SSA root to anchor to.
if body.meta.name.is_some() {
let func_name = body.meta.name.clone().unwrap_or_default();
@ -3506,7 +3506,7 @@ mod tests {
#[test]
fn phase2_e2e_go_chained_receiver_emits_field_proj() {
// Go: `c.writer.header.set(k, v)` 3-segment receiver, 2 FieldProjs.
// Go: `c.writer.header.set(k, v)`, 3-segment receiver, 2 FieldProjs.
// Chain root `c` is a function parameter so it is resolvable.
let src = b"package p\nfunc f(c *T, k string, v string) { c.writer.header.set(k, v) }\n";
let body = parse_to_first_body(
@ -3549,7 +3549,7 @@ mod tests {
#[test]
fn phase2_e2e_python_chained_receiver_emits_field_proj() {
// Python: `obj.client.session.send(p)` 3-segment receiver.
// Python: `obj.client.session.send(p)`, 3-segment receiver.
let src = b"def f(obj, p):\n obj.client.session.send(p)\n";
let body = parse_to_first_body(
src,
@ -3574,7 +3574,7 @@ mod tests {
#[test]
fn phase2_e2e_javascript_chained_receiver_emits_field_proj() {
// JS: `obj.foo.bar.baz()` 3-segment receiver.
// JS: `obj.foo.bar.baz()`, 3-segment receiver.
let src = b"function f(obj) { obj.foo.bar.baz(); }";
let body = parse_to_first_body(
src,
@ -3592,10 +3592,10 @@ mod tests {
#[test]
fn phase2_e2e_java_chained_receiver_emits_field_proj() {
// Java: `obj.config.handler.run()` 3-segment receiver chain through
// Java: `obj.config.handler.run()`, 3-segment receiver chain through
// a parameter `obj`. We avoid `this.…` because `this` is a Java
// keyword (not an identifier_node) so it isn't extracted as an
// external use — outside Phase 2's scope.
// external use, which is outside SSA decomposition's scope.
let src = b"class C { void f(Object obj) { obj.config.handler.run(); } }";
let body = parse_to_first_body(
src,
@ -3620,7 +3620,7 @@ mod tests {
#[test]
fn phase2_e2e_simple_receiver_no_field_proj() {
// REGRESSION: `obj.foo()` — single-dot receiver. Phase 2 must NOT
// REGRESSION: `obj.foo()`, single-dot receiver. SSA lowering must NOT
// decompose this into a FieldProj chain (existing receiver channel
// already covers it). Verify the body has zero FieldProj ops and
// the Call's callee_text stays None.
@ -3664,7 +3664,7 @@ mod tests {
fn phase2_e2e_global_root_chain_still_emits_field_proj() {
// REGRESSION-NEGATIVE: when the chain root is a global identifier
// (`Math.foo.bar()`), the lowerer's external-var synthesis makes
// `Math` available as a synthetic Param the chain still
// `Math` available as a synthetic Param, the chain still
// decomposes, treating `Math` as the SSA receiver. This is the
// semantically correct outcome even for global-rooted chains: the
// FieldProj op precisely captures the field-access structure.
@ -3685,7 +3685,7 @@ mod tests {
#[test]
fn phase2_e2e_rust_method_call_through_field_emits_field_proj() {
// Rust: `c.mu.lock()` `c` is a function parameter, `mu` is a field,
// Rust: `c.mu.lock()`, `c` is a function parameter, `mu` is a field,
// `lock` is the method. Verifies we generate FieldProj for `mu`.
// (Rust paths like `std::env::var` use `::` and are excluded by
// the helper's complex-token check.)
@ -3782,16 +3782,11 @@ mod tests {
);
}
/// Pointer-Phase 3 / W1 end-to-end: lowering an `obj.f = rhs`
/// statement populates `SsaBody.field_writes` with the synthetic
/// base-update Assign's `(receiver, FieldId)` mapping.
///
/// W1.b: a SINGLE-write shape — `function f(obj) { obj.cache = 42 }`
/// — also populates `field_writes` because every formal gets a
/// Param op at block 0 regardless of whether it's read by the
/// body. Pre-W1.b this required two writes (the second's prior
/// reaching def came from the first synth Assign); now the first
/// write already finds the formal's Param in `var_stacks`.
/// End-to-end: lowering an `obj.f = rhs` statement populates
/// `SsaBody.field_writes` with the synthetic base-update Assign's
/// `(receiver, FieldId)` mapping. A single-write shape suffices:
/// every formal gets a Param op at block 0 so the first write
/// finds the formal in `var_stacks`.
#[test]
fn w1_end_to_end_field_write_records_side_table_when_parent_has_prior_def() {
// Single write to `obj.cache`: the formal `obj` provides the
@ -3816,7 +3811,7 @@ mod tests {
}
}
/// W1.b: Python single `obj.cache = 42` on a formal also
/// W1.b: Python, single `obj.cache = 42` on a formal also
/// populates `field_writes` thanks to the formal Param op.
#[test]
fn w1b_single_write_records_field_write_python() {
@ -3835,7 +3830,7 @@ mod tests {
);
}
/// W1.b: Rust single `obj.cache = 42` on a method-style formal
/// W1.b: Rust, single `obj.cache = 42` on a method-style formal
/// (`fn f(obj: &mut O)`) also populates `field_writes`.
#[test]
fn w1b_single_write_records_field_write_rust() {
@ -3880,11 +3875,11 @@ mod tests {
// ─────────────────────────────────────────────────────────────────
/// Loop induction variable: `x = x + 1` inside a loop is the
/// canonical SSA challenge the body uses `x` then redefines it,
/// canonical SSA challenge, the body uses `x` then redefines it,
/// and the join with the entry definition must produce a phi that
/// distinguishes the entry value from the body's redefinition.
/// Phase 5.2 (induction var pruning) depends on this shape being
/// lowered correctly.
/// Induction-var pruning depends on this shape being lowered
/// correctly.
#[test]
fn loop_self_assignment_induction_phi_is_distinct() {
// Entry → x=0 → Loop header → [Body: use x; x = x_new] → Loop
@ -4101,7 +4096,7 @@ mod tests {
/// Variable defined ONLY in one branch of a conditional must be
/// undef on the other path. The phi at the join should include an
/// undef sentinel for the missing arm guards against the
/// undef sentinel for the missing arm, guards against the
/// renamer silently dropping the missing operand.
#[test]
fn conditional_define_only_one_arm_phi_has_undef_operand() {
@ -4137,7 +4132,7 @@ mod tests {
// Find a phi for x and verify it has 2 operands. The "undef"
// operand can manifest as a Nop-defined SsaValue or a sentinel
// both are acceptable; the invariant is that arity == preds.
// (both are acceptable); the invariant is that arity == preds.
let x_phi_ops = ssa
.blocks
.iter()

View file

@ -1,4 +1,4 @@
#[allow(dead_code)] // IR types fields used by Display impl, tests, and downstream analyses
#[allow(dead_code)] // IR types, fields used by Display impl, tests, and downstream analyses
pub mod alias;
pub mod const_prop;
pub mod copy_prop;

View file

@ -6,13 +6,13 @@
//! 1. **Param → Param field writes.** An `obj.field = val` where `obj`
//! traces back to parameter `b` and `val` traces back to parameter `a`
//! emits a `Param(a) → Param(b)` `MayAlias` edge. This captures the
//! `mutating_helper` pattern the callee mutates a shared heap cell
//! `mutating_helper` pattern, the callee mutates a shared heap cell
//! through one parameter and the caller observes the mutation through
//! its argument for that parameter.
//!
//! 2. **Param → Return aliases.** `Terminator::Return(v)` where `v`
//! traces back to a parameter emits a `Param(i) → Return` edge. This
//! captures the `returned_alias` pattern the callee returns its
//! captures the `returned_alias` pattern, the callee returns its
//! argument unchanged and the caller treats the result as aliasing the
//! input.
//!
@ -25,7 +25,7 @@
//!
//! The analysis is **flow-insensitive** and **bounded**: it does not
//! reason about path feasibility, and it stops adding edges once the
//! summary's [`MAX_ALIAS_EDGES`] cap is reached the overflow flag is
//! summary's [`MAX_ALIAS_EDGES`] cap is reached, the overflow flag is
//! the conservative fallback that callers honour.
use std::collections::{HashMap, HashSet};
@ -39,7 +39,7 @@ use super::ir::{SsaBody, SsaOp, SsaValue, Terminator};
/// Map an SSA value back to its defining instruction's op.
///
/// Local to this module the taint engine has its own `build_inst_map`
/// Local to this module, the taint engine has its own `build_inst_map`
/// that also carries receiver info we do not need, and duplicating it
/// keeps this analysis independent of that private helper's shape.
fn build_op_map(ssa: &SsaBody) -> HashMap<SsaValue, SsaOp> {
@ -73,7 +73,7 @@ struct ParamHit {
/// The `SsaOp::Param` index as lowered.
ssa_index: usize,
/// The parameter's variable name (from [`SsaInst::var_name`]). Used
/// to map back to the formal-declaration position the caller's
/// to map back to the formal-declaration position, the caller's
/// `args[i]` slot is keyed by declaration position, not by SSA
/// index, and the two can disagree when a formal parameter is
/// skipped from SSA lowering (e.g., pure-output params).
@ -83,7 +83,7 @@ struct ParamHit {
/// Walk Assign/Phi chains to find a backing `Param { index }` SSA op.
///
/// Returns the `SsaOp::Param`'s index *and* its var_name so callers can
/// resolve the formal-positional index via the name lookup table the
/// resolve the formal-positional index via the name lookup table, the
/// two indices can disagree when SSA lowering skips a formal parameter
/// (never used as a read), shifting subsequent param indices down.
fn trace_to_param_hit(
@ -144,7 +144,7 @@ fn param_hit_to_formal_index(hit: &ParamHit, params_by_name: &HashMap<String, us
/// * `"obj.list[2].name"` → `"obj"`
///
/// Used to decide whether a field-style Assign's LHS base names a
/// parameter variable we strip everything after the first separator
/// parameter variable, we strip everything after the first separator
/// and compare the remainder to the recorded param names.
fn base_of_path(name: &str) -> &str {
let dot = name.find('.');
@ -170,7 +170,7 @@ fn is_receiver_name_local(name: &str) -> bool {
/// Returns `true` the first time a qualifying allocation is found.
/// Parameter-terminated paths, `Call` ops that are not container
/// constructors, and constants that are not container literals all
/// return `false` soundly under-approximating, since the caller will
/// return `false`, soundly under-approximating, since the caller will
/// simply fall back to the existing `Param(i) → Return` / store-into-
/// heap channels when the flag is absent.
fn trace_to_fresh_alloc(
@ -225,7 +225,7 @@ fn returns_fresh_allocation(
///
/// `param_info` carries one `(param_index, param_name, param_ssa_value)`
/// tuple per formal parameter that was emitted as [`SsaOp::Param`] in the
/// lowered body. The receiver is intentionally excluded this table
/// lowered body. The receiver is intentionally excluded, this table
/// captures positional parameters only.
///
/// `formal_param_names`, when supplied, is the authoritative list of
@ -261,7 +261,7 @@ pub fn analyse_param_points_to(
// container constructor for `lang` (`ArrayList`, `dict`, …).
//
// When at least one return path matches, the callee produces a
// caller-visible fresh heap identity on that path callers
// caller-visible fresh heap identity on that path, callers
// synthesise a `HeapObjectId` keyed on the call result so later
// container operations have a stable heap cell. Traces that reach a
// parameter are handled by the edge-based `Param(i) → Return` channel
@ -278,7 +278,7 @@ pub fn analyse_param_points_to(
return summary;
}
// Build the name→positional-index map. Summary param indices are
// *positional* they match the call-site `args[i]` position, which
// *positional*, they match the call-site `args[i]` position, which
// excludes the receiver (`self`/`this`). When `formal_param_names`
// contains a leading receiver, skip it so the remaining names align
// with the SSA `SsaOp::Param { index }` convention.
@ -344,7 +344,7 @@ pub fn analyse_param_points_to(
continue;
}
if src_idx == target_idx {
// Self-alias is uninformative the caller's
// Self-alias is uninformative, the caller's
// arg-to-itself propagation is already covered by
// `param_to_return`/`param_to_sink`.
continue;
@ -532,7 +532,7 @@ mod tests {
(5usize, "capture".to_string(), SsaValue(0)),
(1usize, "b".to_string(), SsaValue(1)),
];
// formal_param_count = 2 index 5 is out of range.
// formal_param_count = 2, index 5 is out of range.
let s = analyse_param_points_to(&body, &pinfo, 2, None, None);
assert!(
s.is_empty(),
@ -570,7 +570,7 @@ mod tests {
.map(|i| (i, format!("p{i}"), SsaValue(i as u32)))
.collect();
// Only the first traced param is emitted (trace_to_param short-
// circuits on first match), so overflow is not expected we
// circuits on first match), so overflow is not expected, we
// instead verify the bounded behaviour: a single edge.
let s = analyse_param_points_to(&body, &pinfo, n as usize, None, None);
assert!(!s.overflow);

View file

@ -14,7 +14,7 @@ use smallvec::SmallVec;
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ContainerOp {
/// Taint flows from the listed argument positions into the receiver
/// container (e.g. `arr.push(val)` val taint merges into arr).
/// container (e.g. `arr.push(val)`, val taint merges into arr).
///
/// `index_arg`: when `Some(pos)`, the argument at that logical position
/// is the container index/key. If constant-propagation proves it a
@ -27,11 +27,11 @@ pub enum ContainerOp {
/// Taint flows from the receiver container to the call's return value
/// (e.g. `arr.pop()`, `items.join('')`).
///
/// `index_arg`: same semantics as `Store::index_arg` when present and
/// `index_arg`: same semantics as `Store::index_arg`, when present and
/// provably constant, loads from `HeapSlot::Index(n)`.
Load { index_arg: Option<usize> },
/// Taint flows from the receiver container into the argument at
/// `dest_arg` i.e. the "writeback" pattern where a method writes its
/// `dest_arg`, i.e. the "writeback" pattern where a method writes its
/// decoded/loaded value into a caller-supplied destination rather than
/// returning it. Used for the Go `*.Decode(&dest)` family
/// (`json.Decoder.Decode`, `xml.Decoder.Decode`, `gob.Decoder.Decode`),
@ -121,16 +121,16 @@ fn classify_js(method: &str) -> Option<ContainerOp> {
match method {
// Array store
"push" | "unshift" => store(0),
// Map/Set store: map.set(key, value) key at 0, value at 1
// Map/Set store: map.set(key, value), key at 0, value at 1
"set" => store_indexed(1, 0),
"add" => store(0), // set.add(value)
// Array/Map load
"pop" | "shift" => load(),
"join" | "flat" | "concat" | "slice" | "toString" => load(),
// map.get(key) key at 0
// map.get(key), key at 0
"get" => load_indexed(0),
"values" | "keys" | "entries" => load(),
// Pointer-Phase 6 / W5: synthetic callees emitted by CFG
// Synthetic callees emitted by CFG
// lowering for subscript reads/writes (`arr[i]`, `arr[i] = v`).
"__index_get__" => load_indexed(0),
"__index_set__" => store_indexed(1, 0),
@ -142,7 +142,7 @@ fn classify_python(method: &str) -> Option<ContainerOp> {
match method {
// List store
"append" | "extend" => store(0),
"insert" => store_indexed(1, 0), // list.insert(index, value) index at 0, value at 1
"insert" => store_indexed(1, 0), // list.insert(index, value), index at 0, value at 1
// Set store
"add" => store(0),
// Dict store
@ -150,10 +150,10 @@ fn classify_python(method: &str) -> Option<ContainerOp> {
"setdefault" => store2(0, 1), // dict.setdefault(key, default)
// List/Dict load
"pop" => load(),
"get" => load_indexed(0), // dict.get(key) / list index key/index at 0
"get" => load_indexed(0), // dict.get(key) / list index, key/index at 0
"items" | "values" | "keys" => load(),
"join" => load(),
// Pointer-Phase 6 / W5: synthetic callees emitted by CFG
// Synthetic callees emitted by CFG
// lowering for subscript reads/writes (`arr[i]`, `arr[i] = v`).
"__index_get__" => load_indexed(0),
"__index_set__" => store_indexed(1, 0),
@ -165,11 +165,11 @@ fn classify_java(method: &str) -> Option<ContainerOp> {
match method {
// Collection store
"add" | "addAll" | "putAll" | "offer" | "push" => store(0),
// ArrayList.set(index, value) index at 0, value at 1
// ArrayList.set(index, value), index at 0, value at 1
"set" => store_indexed(1, 0),
// Map.put(key, value) key at 0, value at 1
// Map.put(key, value), key at 0, value at 1
"put" => store_indexed(1, 0),
// Collection load: ArrayList.get(index) index at 0
// Collection load: ArrayList.get(index), index at 0
"get" => load_indexed(0),
"poll" | "peek" | "remove" | "pop" => load(),
"stream" | "toArray" | "iterator" => load(),
@ -203,7 +203,7 @@ fn classify_go(method: &str, callee: &str) -> Option<ContainerOp> {
// method-call form has the bytes carried via the receiver, not arg 0,
// so it lines up with the writeback contract just like `Decode`.
"Decode" | "Unmarshal" => Some(ContainerOp::Writeback { dest_arg: 0 }),
// Pointer-Phase 6 / W5: synthetic callees emitted by CFG
// Synthetic callees emitted by CFG
// lowering for Go index_expression reads/writes (`arr[i]`,
// `m[k] = v`).
"__index_get__" => load_indexed(0),
@ -222,7 +222,7 @@ fn classify_ruby(method: &str) -> Option<ContainerOp> {
fn classify_php(method: &str) -> Option<ContainerOp> {
match method {
"array_push" => store(1), // array_push(&$arr, $val) arr is arg 0, val is arg 1
"array_push" => store(1), // array_push(&$arr, $val), arr is arg 0, val is arg 1
"array_pop" | "array_shift" | "current" | "next" | "reset" => load(),
_ => None,
}
@ -232,11 +232,11 @@ fn classify_cpp(method: &str) -> Option<ContainerOp> {
match method {
// Mutating container operations.
// `assign` overwrites the container's contents with the argument
// sequence modeled as Store so the receiver inherits the argument
// sequence, modeled as Store so the receiver inherits the argument
// taint, matching the runtime "the values now live inside this
// container" semantics shared with `push_back`/`emplace_back`.
"push_back" | "emplace_back" | "insert" | "emplace" | "push" | "assign" => store(0),
// Map/unordered_map insertion: `m.insert_or_assign(k, v)` value at 1.
// Map/unordered_map insertion: `m.insert_or_assign(k, v)`, value at 1.
"insert_or_assign" => store_indexed(1, 0),
// Read-only container observers. `find`/`count` return iterators or
// counts that carry the container's value taint when queried with a
@ -255,7 +255,7 @@ fn classify_rust(method: &str) -> Option<ContainerOp> {
match method {
"push" | "insert" | "extend" => store(0),
"pop" | "first" | "last" | "iter" | "remove" => load(),
// vec.get(index) index at 0
// vec.get(index), index at 0
"get" => load_indexed(0),
_ => None,
}
@ -304,7 +304,7 @@ mod tests {
}
// CVE Hunt Session 2 (Owncast CVE-2023-3188 / CVE-2024-31450 family):
// Go `*.Decode(&dest)` is the canonical streaming-decoder writeback
// Go `*.Decode(&dest)` is the canonical streaming-decoder writeback:
// `json.NewDecoder(r.Body).Decode(&dest)`, `xml.NewDecoder(r).Decode(&out)`,
// `gob.NewDecoder(buf).Decode(&v)`. The decoder receiver carries the
// source taint and the destination is arg 0; the writeback rule is the
@ -394,7 +394,7 @@ mod tests {
}
}
// ── C++ Phase 1 additions ──────────────────────────────────────
// ── C++ extras ──────────────────────────────────────
#[test]
fn cpp_push_back_is_store() {
@ -413,7 +413,7 @@ mod tests {
#[test]
fn cpp_assign_is_store() {
// vector::assign(args) overwrites the container's contents the
// vector::assign(args) overwrites the container's contents, the
// receiver inherits argument taint just like push_back.
let op = classify_container_op("v.assign", Lang::Cpp);
assert!(matches!(op, Some(ContainerOp::Store { .. })));
@ -421,7 +421,7 @@ mod tests {
#[test]
fn cpp_insert_or_assign_indexes_value() {
// map::insert_or_assign(key, value) value is at arg 1, key at arg 0.
// map::insert_or_assign(key, value), value is at arg 1, key at arg 0.
match classify_container_op("m.insert_or_assign", Lang::Cpp) {
Some(ContainerOp::Store {
value_args,
@ -456,7 +456,7 @@ mod tests {
}
/// W5: synthetic `__index_get__` is recognised as an indexed load
/// in JS/TS, Python, and Go driving the index_arg=0 path so a
/// in JS/TS, Python, and Go, driving the index_arg=0 path so a
/// constant-key subscript read flows through `HeapSlot::Index(n)`.
#[test]
fn synth_index_get_classified_as_indexed_load_js_py_go() {
@ -471,7 +471,7 @@ mod tests {
}
/// W5: synthetic `__index_set__` is recognised as an indexed store
/// in JS/TS, Python, and Go value at arg 1, index at arg 0.
/// in JS/TS, Python, and Go, value at arg 1, index at arg 0.
#[test]
fn synth_index_set_classified_as_indexed_store_js_py_go() {
for lang in [Lang::JavaScript, Lang::TypeScript, Lang::Python, Lang::Go] {

View file

@ -12,7 +12,7 @@
//! where every insert's *value* slot is a syntactic string literal and the
//! final lookup is dereffed via a literal fallback (`.unwrap_or(LIT)`). The
//! result `cmd` is then provably bounded to the finite set
//! `{V1, V2, …, "safe"}`, regardless of what `k` carries taint-flavour or
//! `{V1, V2, …, "safe"}`, regardless of what `k` carries, taint-flavour or
//! otherwise. Downstream sink suppression consumes this finite set to
//! clear SHELL/FILE/SQL injection findings whose payload is proved to be
//! metacharacter-free.
@ -24,7 +24,7 @@
//! (e.g. `"table.get(key).copied().unwrap_or"` for `table.get(key).copied()
//! .unwrap_or("safe")`) and whose `receiver` is the root identifier's SSA
//! value. We therefore do not need to walk SSA `.copied()` / `.unwrap_or`
//! instructions as separate hops pattern-matching on the callee text is
//! instructions as separate hops, pattern-matching on the callee text is
//! the source of truth. String-literal arguments that the callee text
//! elides (e.g. the fallback `"safe"`) are read from the CFG node's
//! `arg_string_literals`, populated during CFG construction.
@ -33,7 +33,7 @@
//! literal-valued inserts, no escape beyond recognised mutate/read methods.
//! Any deviation (dynamic insert, callee not in the allow-list, map used as
//! a plain argument, map returned, map joined across a phi) invalidates the
//! candidate. Missed detection is safe it just falls through to existing
//! candidate. Missed detection is safe, it just falls through to existing
//! behaviour.
use std::collections::{HashMap, HashSet};
@ -73,15 +73,15 @@ fn is_rust_map_constructor(callee: &str) -> bool {
/// Classification of a Call whose receiver is a candidate map.
#[derive(Clone, Debug, PartialEq, Eq)]
enum MapUse {
/// `{var}.insert(K, V)` value contributes to the finite domain.
/// `{var}.insert(K, V)`, value contributes to the finite domain.
Insert,
/// `{var}.get(K)[.copied()|.cloned()|.as_deref()|.as_ref()]*.unwrap_or`
/// lookup result is bounded by the inserted values plus the fallback
/// lookup; the result is bounded by the inserted values plus the fallback
/// literal on the CFG node.
StaticLookup,
/// Whitelisted read-only method (no reference leak).
ReadOnly,
/// Anything else invalidates the map candidate.
/// Anything else: invalidates the map candidate.
Escape,
}
@ -138,7 +138,7 @@ fn scan_past_balanced_parens(s: &str) -> Option<&str> {
/// Return `true` when `s` is a sequence of zero or more identity chain
/// methods (`.copied()`, `.cloned()`, `.as_deref()`, `.as_ref()`) followed
/// by `.unwrap_or` (and nothing else). The trailing arg list of
/// `.unwrap_or` is elided in the callee text it appears in the CFG node's
/// `.unwrap_or` is elided in the callee text, it appears in the CFG node's
/// `arg_string_literals` instead.
fn is_identity_chain_ending_in_unwrap_or(mut s: &str) -> bool {
const IDENTS: &[&str] = &[".copied()", ".cloned()", ".as_deref()", ".as_ref()"];
@ -171,7 +171,7 @@ fn resolve_alias(v: SsaValue, aliases: &HashMap<SsaValue, SsaValue>) -> SsaValue
cur
}
/// Run the analysis. Bails out immediately for non-Rust bodies the current
/// Run the analysis. Bails out immediately for non-Rust bodies, the current
/// pattern set only models Rust `std::collections::HashMap`.
pub fn analyze(
body: &SsaBody,
@ -382,7 +382,7 @@ mod tests {
#[test]
fn classify_static_lookup_without_identity_chain() {
// `.unwrap_or` directly after `.get(...)` also qualifies Rust
// `.unwrap_or` directly after `.get(...)` also qualifies, Rust
// `HashMap::get` returns `Option<&V>`, so `.unwrap_or(&"safe")` is
// syntactically valid and equally bounded.
assert_eq!(
@ -401,7 +401,7 @@ mod tests {
#[test]
fn classify_rejects_unknown_terminator() {
// `.unwrap_or_else(|| …)` is not modelled closure can return anything.
// `.unwrap_or_else(|| …)` is not modelled, closure can return anything.
assert_eq!(
classify_map_use("t.get(k).copied().unwrap_or_else", "t"),
MapUse::Escape
@ -414,7 +414,7 @@ mod tests {
#[test]
fn classify_rejects_other_receiver() {
// `other.insert` does not belong to `table` receiver mismatch.
// `other.insert` does not belong to `table`, receiver mismatch.
assert_eq!(classify_map_use("other.insert", "table"), MapUse::Escape);
}

View file

@ -25,23 +25,21 @@ pub enum TypeKind {
FileHandle,
Url,
HttpClient,
/// A local, in-memory collection (HashMap, HashSet, Vec,
/// BTreeMap, …). Consumed by the auth analysis sink gate so method
/// calls on variables of this type (`map.insert(...)`) are treated
/// as in-memory bookkeeping rather than cross-tenant sinks. Has no
/// `label_prefix` — it never participates in label-based callee
/// A local, in-memory collection (HashMap, HashSet, Vec, etc.).
/// The auth sink gate uses this so calls like `map.insert(...)`
/// are treated as bookkeeping rather than cross-tenant sinks. No
/// `label_prefix`, never participates in label-based callee
/// resolution.
LocalCollection,
/// Phase 6: a framework-injected DTO body whose field types are
/// known. Populated only when a parameter is recognised as a typed
/// extractor by a Phase 1-2 matcher AND the DTO class / struct /
/// Pydantic model is resolvable in the current scan scope.
/// Strictly additive — when no DTO definition is found, callers
/// fall through to today's pre-Phase-6 behaviour.
/// A framework-injected DTO body whose field types are known.
/// Populated when a parameter is recognised as a typed extractor and
/// the DTO class / struct / Pydantic model is resolvable in scope.
/// Strictly additive, without a DTO definition, callers fall back
/// to name-only resolution.
Dto(DtoFields),
}
/// Phase 6: structural carrier for a recognised DTO type. Maps
/// Structural carrier for a recognised DTO type. Maps
/// declared field names to their inferred [`TypeKind`]. Nested DTOs
/// use [`TypeKind::Dto`] recursively.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
@ -82,19 +80,11 @@ impl TypeKind {
}
}
/// Container name used by the typed call-graph devirtualisation
/// (`docs/typed-call-graph-prompt.md`, Phase 2).
///
/// Returns the class / impl / module string under which an SSA
/// receiver value of this type would be looked up in
/// [`crate::callgraph::ClassMethodIndex`]. Mirrors
/// [`Self::label_prefix`] for the security-relevant abstract
/// types (HttpClient → `"HttpClient"`, DatabaseConnection →
/// `"DatabaseConnection"`, etc.) and additionally returns the DTO
/// class name for [`TypeKind::Dto`] receivers.
///
/// Scalar / unknown types return `None` — they have no defining
/// container and would not narrow a method-call edge meaningfully.
/// Container name used by typed call-graph devirtualisation:
/// the class / impl / module under which a receiver of this type
/// would be looked up. Returns the DTO class name for `Dto`
/// receivers, label prefixes for known abstract types, `None` for
/// scalars.
pub fn container_name(&self) -> Option<String> {
if let Some(prefix) = self.label_prefix() {
return Some(prefix.to_string());
@ -105,7 +95,7 @@ impl TypeKind {
None
}
/// Phase 6: convenience accessor for the inner `DtoFields` if this
/// Convenience accessor for the inner `DtoFields` if this
/// type is a recognised DTO.
pub fn as_dto(&self) -> Option<&DtoFields> {
match self {
@ -146,7 +136,7 @@ impl TypeFact {
TypeFact { kind, nullable }
}
/// Phase 6: factory used by the field-access propagation rule.
/// Factory used by the field-access propagation rule.
pub(crate) fn from_dto_field(receiver: &TypeKind, field: &str) -> Option<Self> {
let dto = receiver.as_dto()?;
let kind = dto.get(field)?.clone();
@ -190,10 +180,10 @@ impl TypeFactResult {
///
/// Suppression policy:
/// * [`TypeKind::Int`] (and float, treated as numeric): suppresses
/// `SQL_QUERY`, `FILE_IO`, `SHELL_ESCAPE`, `HTML_ESCAPE`, `SSRF`
/// `SQL_QUERY`, `FILE_IO`, `SHELL_ESCAPE`, `HTML_ESCAPE`, `SSRF`;
/// numeric values cannot carry the metacharacters required to drive
/// any of these injection classes.
/// * [`TypeKind::Bool`]: suppresses every type-suppressible bit
/// * [`TypeKind::Bool`]: suppresses every type-suppressible bit;
/// `true`/`false` cannot carry a payload of any kind.
pub fn is_type_safe_for_sink(
values: &[SsaValue],
@ -245,6 +235,18 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
Lang::JavaScript | Lang::TypeScript => match suffix {
"URL" => Some(TypeKind::Url),
"Request" | "XMLHttpRequest" => Some(TypeKind::HttpClient),
// JS built-in collection constructors. `new Map()` / `new Set()`
// / `new WeakMap()` / `new WeakSet()` / `new Array()` produce
// in-memory collections; downstream `m.get(k)` / `m.set(k, v)`
// / `s.add(x)` / `s.has(x)` / `arr.find(p)` are container ops,
// not data-layer reads. Without this mapping the bare verb
// dispatch in `auth_analysis::config::classify_sink_class`
// matches the `get` / `find` / `add` read/mutation indicators
// and over-fires `js.auth.missing_ownership_check` on every
// Map lookup in pure data-manipulation code (excalidraw's
// `elementsMap.get(id)`, `origIdToDuplicateId.get(...)`,
// `groupIdMapForOperation.set(...)` shapes).
"Map" | "Set" | "WeakMap" | "WeakSet" | "Array" => Some(TypeKind::LocalCollection),
_ => None,
},
Lang::Python => {
@ -334,10 +336,9 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
Some(TypeKind::DatabaseConnection)
} else if is_rust_local_collection_constructor(base) {
// Rust std/indexmap/smallvec/dashmap collection
// constructors map to a generic "local collection" type so
// the auth analysis sink gate can recognise
// `let x = factory_fn(); x.insert(..)` even when the RHS
// isn't a syntactic constructor call.
// constructors map to a generic "local collection" type
// so the auth sink gate recognises
// `let x = factory_fn(); x.insert(..)`.
Some(TypeKind::LocalCollection)
} else {
None
@ -421,6 +422,15 @@ fn is_rust_local_collection_constructor(base: &str) -> bool {
"FxHashSet",
"DashMap",
"DashSet",
// `roaring` crate, RoaringBitmap / RoaringTreemap are
// in-memory bitset / bitmap containers (set-of-u32 /
// set-of-u64). Used heavily by indexing systems
// (meilisearch's index-scheduler) for `task_ids`,
// `docids`, and similar local-collection bookkeeping.
// Mutations (`insert` / `remove` / `clear`) are container
// ops, not data-layer writes.
"RoaringBitmap",
"RoaringTreemap",
];
const VERBS: &[&str] = &[
"new",
@ -460,11 +470,73 @@ pub fn is_int_producing_callee(callee: &str) -> bool {
| "Atoi" | "ParseInt" | "ParseFloat" // Go
| "intval" | "floatval" // PHP
| "to_i" | "to_f" // Ruby
| "parse" // Rust: `.parse::<N>()` / `.parse().unwrap()` conservative
| "parse" // Rust: `.parse::<N>()` / `.parse().unwrap()`, conservative
// (most Rust .parse() calls target numeric types)
)
}
/// Polarity hint for a generic input-validator callee.
///
/// Most validation idioms route attacker-controlled input through a
/// helper whose result the caller branches on:
///
/// ```text
/// const err = validateUrlSsrf(child.webhookUrl); // ErrorReturning
/// if (err) throw new Error(err); // false branch → success
///
/// if (isValid(input)) { use(input); } // BooleanTrueIsValid
/// // true branch → success
/// ```
///
/// Without modeling this pattern, a one-statement rewrite of a
/// `validate(x); if(x) ...` guard hides the semantic equivalence to
/// `if (validate(x)) ...` (already classified as ValidationCall). The
/// classifier discriminates only on the textual head of the bare call.
/// It is strictly additive: callees that don't match any pattern return
/// `None` and the engine falls through to its existing behaviour.
///
/// Motivated by Novu CVE GHSA-4x48-cgf9-q33f
/// (`const ssrfError = await validateUrlSsrf(child.webhookUrl); if (ssrfError) throw`).
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum InputValidatorPolarity {
/// Returns a boolean; truthy means "valid".
BooleanTrueIsValid,
/// Returns null/undefined on success, error/message on failure;
/// truthy means "rejected".
ErrorReturning,
}
/// Classify `callee` as a generic input validator from its name alone.
///
/// The callee text is first passed through `peel_identity_suffix`; the
/// segment after the last `.` or `:` is then lower-cased and matched by
/// prefix. Returns `None` when no known prefix matches.
pub fn classify_input_validator_callee(callee: &str) -> Option<InputValidatorPolarity> {
    // Boolean returners: the name typically reads as a predicate
    // (`isValid…`, `is_valid_…`, `is_safe…`, `has_valid…`). A truthy
    // result means the input is valid, so the TRUE branch carries the
    // validation.
    const PREDICATE_PREFIXES: &[&str] = &[
        "isvalid",
        "is_valid",
        "issafe",
        "is_safe",
        "hasvalid",
        "has_valid",
    ];
    // Error-returning validators: the name reads as a verb whose return
    // value carries the error description. `validateXxx` and `verifyXxx`
    // are the dominant idioms. `check…` is deliberately NOT matched
    // because a name like `checkPermissions` overlaps with auth checks
    // (different semantic) and the suppression payoff isn't worth the
    // precision risk.
    const ERROR_PREFIXES: &[&str] = &["validate", "verify"];

    /// True when `name` begins with at least one entry of `prefixes`.
    fn starts_with_any(name: &str, prefixes: &[&str]) -> bool {
        prefixes.iter().any(|prefix| name.starts_with(*prefix))
    }

    let peeled = peel_identity_suffix(callee);
    let last_segment = peeled.rsplit(['.', ':']).next().unwrap_or(&peeled);
    let name = last_segment.to_ascii_lowercase();

    if starts_with_any(&name, PREDICATE_PREFIXES) {
        Some(InputValidatorPolarity::BooleanTrueIsValid)
    } else if starts_with_any(&name, ERROR_PREFIXES) {
        Some(InputValidatorPolarity::ErrorReturning)
    } else {
        None
    }
}
/// Analyze types for all SSA values.
///
/// Uses constant propagation results to seed types from known constants,
@ -571,7 +643,7 @@ pub fn analyze_types_with_param_types(
| BinOp::Gt
| BinOp::GtEq,
) => TypeFact::from_kind(TypeKind::Int),
// Add could be string concatenation defer to operand types
// Add could be string concatenation, defer to operand types
_ => TypeFact::unknown(),
}
}
@ -587,7 +659,7 @@ pub fn analyze_types_with_param_types(
Some(tk) => TypeFact::from_kind(tk.clone()),
None => TypeFact::unknown(),
},
// Undef contributes no type information phi joins
// Undef contributes no type information, phi joins
// pick up the type from the other (defined) operand.
SsaOp::Undef => TypeFact::unknown(),
};
@ -603,7 +675,7 @@ pub fn analyze_types_with_param_types(
for block in &body.blocks {
// Identity-preserving method calls: pass through receiver's type.
// E.g. `Connection::open(p).unwrap()` the `.unwrap()` call's type
// E.g. `Connection::open(p).unwrap()`, the `.unwrap()` call's type
// fact should mirror the receiver (Result<Connection>). Only applies
// when the current fact is still Unknown so explicit constructor
// mappings win.
@ -618,7 +690,7 @@ pub fn analyze_types_with_param_types(
continue;
}
// A numeric-length accessor pinned by the first pass is
// load-bearing for sink suppression do not let identity-
// load-bearing for sink suppression, do not let identity-
// method receiver propagation overwrite the Int fact.
if cfg
.node_weight(inst.cfg_node)
@ -644,7 +716,7 @@ pub fn analyze_types_with_param_types(
}
}
// Phase 6.3: FieldProj receiver-driven type narrowing. When
// FieldProj receiver-driven type narrowing. When
// SSA lowering decomposed `a.b.c()` into a FieldProj chain,
// intermediate FieldProj insts default to `projected_type =
// None`. If the receiver value carries a Dto fact and the
@ -701,7 +773,7 @@ pub fn analyze_types_with_param_types(
// Copy assignments and binary arithmetic
for inst in &block.body {
// Preserve the Int fact pinned by the numeric-length-access
// detector in the first pass copy propagation would replace
// detector in the first pass, copy propagation would replace
// it with the receiver's (usually Unknown) type and defeat the
// whole point of the accessor rule.
if cfg
@ -712,11 +784,11 @@ pub fn analyze_types_with_param_types(
}
if let SsaOp::Assign(uses) = &inst.op {
if uses.len() == 1 {
// Phase 6.3: when the RHS is a single member-access
// When the RHS is a single member-access
// expression and the receiver value carries a
// `TypeKind::Dto(fields)` fact, route the assignment's
// type to the field's declared `TypeKind`. Strictly
// additive falls through to copy-prop when the
// additive, falls through to copy-prop when the
// receiver isn't a DTO or the field isn't recorded.
let dto_field_fact = cfg
.node_weight(inst.cfg_node)
@ -777,7 +849,7 @@ pub fn analyze_types_with_param_types(
/// Used for `instanceof` resolution and type-qualified method dispatch.
pub struct TypeHierarchy;
/// (subtype, &[supertypes]) sink-relevant framework types only.
/// (subtype, &[supertypes]), sink-relevant framework types only.
static JAVA_HIERARCHY: &[(&str, &[&str])] = &[
("HttpServletResponse", &["ServletResponse"]),
("HttpServletRequest", &["ServletRequest"]),
@ -853,7 +925,7 @@ impl TypeHierarchy {
///
/// Conservative: unknown interfaces → `true` (could satisfy).
/// Only [`definitely_not`](GoInterfaceTable::definitely_not) is used for
/// suppression it returns `true` only when the type provably cannot
/// suppression, it returns `true` only when the type provably cannot
/// implement the interface.
pub struct GoInterfaceTable;
@ -1147,8 +1219,8 @@ mod tests {
assert_eq!(result.get_type(SsaValue(99)), None);
}
/// Phase 4: Int-typed values must suppress every type-suppressible
/// cap including the freshly-added `SSRF` bit. Numeric IDs
/// Int-typed values must suppress every type-suppressible
/// cap, including the freshly-added `SSRF` bit. Numeric IDs
/// cannot rewrite a URL host, cannot form path traversal sequences,
/// cannot carry SQL/HTML/shell metacharacters.
#[test]
@ -1183,7 +1255,7 @@ mod tests {
));
}
/// Phase 4: Bool-typed values are even safer than ints — `true` /
/// Bool-typed values are even safer than ints, `true` /
/// `false` cannot carry any payload and must suppress every
/// type-suppressible cap.
#[test]
@ -1207,7 +1279,7 @@ mod tests {
}
}
/// String-typed values must NOT trigger suppression they are the
/// String-typed values must NOT trigger suppression, they are the
/// canonical injection carrier. Regression guard so a future
/// change to `is_type_safe_for_sink` does not silently silence
/// real String-payload findings.
@ -1349,8 +1421,8 @@ mod tests {
}
}
/// Audit A3 (companion): mixed-type operand list only one Int
/// among operands of unknown type must NOT suppress. The
/// Audit A3 (companion): mixed-type operand list, only one Int
/// among operands of unknown type, must NOT suppress. The
/// suppression rule requires every operand to be payload-incompatible.
#[test]
fn mixed_type_operands_do_not_suppress() {
@ -1366,7 +1438,7 @@ mod tests {
));
}
/// Phase 3: Param values seeded from `param_types` must surface
/// Param values seeded from `param_types` must surface
/// the right TypeKind for downstream sink suppression. An out-of-
/// range index falls back to Unknown (the pre-Phase-3 default).
#[test]
@ -1590,6 +1662,47 @@ mod tests {
assert_eq!(constructor_type(Lang::Cpp, "printf"), None);
}
#[test]
fn constructor_type_javascript_typescript_local_collections() {
    // `new Map()` / `new Set()` / `new WeakMap()` / `new WeakSet()` /
    // `new Array()` produce in-memory collections. Excalidraw's
    // `elementsMap.get(id)` shape (which dominates the
    // `js.auth.missing_ownership_check` cluster on JS data-manipulation
    // libraries) is suppressed once the receiver type is known.
    const LOCAL_COLLECTIONS: [&str; 5] = ["Map", "Set", "WeakMap", "WeakSet", "Array"];
    // Unrelated identifiers that must stay unmapped.
    const UNRELATED: [&str; 3] = ["Object", "Promise", "Foo"];

    for lang in [Lang::JavaScript, Lang::TypeScript] {
        for ctor in LOCAL_COLLECTIONS {
            assert_eq!(
                constructor_type(lang, ctor),
                Some(TypeKind::LocalCollection),
                "constructor `{}`",
                ctor
            );
        }

        // Existing pre-fix mappings still resolve.
        assert_eq!(constructor_type(lang, "URL"), Some(TypeKind::Url));
        assert_eq!(
            constructor_type(lang, "XMLHttpRequest"),
            Some(TypeKind::HttpClient)
        );

        // Negative: unrelated identifiers stay None.
        for ctor in UNRELATED {
            assert_eq!(constructor_type(lang, ctor), None, "constructor `{}`", ctor);
        }
    }
}
#[test]
fn constructor_type_ruby() {
// HttpClient
@ -1680,7 +1793,7 @@ mod tests {
constructor_type(Lang::Rust, "diesel::SqliteConnection::establish"),
Some(TypeKind::DatabaseConnection)
);
// Bare `Connection::open` is accepted Rust idiom
// Bare `Connection::open` is accepted, Rust idiom
// `use rusqlite::Connection; Connection::open(…)` is common, and the
// scanner sees the unqualified callee text after import resolution.
// Accepting this matches the benchmark fixture `rs-sqli-001`.
@ -1938,9 +2051,9 @@ mod tests {
);
}
// ── Phase 6 DTO field-level taint ─────────────────────────────────────
// ── DTO field-level taint ─────────────────────────────────────────────
/// Phase 6: `TypeFact::from_dto_field` returns `Some(field_kind)`
/// `TypeFact::from_dto_field` returns `Some(field_kind)`
/// for a DTO receiver whose `fields` map contains the requested
/// field, and `None` otherwise.
#[test]
@ -1956,7 +2069,7 @@ mod tests {
assert!(TypeFact::from_dto_field(&recv, "missing").is_none());
}
/// Phase 6: a non-DTO receiver kind never produces a field fact
/// A non-DTO receiver kind never produces a field fact;
/// `from_dto_field` falls through to the legacy copy-prop path.
#[test]
fn dto_field_lookup_on_non_dto_returns_none() {
@ -1974,10 +2087,9 @@ mod tests {
}
}
/// Phase 6: nested DTO — the parent DTO's field type is
/// `TypeKind::Dto`, and `from_dto_field` returns that nested DTO
/// fact directly. Phase 6.3 callers can recurse into the inner
/// fields by following the returned receiver's `as_dto()` chain.
/// Nested DTO, the parent DTO's field type is `TypeKind::Dto`,
/// and `from_dto_field` returns that nested DTO fact directly.
/// Callers can recurse via `as_dto()`.
#[test]
fn dto_field_lookup_supports_nested_dto() {
let mut inner = DtoFields::new("Address");
@ -1990,7 +2102,7 @@ mod tests {
assert_eq!(addr.kind, TypeKind::Dto(inner));
}
/// Phase 6: an empty DTO (class declared but with no inferred
/// An empty DTO (class declared but with no inferred
/// fields) never resolves field reads. Documents the safe-fallback
/// invariant so the legacy path runs when class fields couldn't be
/// classified.
@ -2000,9 +2112,8 @@ mod tests {
assert!(TypeFact::from_dto_field(&recv, "anything").is_none());
}
/// Phase 6: an `Int`-typed field in a DTO survives the
/// type-suppression matrix exactly the same way a freestanding
/// `Int` does — sanity-check the bridge between Phase 6 and Phase 4.
/// An `Int`-typed DTO field survives the type-suppression matrix
/// the same way a freestanding `Int` does.
#[test]
fn dto_int_field_suppresses_sql_query_via_matrix() {
use crate::labels::Cap;