//! Per-function summaries for cross-file taint analysis. //! //! [`FuncSummary`] describes a function's boundary behaviour: which parameters //! flow to sinks, which sources it reads, whether it propagates taint from //! arguments to its return value, and what capabilities it strips. Summaries //! are serialized to SQLite in pass 1 and merged into [`GlobalSummaries`] //! before pass 2 begins. //! //! [`crate::summary::ssa_summary::SsaFuncSummary`] is a richer summary //! derived from the SSA taint engine and takes precedence over [`FuncSummary`] //! during call resolution. `GlobalSummaries::ssa_by_key` stores SSA summaries //! keyed by [`FuncKey`]; `GlobalSummaries::by_name` holds the fallback //! name-keyed map for cases where an exact key is not found. //! //! Same-name collisions across files are merged conservatively: capabilities //! are unioned and booleans are OR-ed so no true positive is silently dropped. pub mod points_to; pub mod ssa_summary; use crate::labels::Cap; use crate::summary::ssa_summary::SsaFuncSummary; use crate::symbol::{FuncKey, FuncKind, Lang, normalize_namespace}; use rustc_hash::FxHashMap; use serde::{Deserialize, Deserializer, Serialize}; use smallvec::SmallVec; use std::collections::{BTreeMap, HashMap}; use std::hash::{Hash, Hasher}; // ── Sink site (primary sink-location attribution) ─────────────────────── /// A single dangerous-instruction site inside a function's body. /// Pairs a [`Cap`] with the source location of the consuming /// instruction so cross-file findings can attribute to the callee /// rather than the caller call-site. /// /// `from_chain` distinguishes two flavours of recorded site: /// * `false`, the site was resolved via the body-local locator span, /// i.e. it points at a sink instruction in the function's own body. /// * `true`, the site was promoted from a deeper callee through /// `event.primary_sink_site`, i.e. this function's summary carries /// a chain-hop marker for a sink several frames down. 
///
/// Pass-2 emission gates promotion of a site into `Finding.primary_location`
/// on `from_chain || file_rel != caller_file_rel`: same-file single-hop
/// helpers keep call-site emission (matching benchmark and real-world
/// fixture calibration), multi-hop chains and cross-file callees surface
/// the deep sink line. See "Multi-hop intra-file sink attribution gap"
/// in deferred.md for the design tradeoff.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
pub struct SinkSite {
    /// File path recorded for the site. Empty for cap-only sites, in which
    /// case the field is omitted from the serialized form.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub file_rel: String,
    /// Line of the site. Zero is omitted from the serialized form and is
    /// the value used by [`SinkSite::cap_only`].
    #[serde(default, skip_serializing_if = "is_zero_u32")]
    pub line: u32,
    /// Column of the site. Zero is omitted from the serialized form and is
    /// the value used by [`SinkSite::cap_only`].
    #[serde(default, skip_serializing_if = "is_zero_u32")]
    pub col: u32,
    /// Source snippet for the site. Omitted from the serialized form when
    /// empty.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub snippet: String,
    /// Capability paired with this location.
    pub cap: Cap,
    /// Set when the site was lifted out of a deeper callee's summary
    /// (`event.primary_sink_site` chain-hop); clear when it came from the
    /// function's own locator span. See struct docs.
    #[serde(default, skip_serializing_if = "is_false")]
    pub from_chain: bool,
}

impl SinkSite {
    /// Identity used when merging site lists: `(file_rel, line, col, cap)`.
    /// Two sites that agree on this tuple describe the same consumption.
    /// `from_chain` is deliberately left out, so a collision between sites
    /// that differ only in that flag is settled by the upgrade rule in
    /// [`union_sink_sites`].
    pub(crate) fn dedup_key(&self) -> (&str, u32, u32, u32) {
        let Self {
            file_rel,
            line,
            col,
            cap,
            ..
        } = self;
        (file_rel.as_str(), *line, *col, cap.bits())
    }

    /// Site that carries only a capability, for extraction paths that have
    /// no tree/bytes context (pass-2 transient summaries). Every location
    /// field keeps its default: empty strings, zero coordinates and
    /// `from_chain == false`.
    pub fn cap_only(cap: Cap) -> Self {
        Self {
            cap,
            ..Self::default()
        }
    }
}

/// Serde `skip_serializing_if` predicate: true for `false`.
fn is_false(b: &bool) -> bool {
    !b
}

/// Tree/bytes context for resolving a CFG span to a [`SinkSite`].
/// Threaded as `Option<&Locator>` so extraction paths without tree
/// access can pass `None` cheaply.
pub struct SinkSiteLocator<'a> { pub tree: &'a tree_sitter::Tree, pub bytes: &'a [u8], pub file_rel: &'a str, } impl<'a> SinkSiteLocator<'a> { /// Resolve a span to a [`SinkSite`]. Coordinates fall back to /// `(0, 0)` and the snippet to empty when out of range. pub fn site_for_span(&self, span: (usize, usize), cap: Cap) -> SinkSite { let byte = span.0; let point = self .tree .root_node() .descendant_for_byte_range(byte, byte) .map(|n| n.start_position()) .unwrap_or(tree_sitter::Point { row: 0, column: 0 }); let snippet = line_snippet(self.bytes, byte).unwrap_or_default(); SinkSite { file_rel: self.file_rel.to_string(), line: (point.row + 1) as u32, col: (point.column + 1) as u32, snippet, cap, from_chain: false, } } } pub(crate) use crate::utils::snippet::line_snippet; /// Union two `SmallVec<[SinkSite; 1]>` lists with `(file_rel, line, col, /// cap)` dedup. Preserves insertion order of `existing` then appends any /// new sites from `incoming` not already present. When two sites with the /// same dedup key collide, `from_chain=true` wins, so a chain-hop marker is /// never lost when a same-file locator span happens to share coordinates. pub(crate) fn union_sink_sites(existing: &mut SmallVec<[SinkSite; 1]>, incoming: &[SinkSite]) { for site in incoming { let key = site.dedup_key(); if let Some(ex) = existing.iter_mut().find(|s| s.dedup_key() == key) { if site.from_chain && !ex.from_chain { ex.from_chain = true; } } else { existing.push(site.clone()); } } } /// Union two `Vec<(usize, SmallVec<[SinkSite; 1]>)>` lists keyed by /// parameter index. Each parameter keeps its own deduped site list. 
pub(crate) fn union_param_sink_sites(
    existing: &mut Vec<(usize, SmallVec<[SinkSite; 1]>)>,
    incoming: &[(usize, SmallVec<[SinkSite; 1]>)],
) {
    for (param_idx, new_sites) in incoming {
        match existing
            .iter_mut()
            .find(|(known_idx, _)| *known_idx == *param_idx)
        {
            // Parameter already tracked: fold the new sites into its list.
            Some((_, known_sites)) => union_sink_sites(known_sites, new_sites),
            // First time this parameter index is seen: adopt its list as-is.
            None => existing.push((*param_idx, new_sites.clone())),
        }
    }
}

/// Top bit of [`FuncKey::disambig`] reserved for synthetic discriminators
/// minted by [`GlobalSummaries`] when an identity collision is detected
/// between structurally incompatible summaries.
///
/// Real disambigs come from `tree_sitter::Node::start_byte` (see
/// `cfg.rs:fn_disambig`), which is a byte offset into the source file.
/// Source files in practice are far below 2 GiB, so bit 31 of a real
/// disambig is always zero, setting it marks a value as synthetic and
/// keeps it in a disjoint namespace from byte-offset disambigs.
const SYNTHETIC_DISAMBIG_BIT: u32 = 1 << 31;

// ── Callee site metadata ────────────────────────────────────────────────

/// Richer per-call-site metadata preserved in a function's summary.
///
/// Replaces the legacy `Vec<String>` callee list. Carries enough structure
/// to disambiguate same-name overloads and method calls at resolution time
/// without having to re-parse the raw callee string.
///
/// * `name`, the raw callee text as it appeared in source
///   (`"obj.method"`, `"env::var"`, `"helper"`). Preserved for diagnostics.
/// * `arity`, number of positional arguments at the call site. `None`
///   when splats / keyword-args / rest-params make the count unreliable.
/// * `receiver`, structured receiver identifier for method calls
///   (e.g. `"obj"` in `obj.method()`). Carries the root receiver for
///   chained calls; `None` for non-method or complex receivers.
/// * `qualifier`, the segment immediately before the leaf for non-method
///   qualified calls (e.g. `"env"` in `env::var`). Extracted once at CFG
///   time rather than re-parsed downstream.
/// * `ordinal`, the per-function call ordinal matching
///   `CallMeta.call_ordinal`, allowing cross-file consumers to address a
///   specific call site rather than just a callee name.
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub struct CalleeSite {
    pub name: String,
    // NOTE(review): the integer width is inferred from `FuncKey::arity`,
    // which is built from a `usize` parameter count — confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub arity: Option<usize>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub receiver: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub qualifier: Option<String>,
    #[serde(default, skip_serializing_if = "is_zero_u32")]
    pub ordinal: u32,
}

/// Serde `skip_serializing_if` predicate: true for zero.
fn is_zero_u32(n: &u32) -> bool {
    *n == 0
}

impl CalleeSite {
    /// Construct a bare call-site reference from a name, with no other metadata.
    ///
    /// Every field other than `name` takes its `Default` value: `None` for
    /// `arity` / `receiver` / `qualifier` and `0` for `ordinal`.
    pub fn bare(name: impl Into<String>) -> Self {
        Self {
            name: name.into(),
            ..Default::default()
        }
    }
}

impl From<String> for CalleeSite {
    /// Same as [`CalleeSite::bare`].
    fn from(name: String) -> Self {
        Self::bare(name)
    }
}

impl From<&str> for CalleeSite {
    /// Same as [`CalleeSite::bare`]; the name is copied into an owned `String`.
    fn from(name: &str) -> Self {
        Self::bare(name)
    }
}

/// Deserialize a `Vec<CalleeSite>` while tolerating the legacy
/// on-disk form where callees were a plain array of strings.
///
/// Accepts:
/// * `[{"name": "foo", "arity": 1, ...}, ...]` ← current structured form
/// * `["foo", "bar", ...]` ← legacy string form
fn deserialize_callee_sites<'de, D>(de: D) -> Result<Vec<CalleeSite>, D::Error>
where
    D: Deserializer<'de>,
{
    // Untagged: each element is tried as a structured object first and as
    // a bare string second, so both on-disk forms are accepted.
    #[derive(Deserialize)]
    #[serde(untagged)]
    enum Entry {
        Structured(CalleeSite),
        Bare(String),
    }

    let raw: Vec<Entry> = Vec::deserialize(de)?;
    Ok(raw
        .into_iter()
        .map(|entry| match entry {
            Entry::Structured(site) => site,
            Entry::Bare(name) => CalleeSite::bare(name),
        })
        .collect())
}

/// Serialisable summary of a single function's taint behaviour.
///
/// One of these is produced per function during **pass 1** of a scan and
/// persisted to the `function_summaries` SQLite table.
During **pass 2** the /// full set of summaries across every file is loaded into memory so the taint /// engine can resolve cross‑file calls. /// /// Design notes /// ──────────── /// * **All three cap fields are independent.** A function can simultaneously /// act as a source (introduces fresh taint), a sanitizer (cleans certain /// bits), and a sink (passes tainted data to a dangerous operation). /// The old code picked a single `DataLabel` which lost information. /// /// * **`propagating_params`** captures per‑argument pass‑through behaviour: /// which parameter indices (0‑based) flow through to the return value. /// This is essential for chains like `let y = transform(tainted_x); sink(y);`. /// The legacy boolean `propagates_taint` is kept for deserialising old JSON. /// /// * **`callees`** drive call‑graph construction in `callgraph.rs`, which /// yields the topological order and SCC batches used between pass 1 and /// pass 2 (see `scan::run_topo_batches` and `scc_file_batches_with_metadata`). /// /// * **`tainted_sink_params`** marks which parameter *positions* flow to /// internal sinks and is consumed by SSA callee resolution /// (`ssa_transfer::mod.rs` `resolve_callee`) to build the per-parameter /// `param_to_sink` list, so caller-side sink propagation fires on the /// specific argument positions rather than the whole call. #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct FuncSummary { /// Function name as it appears in the source (`my_func`, not the full path). pub name: String, /// Absolute path of the file that defines this function. pub file_path: String, /// Language slug (`"rust"`, `"javascript"`, …). pub lang: String, // ── Signature information ──────────────────────────────────────────── /// Total number of parameters (including `self`/`&self` for methods). pub param_count: usize, /// Parameter names in declaration order. 
pub param_names: Vec, // ── Taint behaviour ────────────────────────────────────────────────── // Stored as raw `u32` so serde doesn't need to know about `bitflags`. /// Caps this function **introduces**, i.e. the return value carries /// freshly‑tainted data even if no argument was tainted. pub source_caps: u32, /// Caps this function **cleans**, passing tainted data through this /// function strips the corresponding bits. pub sanitizer_caps: u32, /// Caps this function **consumes unsafely**, calling it with tainted /// arguments that still carry these bits is a finding. pub sink_caps: u32, /// Which parameter indices (0‑based) flow through to the return value. #[serde(default)] pub propagating_params: Vec, /// Legacy field, kept only for deserialising old JSON from SQLite. /// New code should use `propagating_params` instead. #[serde(default, skip_serializing)] pub propagates_taint: bool, /// Indices of parameters that flow to internal sinks (0‑based). pub tainted_sink_params: Vec, /// Per-parameter [`SinkSite`] records, mirrors /// [`SsaFuncSummary::param_to_sink`] so the coarse legacy summary also /// carries primary sink-location attribution through the two-pass /// architecture. Empty when the extractor lacked tree access. #[serde(default, skip_serializing_if = "Vec::is_empty")] pub param_to_sink: Vec<(usize, SmallVec<[SinkSite; 1]>)>, /// Per-call-site metadata for every function/method/macro invoked /// inside this body (`CalleeSite`). Carries arity, receiver, /// qualifier, and call ordinal so downstream resolution does not have /// to re-parse the raw callee string. /// /// A custom deserializer tolerates legacy on-disk rows whose callees /// field was a plain `Vec`; those are lifted to /// `CalleeSite { name, .. }` with no additional metadata. 
#[serde(default, deserialize_with = "deserialize_callee_sites")] pub callees: Vec, // ── Identity discriminators ────────────────────────────────────────── /// Enclosing container path (class / impl / module / outer function), /// segments joined with `::`. Empty for free top-level functions. #[serde(default)] pub container: String, /// Numeric discriminator for same-name siblings (closure byte offset, /// nested-function occurrence index). `None` when no sibling collision. #[serde(default)] pub disambig: Option, /// Structural role of this definition. Defaults to `Function` when /// deserialising legacy JSON. #[serde(default)] pub kind: FuncKind, // ── Rust-specific module-resolution metadata ──────────────────────── /// Crate-relative module path for this function's defining file /// (e.g. `"auth::token"` for `src/auth/token.rs`). Only populated /// when `lang == "rust"`. Used by the call graph to resolve /// `use`-imported callees to their fully-qualified module. /// /// `None` for non-Rust files and for Rust files outside a recognised /// `src/` tree (tests, examples, build scripts). #[serde(default, skip_serializing_if = "Option::is_none")] pub module_path: Option, /// Per-file `use`-alias map for the defining Rust source. /// /// Maps the local identifier introduced by a `use` declaration to its /// fully qualified path (`"validate"` → `"crate::auth::token::validate"`). /// Carried on every summary for the file even though it is per-file /// information; the duplication keeps the persistence schema simple /// and lets resolution operate purely off the caller's summary. #[serde(default, skip_serializing_if = "Option::is_none")] pub rust_use_map: Option>, /// Fully qualified prefixes of any wildcard `use ...::*` imports in /// the defining Rust source. Stored separately because they expand /// the candidate space at resolution time rather than naming a single /// alias. 
#[serde(default, skip_serializing_if = "Option::is_none")] pub rust_wildcards: Option>, /// Per-file class / trait / interface hierarchy edges captured at /// CFG-construction time. Each entry is /// `(sub_container, super_container)` after language-specific /// normalisation: /// /// * Java `class X extends Y` → `(X, Y)`; `implements I, J` → `(X, I)`, `(X, J)` /// * Rust `impl Trait for Type` → `(Type, Trait)` /// * TypeScript `class X extends Y implements I` → `(X, Y)`, `(X, I)` /// * Python `class X(A, B)` → `(X, A)`, `(X, B)` /// * PHP `class X extends Y implements I` → `(X, Y)`, `(X, I)` /// * Ruby `class X < Y` → `(X, Y)` /// * C++ `class X : public Y` → `(X, Y)` /// /// Empty for files with no declared inheritance / impl /// relationships and for Go (which uses implicit interface /// satisfaction, not computed). /// /// **Per-file duplication.** Every `FuncSummary` produced from a /// given file carries the **same** `hierarchy_edges` vector so the /// information survives summary-by-summary persistence to SQLite. /// `merge_summaries` deduplicates downstream when building /// [`crate::callgraph::TypeHierarchyIndex`]. #[serde(default, skip_serializing_if = "Vec::is_empty")] pub hierarchy_edges: Vec<(String, String)>, /// Phase-10 Next.js entry-point classification. When `Some(_)`, /// the function is treated as an externally-driven entry point /// whose parameters are seeded as `TaintOrigin::Source` at SSA /// entry, mirroring the way an HTTP request handler's formals are /// adversary-controlled by default. `None` for ordinary /// helpers — pass-2 keeps its existing baseline-subtraction /// semantics. 
#[serde(default, skip_serializing_if = "Option::is_none")] pub entry_kind: Option, } // ── Cap conversion helpers ────────────────────────────────────────────── impl FuncSummary { #[inline] pub fn source_caps(&self) -> Cap { Cap::from_bits_truncate(self.source_caps) } #[inline] pub fn sanitizer_caps(&self) -> Cap { Cap::from_bits_truncate(self.sanitizer_caps) } #[inline] pub fn sink_caps(&self) -> Cap { Cap::from_bits_truncate(self.sink_caps) } /// Returns `true` when any parameter flows to the return value. /// Also returns `true` for legacy summaries with `propagates_taint: true` /// but empty `propagating_params` (backward compat). pub fn propagates_any(&self) -> bool { !self.propagating_params.is_empty() || self.propagates_taint } /// Build a [`FuncKey`] from this summary, normalizing the namespace /// relative to `scan_root`. pub fn func_key(&self, scan_root: Option<&str>) -> FuncKey { FuncKey { lang: Lang::from_slug(&self.lang).unwrap_or(Lang::Rust), namespace: normalize_namespace(&self.file_path, scan_root), container: self.container.clone(), name: self.name.clone(), arity: Some(self.param_count), disambig: self.disambig, kind: self.kind, } } /// Phase-04 [`FuncKey`] builder that consults a project-wide /// [`crate::resolve::ModuleGraph`]. /// /// When the file producing this summary lies inside a discovered /// package, `namespace` becomes `"@scope/name::src/file.ts"`; /// otherwise the result matches [`Self::func_key`] exactly. /// Phase 04 only adds the helper, no resolution call site uses /// it. Phase 10 switches the JS/TS pass-1 path to call this /// instead of [`Self::func_key`]. 
pub fn func_key_with_resolver( &self, scan_root: Option<&str>, module_graph: Option<&crate::resolve::ModuleGraph>, ) -> FuncKey { FuncKey { lang: Lang::from_slug(&self.lang).unwrap_or(Lang::Rust), namespace: crate::symbol::namespace_with_package( &self.file_path, scan_root, module_graph, ), container: self.container.clone(), name: self.name.clone(), arity: Some(self.param_count), disambig: self.disambig, kind: self.kind, } } } // ── Callee resolution ──────────────────────────────────────────────────── /// Result of resolving a bare callee name to a [`FuncKey`]. /// /// Three-valued: the call graph builder and taint engine need to distinguish /// "no candidates at all" from "multiple candidates, can't pick one". #[derive(Debug, Clone, PartialEq, Eq)] pub enum CalleeResolution { /// Exactly one candidate matched. Resolved(FuncKey), /// No candidates found at all. NotFound, /// Multiple candidates, ambiguous, cannot pick one. Ambiguous(Vec), } /// Structured query describing a call site. /// /// Carries every hint needed to pick the right callee *by qualified identity* /// first and only fall back on bare-leaf lookup as a last resort. The old /// entry points (`resolve_callee_key`, `resolve_callee_key_with_container`) /// are now thin wrappers that build a `CalleeQuery` with partial information. /// /// Hint categories, ordered from strongest to weakest: /// /// * `receiver_type`, authoritative class/impl/module name (e.g. from /// type inference or a `use ...` resolution). When set, the resolver /// *requires* the callee's container to equal this name and refuses to /// fall back to a leaf-name collision if the qualified lookup misses. /// * `namespace_qualifier`, syntactic qualifier parsed from the callee /// (e.g. `"env"` in `env::var`, `"http"` in `http.Get`). Treated as a /// container hint but not authoritative: a miss falls through. /// * `receiver_var`, syntactic receiver variable name (e.g. `"obj"` in /// `obj.method()`). 
Soft hint, used only to tie-break ambiguity. /// * `caller_container`, caller's own enclosing container, used to /// resolve bare self-calls inside a class/impl body. /// /// `arity` is a hard filter, when `Some`, every candidate whose arity /// differs is excluded from consideration. #[derive(Debug, Clone)] pub struct CalleeQuery<'a> { /// Leaf (unqualified) callee name, e.g. `"process"` for `OrderService::process`. pub name: &'a str, pub caller_lang: Lang, /// Project-relative namespace (file path) of the caller. Used for /// same-namespace disambiguation when qualified hints miss. pub caller_namespace: &'a str, /// The caller's own container (`FuncKey::container`), for resolving /// bare `self`/intra-class calls without a receiver. pub caller_container: Option<&'a str>, /// Authoritative receiver class/impl name. Populated from type facts /// (`TypeKind::label_prefix`) or from Rust use-map resolution. pub receiver_type: Option<&'a str>, /// Syntactic namespace qualifier (non-authoritative). For /// `std::env::var` in Rust the caller passes `"env"`; for `http.Get` /// in Go, `"http"`. Left `None` for purely bare calls. pub namespace_qualifier: Option<&'a str>, /// Syntactic receiver variable name. Used only as a tie-breaker, a /// variable name is a weak proxy for a class name. pub receiver_var: Option<&'a str>, /// Positional-argument count at the call site. Hard filter when set. pub arity: Option, } impl<'a> CalleeQuery<'a> { /// Whether this query carries any qualified identity hint stronger than /// a bare leaf name. Used by the resolver to decide whether an /// unresolved qualified match should still fall through to leaf lookup /// (no hints → fall through; authoritative hints → refuse to guess). 
pub fn has_qualified_hint(&self) -> bool { self.receiver_type.is_some() || self.namespace_qualifier.is_some() || self.caller_container.is_some_and(|s| !s.is_empty()) } } // ── Lookup map used by the taint engine ───────────────────────────────── /// A merged view of all function summaries keyed by qualified [`FuncKey`]. /// /// Functions are partitioned by language + namespace + name + arity. Two /// functions with the same bare name but different languages or namespaces /// are stored separately, no implicit cross-language merging occurs. /// /// A secondary index `(Lang, name)` supports fast lookup by language + name /// for same-language resolution in the taint engine. #[derive(Default)] pub struct GlobalSummaries { /// FxHashMap (rustc_hash) replaces stdlib SipHash. FuncKey carries 3 /// String fields, so any HashMap operation walks ≥30 bytes through the /// hasher; FxHash is ~5x faster than SipHash on this workload. Seed /// is fixed (no DoS hardening), which is fine for an in-process index /// keyed by static program-derived names. by_key: FxHashMap, /// Bare leaf-name index, kept for compatibility with callers that only /// see an unqualified call string. A single name may map to many keys /// across containers / files / arities. by_lang_name: FxHashMap<(Lang, String), Vec>, /// Container-qualified index: keyed on `"{container}::{name}"` (or just /// `name` for free functions). Used to resolve calls when the call-site /// can supply a receiver / container hint (e.g. `OrderService::process`). by_lang_qualified: FxHashMap<(Lang, String), Vec>, /// Rust-only secondary index keyed on `(module_path, name)`. /// /// Populated whenever a Rust [`FuncSummary`] is inserted with a /// `module_path` set. Used by use-map driven resolution to look up /// candidates by their crate-relative module rather than their /// filesystem path. Same name / module / arity overloads land on the /// same vector, arity narrowing happens at resolution time. 
by_rust_module: FxHashMap<(String, String), Vec>, /// Precise SSA-derived per-parameter summaries, keyed by `FuncKey`. /// These take precedence over `FuncSummary` during callee resolution. ssa_by_key: HashMap, /// Sibling index over [`Self::ssa_by_key`] keyed by /// `(lang, namespace, name)`. Populated in lockstep with `ssa_by_key` /// (every `insert_ssa` / `merge` adds the key). Used by the /// cross-package SSA resolution path (step 0.7 in /// `taint::ssa_transfer::resolve_callee`) to avoid an /// `O(|ssa_by_key|)` linear scan per cross-package call site: /// the resolver looks up the candidate `Vec` and narrows /// to a single hit by container / arity / disambig. Strictly /// additive: when the index is empty (e.g. tests that never insert /// SSA summaries) the resolver falls back to its existing flat /// paths. /// /// Note: SSA summaries are append-only on `GlobalSummaries` (no /// remove/clear methods), so the index never needs invalidation. /// Synthetic-disambig probing in /// [`Self::reconcile_ssa_summary_key`] only mutates the inserted /// key's `disambig` field, never the `(lang, namespace, name)` /// triple, so the index value still points at every relevant /// `FuncKey` after reconciliation. ssa_by_lang_ns_name: HashMap<(Lang, String, String), Vec>, /// Cross-file callee bodies for interprocedural symbolic execution. /// Keyed by `FuncKey` (same identity model as SSA summaries). bodies_by_key: HashMap, /// Per-function auth-check summaries for cross-file helper lifting. /// Keyed by `FuncKey` so a call-site resolver can go from a resolved /// callee name to the helper's auth-check signature. Populated in /// pass 1 and consumed by /// [`crate::auth_analysis::run_auth_analysis`] during pass 2. auth_by_key: HashMap, /// Per-Python-file router declarations + `include_router` edges, /// keyed by `module_id_for_storage(file_path)` (basename without /// `.py`, or `parent_dir::__init__` for `__init__.py`). 
Populated /// during pass 1 and consumed by /// [`Self::resolve_cross_file_router_deps`] at pass 2 entry to lift /// FastAPI router-level `dependencies=[Security(...)]` declared in a /// parent file (`__init__.py` calling /// `.include_router(.router, ...)`) onto the bare /// child router declared in another file — closing the airflow /// execution-API auth-recognition gap on routes attached to bare /// child routers. router_facts_by_module: HashMap, /// Per-file Phase-09 cross-package import maps, keyed by file /// namespace (scan-root-relative path, the same form /// [`FuncKey::namespace`] uses). Populated in pass 1 from each /// file's [`crate::cfg::FileCfg::resolved_imports`] and consumed by /// `inline_analyse_callee` when the inlined callee body's own /// `cross_package_imports` Arc is empty (i.e. the body was loaded /// from SQLite, where the field is `#[serde(skip)]`). Closes the /// indexed-mode parity gap on transitive cross-package IPA inside /// inlined frames. cross_package_imports_by_namespace: HashMap>>, /// Type hierarchy index for runtime virtual-dispatch fan-out. /// /// Installed by [`Self::install_hierarchy`] after pass 1 from the /// merged `FuncSummary::hierarchy_edges` vectors. Consumed by /// [`Self::resolve_callee_widened`] during pass 2 so the taint /// engine sees every concrete implementer of a method when the /// receiver is statically typed as a super-class / trait / /// interface, recovering the dispatch precision that today's /// single-result [`Self::resolve_callee`] discards. /// /// `None` until installed: every consumer treats `None` as /// "fall through to today's bare resolution", so the index is /// strictly additive. hierarchy: Option, } impl GlobalSummaries { pub fn new() -> Self { Self::default() } /// Walk a proposed insertion key, bumping the synthetic disambig /// until either (a) the key is unoccupied, or (b) the entry found at /// that key is compatible with the incoming summary (safe to merge). 
///
/// Identity collisions are extraordinarily rare in practice (they
/// require two structurally distinct functions to land on the same
/// non-synthetic key, e.g. both with `disambig: None`). The loop
/// bound is defensive, if synthetic probing still collides after
/// 1024 attempts we fall through and let the caller merge, which
/// degrades gracefully to the old behaviour rather than looping
/// forever.
fn reconcile_func_summary_key(&self, mut key: FuncKey, summary: &FuncSummary) -> FuncKey {
    // Bounded probe: at most 1024 synthetic keys are minted.
    for probe in 0..1024u32 {
        let clashes = self
            .by_key
            .get(&key)
            .is_some_and(|existing| !summaries_compatible(existing, summary));
        if !clashes {
            // The slot is free, or already holds a compatible summary.
            return key;
        }
        // Mint the next candidate. Setting the top bit keeps it out of the
        // byte-offset disambig range.
        let synth = synthesize_disambig(summary).wrapping_add(probe);
        key.disambig = Some(synth | SYNTHETIC_DISAMBIG_BIT);
    }
    // Every probe collided: hand back the last candidate and let the
    // caller merge into whatever lives there.
    tracing::warn!(
        "summary identity collision probe gave up after 1024 attempts; \
         falling back to union-merge for {}",
        key
    );
    key
}

/// SSA-summary variant of [`Self::reconcile_func_summary_key`].
///
/// Distinctness signals for SSA summaries are weaker than for
/// coarse `FuncSummary`s, the summary itself carries no explicit
/// `param_count`, only references to parameter indices. We combine:
///
/// * **Key arity fit**, any parameter index referenced by the new
///   summary that exceeds `key.arity` is a structural mismatch.
/// * **Existing-entry compare**, if an entry already lives at
///   this key and it disagrees on the set of referenced parameter
///   indices, the two cannot both describe the same function.
fn reconcile_ssa_summary_key(&self, mut key: FuncKey, summary: &SsaFuncSummary) -> FuncKey { let mut probe: u32 = 0; loop { let conflict = match self.ssa_by_key.get(&key) { Some(existing) => !ssa_summaries_compatible(existing, summary, key.arity), None => !ssa_summary_fits_arity(summary, key.arity), }; if !conflict { return key; } let synth = synthesize_ssa_disambig(summary).wrapping_add(probe); key.disambig = Some(SYNTHETIC_DISAMBIG_BIT | (synth & !SYNTHETIC_DISAMBIG_BIT)); probe = probe.wrapping_add(1); if probe >= 1024 { tracing::warn!( "SSA summary identity collision probe gave up after 1024 attempts \ for {}", key ); return key; } } } /// Body variant of [`Self::reconcile_func_summary_key`]. /// /// `CalleeSsaBody` carries an explicit `param_count`, which must /// agree with both `key.arity` and any co-located body's /// `param_count`. A mismatch is a hard collision. fn reconcile_body_key( &self, mut key: FuncKey, body: &crate::taint::ssa_transfer::CalleeSsaBody, ) -> FuncKey { let mut probe: u32 = 0; loop { let conflict = match self.bodies_by_key.get(&key) { Some(existing) => existing.param_count != body.param_count, None => match key.arity { Some(a) => a != body.param_count, None => false, }, }; if !conflict { return key; } let synth = (body.param_count as u32) .wrapping_mul(0x9E37_79B9) .wrapping_add(probe); key.disambig = Some(SYNTHETIC_DISAMBIG_BIT | (synth & !SYNTHETIC_DISAMBIG_BIT)); probe = probe.wrapping_add(1); if probe >= 1024 { tracing::warn!( "SSA body identity collision probe gave up after 1024 attempts for {}", key ); return key; } } } /// Insert or merge a summary. If an exact `FuncKey` match exists and /// the two summaries describe the same function, merge conservatively /// (OR caps/booleans, union params/callees). /// /// `FuncKey` is structurally precise *when every producer populates /// `disambig`*. 
Legacy on-disk JSON, interop configs, DB rows written /// by older versions, and any code path that keeps `disambig: None` /// can produce two keys that hash-equal even though they belong to /// structurally distinct functions (e.g. different `param_count`, /// `kind`, `container`, or `param_names`). Silently unioning those /// would leak security-relevant caps across unrelated functions and /// drop one of the two summaries entirely. /// /// We therefore inspect the existing entry first. If the new summary /// is not `summaries_compatible` with it, we mint a synthetic /// disambig (top bit set to stay disjoint from byte-offset disambigs) /// and retry the insert under the fresh key so *both* functions are /// preserved. pub fn insert(&mut self, key: FuncKey, summary: FuncSummary) { let key = self.reconcile_func_summary_key(key, &summary); let lang = key.lang; let name = key.name.clone(); let qualified = key.qualified_name(); let rust_module = if lang == Lang::Rust { summary.module_path.clone() } else { None }; self.by_key .entry(key.clone()) .and_modify(|existing| { existing.source_caps |= summary.source_caps; existing.sanitizer_caps |= summary.sanitizer_caps; existing.sink_caps |= summary.sink_caps; existing.propagates_taint |= summary.propagates_taint; for &idx in &summary.propagating_params { if !existing.propagating_params.contains(&idx) { existing.propagating_params.push(idx); } } for &idx in &summary.tainted_sink_params { if !existing.tainted_sink_params.contains(&idx) { existing.tainted_sink_params.push(idx); } } union_param_sink_sites(&mut existing.param_to_sink, &summary.param_to_sink); for c in &summary.callees { if !existing.callees.iter().any(|e| { e.name == c.name && e.arity == c.arity && e.receiver == c.receiver && e.qualifier == c.qualifier && e.ordinal == c.ordinal }) { existing.callees.push(c.clone()); } } }) .or_insert(summary); let keys = self.by_lang_name.entry((lang, name)).or_default(); if !keys.contains(&key) { keys.push(key.clone()); } let 
q_keys = self.by_lang_qualified.entry((lang, qualified)).or_default(); if !q_keys.contains(&key) { q_keys.push(key.clone()); } if let Some(mp) = rust_module { let mk = self .by_rust_module .entry((mp, key.name.clone())) .or_default(); if !mk.contains(&key) { mk.push(key); } } } /// Exact lookup by fully-qualified key. pub fn get(&self, key: &FuncKey) -> Option<&FuncSummary> { self.by_key.get(key) } /// Interop / external-edge lookup: tolerant of `disambig` being `None`. /// /// Interop edges originate outside the source code (user-specified JSON, /// language-bridge config) and cannot know a callee's internal byte-offset /// disambiguator. When the query key has `disambig = None` we fall back to /// scanning for a single match on `(lang, namespace, container, name, /// arity, kind)`. If exactly one matches it is returned; otherwise we /// return `None` to preserve determinism (ambiguity is treated as unknown). pub fn get_for_interop(&self, key: &FuncKey) -> Option<&FuncSummary> { if let Some(hit) = self.by_key.get(key) { return Some(hit); } if key.disambig.is_some() { return None; } let mut matches = self.by_key.iter().filter(|(k, _)| { k.lang == key.lang && k.namespace == key.namespace && k.container == key.container && k.name == key.name && k.arity == key.arity && k.kind == key.kind }); let first = matches.next()?; if matches.next().is_some() { None } else { Some(first.1) } } /// All same-language matches for a bare function name. pub fn lookup_same_lang(&self, lang: Lang, name: &str) -> Vec<(&FuncKey, &FuncSummary)> { self.by_lang_name .get(&(lang, name.to_string())) .map(|keys| { keys.iter() .filter_map(|k| self.by_key.get(k).map(|v| (k, v))) .collect() }) .unwrap_or_default() } /// Rust-only lookup by `(module_path, name)`. /// /// Returns every candidate that was inserted with a matching module /// path. 
Arity filtering is applied by the caller so that the index /// stays ambiguity-aware (two overloads legitimately share a module /// path + name and only differ in arity). pub fn lookup_rust_module( &self, module_path: &str, name: &str, ) -> Vec<(&FuncKey, &FuncSummary)> { self.by_rust_module .get(&(module_path.to_string(), name.to_string())) .map(|keys| { keys.iter() .filter_map(|k| self.by_key.get(k).map(|v| (k, v))) .collect() }) .unwrap_or_default() } /// Container-qualified lookup. `qualified` should be /// `"Container::name"` (use [`FuncKey::qualified_name`]) or `"name"`. pub fn lookup_qualified(&self, lang: Lang, qualified: &str) -> Vec<(&FuncKey, &FuncSummary)> { self.by_lang_qualified .get(&(lang, qualified.to_string())) .map(|keys| { keys.iter() .filter_map(|k| self.by_key.get(k).map(|v| (k, v))) .collect() }) .unwrap_or_default() } /// Merge another `GlobalSummaries` into this one (for parallel fold/reduce). pub fn merge(&mut self, other: GlobalSummaries) { // `insert` rebuilds every secondary index (by_lang_name, by_lang_qualified, // by_rust_module) from the summary itself, so we do not need to copy // `other.by_rust_module` explicitly, draining `other.by_key` is enough. for (key, summary) in other.by_key { self.insert(key, summary); } // SSA summaries: last-writer-wins (exact-key replacement, no unioning) for (key, ssa_sum) in other.ssa_by_key { self.index_ssa_key(&key); self.ssa_by_key.insert(key, ssa_sum); } // Cross-file bodies: last-writer-wins for (key, body) in other.bodies_by_key { self.bodies_by_key.insert(key, body); } // Auth summaries: last-writer-wins (exact-key replacement) for (key, auth_sum) in other.auth_by_key { self.auth_by_key.insert(key, auth_sum); } // Router facts: last-writer-wins per (module_id) key. Re-analysing // a file produces a fresh snapshot of its router declarations + edges. 
for (module_id, facts) in other.router_facts_by_module { self.router_facts_by_module.insert(module_id, facts); } // Cross-package imports: last-writer-wins per namespace. for (ns, map) in other.cross_package_imports_by_namespace { self.cross_package_imports_by_namespace.insert(ns, map); } // Hierarchy index: invalidate after a merge so the next consumer // sees a freshly-built view that includes `other`'s edges. The // alternative, point-merging two indexes, is racy when the // same `(lang, super)` key carries different sub-orderings in // each input; rebuild is O(n) over `by_key.iter()` and is the // single source of truth. self.hierarchy = None; } /// Insert an SSA summary. /// /// Per-function refinement is expressed via last-writer-wins for /// *compatible* summaries: re-analysing the same function body with /// more precise seeds yields a strictly better summary, and the /// caller genuinely wants the new one to replace the old. /// /// When the existing entry is **incompatible** with the incoming /// one, the key's `arity` disagrees with the new summary's referenced /// parameter indices, or the two summaries would describe different /// functions, we synthesize a disambig so both are kept. Silent /// replacement in that case would drop one function's cross-file /// taint signal entirely, which the caller cannot recover. /// /// Before reconciliation, drop any parameter-index reference at or /// above `key.arity`. Such indices come from synthetic SSA `Param` /// ops emitted by scoped lowering for **external captures** (free /// identifiers like `this`, module imports, or unresolved method /// names) and are useful for *intra-file* pass-2 analysis (the /// caller's implicit-uses argument group at the same index aligns /// with the synthetic Param) but never for cross-file consumers, /// which key off the FuncKey arity exclusively. 
/// Without the trim,
/// `ssa_summary_fits_arity` would reject the summary and
/// `reconcile_ssa_summary_key` would synthesise a disambig that
/// uncouples the SSA FuncKey from the matching FuncSummary FuncKey
/// (audit gap A.2.1.G1,
/// `project_typed_callgraph_audit_gap_ssa_disambig.md`).
///
/// NOTE(review): the "drop" / "trim" wording in this doc comment does
/// not match the body, which never removes out-of-range indices from
/// `summary`. Instead, an overflowing summary is stored untrimmed at the
/// original key when no entry exists there or the existing entry also
/// overflows; otherwise it goes through `reconcile_ssa_summary_key`.
/// Confirm which behaviour is intended and update the prose.
pub fn insert_ssa(&mut self, key: FuncKey, summary: SsaFuncSummary) {
    // The summary may reference a parameter index ≥ `key.arity` when
    // scoped SSA lowering synthesised `Param` ops for **external
    // captures** (free identifiers like `this`, module imports,
    // unresolved method names); see audit gap A.2.1.G1
    // (`project_typed_callgraph_audit_gap_ssa_disambig.md`). These
    // synthetic refs are useful inside the file they were extracted
    // in (caller implicit-uses align with the synthetic Param) and
    // stay useful when resolved cross-file by name. But they trip
    // [`ssa_summary_fits_arity`] inside
    // [`reconcile_ssa_summary_key`], forcing a synthetic disambig
    // that uncouples the SSA FuncKey from the FuncSummary FuncKey, so
    // `summaries.get_ssa(caller_key)` (consuming
    // `typed_call_receivers` at the FuncSummary-aligned key) would
    // miss.
    //
    // Resolution rule (applies only when `key.arity` is set and
    // `summary` does not fit it):
    //
    // * **No existing entry, or existing entry also has out-of-range
    //   refs** — keep the untrimmed summary at the original key,
    //   bypassing disambig synthesis. Resolution finds the entry
    //   under the FuncSummary's own disambig with its full
    //   per-param signal (closures, lambdas, captured-var sinks).
    //   The "existing also has out-of-range refs" branch covers the
    //   iterative-rescan case where round 2's incoming summary lands
    //   on top of round 1's already-installed copy of the same
    //   function.
    //
    // * **Existing entry fits arity (legit) but new doesn't** — fall
    //   back to the disambig synthesis. This preserves the
    //   `insert_ssa_arity_overflow_rekeys` invariant: a structurally
    //   incompatible incoming summary (different function sharing
    //   name + container + arity, with param refs at indices that
    //   don't even exist in the legitimate function) cannot
    //   dethrone the existing entry by silent overwrite. Both
    //   summaries survive: the existing one at the original key,
    //   the new one at the synthesised disambig.
    //
    // Every other case (no arity on the key, or the summary fits) goes
    // straight through `reconcile_ssa_summary_key`.
    let key = if key.arity.is_some() && !ssa_summary_fits_arity(&summary, key.arity) {
        let existing_also_overflows = self
            .ssa_by_key
            .get(&key)
            .is_some_and(|existing| !ssa_summary_fits_arity(existing, key.arity));
        let existing_present = self.ssa_by_key.contains_key(&key);
        if !existing_present || existing_also_overflows {
            key
        } else {
            self.reconcile_ssa_summary_key(key, &summary)
        }
    } else {
        self.reconcile_ssa_summary_key(key, &summary)
    };
    // Register the final key in the secondary index before storing, so
    // the index and the primary map agree on the (possibly re-keyed)
    // identity. The store itself replaces any entry at that key.
    self.index_ssa_key(&key);
    self.ssa_by_key.insert(key, summary);
}

/// Push `key` onto the secondary `(lang, namespace, name)` index.
/// Idempotent: a re-insert at the same triple does not duplicate
/// the key in the candidate vector.
fn index_ssa_key(&mut self, key: &FuncKey) {
    let triple = (key.lang, key.namespace.clone(), key.name.clone());
    let bucket = self.ssa_by_lang_ns_name.entry(triple).or_default();
    if !bucket.contains(key) {
        bucket.push(key.clone());
    }
}

/// Look up SSA summary `FuncKey`s by `(lang, namespace, name)`.
/// Returns `&[]` when no SSA summary at that triple has been
/// stored. Used by the cross-package resolution path so the
/// step-0.7 narrowing can iterate only the candidate set rather
/// than every persisted SSA key.
pub fn ssa_keys_by_qualified(&self, lang: Lang, namespace: &str, name: &str) -> &[FuncKey] {
    // The index is probed with an owned `(Lang, String, String)`
    // tuple, so both `&str` arguments are copied into fresh `String`s
    // here. Per the original author this is profile-light: it runs
    // once per cross-package callee and both strings are short
    // (namespace path + leaf name).
    let probe = (lang, namespace.to_string(), name.to_string());
    self.ssa_by_lang_ns_name
        .get(&probe)
        .map(|v| v.as_slice())
        .unwrap_or(&[])
}

/// Exact lookup of an SSA summary by fully-qualified key.
pub fn get_ssa(&self, key: &FuncKey) -> Option<&SsaFuncSummary> {
    self.ssa_by_key.get(key)
}

/// Insert an `AuthCheckSummary` for cross-file helper lifting.
///
/// Last-writer-wins: re-analysing a file produces a fresh summary
/// that fully replaces any earlier entry. No compatibility
/// reconciliation is needed because `AuthCheckSummary` carries no
/// identity-sensitive signal beyond the key itself.
pub fn insert_auth(
    &mut self,
    key: FuncKey,
    summary: crate::auth_analysis::model::AuthCheckSummary,
) {
    self.auth_by_key.insert(key, summary);
}

/// Exact lookup of an `AuthCheckSummary` by fully-qualified key.
pub fn get_auth(
    &self,
    key: &FuncKey,
) -> Option<&crate::auth_analysis::model::AuthCheckSummary> {
    self.auth_by_key.get(key)
}

/// Direct access to the auth-summary map. `None` when empty so
/// callers can distinguish "no cross-file auth summaries loaded"
/// from "some were loaded but none matched the call site".
pub fn auth_by_key(
    &self,
) -> Option<&HashMap> {
    if self.auth_by_key.is_empty() {
        None
    } else {
        Some(&self.auth_by_key)
    }
}

/// Count of cross-file auth summaries currently loaded.
pub fn auth_len(&self) -> usize {
    self.auth_by_key.len()
}

/// Insert a per-file `PerFileRouterFacts` snapshot. Last-writer-wins
/// per `module_id` key — re-analysing a file produces a fresh
/// snapshot of its router declarations and `include_router` edges.
pub fn insert_router_facts(
    &mut self,
    module_id: String,
    facts: crate::auth_analysis::router_facts::PerFileRouterFacts,
) {
    // Plain map insert: any earlier snapshot for `module_id` is replaced.
    self.router_facts_by_module.insert(module_id, facts);
}

/// Resolve cross-file router-level deps for the file identified by
/// `child_module_id`. Walks every file's persisted
/// `RouterIncludeEdge` list, finds edges whose `child_module_id`
/// matches, and accumulates the parent file's
/// `local_router_deps[parent_var]` against `child_var` — producing
/// a `child_var → Vec<(CallSite, scoped_security)>` map ready to
/// merge into the active file's
/// `AuthorizationModel.cross_file_router_deps`.
///
/// Single-hop only. Transitive lifts (`grandparent.include_router(parent);
/// parent.include_router(child)`) are not currently resolved — the
/// airflow shape that motivated this fix is single-hop, and adding
/// transitive resolution is a follow-up that would also need to
/// model the bare-identifier `outer.include_router(inner_router)`
/// case which the extractor presently skips.
///
/// Returns an empty map when `child_module_id` matches no edges or
/// when the index is empty.
pub fn resolve_cross_file_router_deps(
    &self,
    child_module_id: &str,
) -> HashMap> {
    let mut out: HashMap> = HashMap::new();
    if self.router_facts_by_module.is_empty() {
        return out;
    }
    // NOTE(review): this iterates a `HashMap`'s values, whose order is
    // unspecified, so the order of deps pushed into each `Vec` may vary
    // between runs. Confirm no consumer relies on it.
    for facts in self.router_facts_by_module.values() {
        for edge in &facts.include_router_edges {
            if edge.child_module_id != child_module_id {
                continue;
            }
            // Look up the parent's deps in the SAME file's
            // local_router_deps map (parent declarations and the
            // include_router edge live in the same file).
            let Some(parent_deps) = facts.local_router_deps.get(&edge.parent_var) else {
                continue;
            };
            // Skip before touching `out` so a parent with no deps does
            // not create an empty entry for `child_var`.
            if parent_deps.is_empty() {
                continue;
            }
            let entry = out.entry(edge.child_var.clone()).or_default();
            for dep in parent_deps {
                // Dedup by (callee name, scoped flag) so multiple
                // parents declaring the same dep don't double-fire.
                let already = entry
                    .iter()
                    .any(|(call, scoped)| call.name == dep.0.name && *scoped == dep.1);
                if !already {
                    entry.push(dep.clone());
                }
            }
        }
    }
    out
}

/// Count of files that contributed router facts to the index.
/// Exposed for `tracing::debug!` observability.
pub fn router_facts_len(&self) -> usize {
    self.router_facts_by_module.len()
}

/// Insert a per-file Phase-09 cross-package import map. Last-writer-wins
/// per namespace key — re-analysing a file produces a fresh snapshot
/// of its `(local_name → FuncKey)` resolutions.
///
/// An empty `map` is ignored: nothing is stored, and any entry already
/// present for `namespace` is left untouched.
pub fn insert_cross_package_imports(
    &mut self,
    namespace: String,
    map: std::sync::Arc>,
) {
    if map.is_empty() {
        return;
    }
    self.cross_package_imports_by_namespace
        .insert(namespace, map);
}

/// Look up a per-file cross-package import map by file namespace.
/// Used by [`crate::taint::ssa_transfer`]'s inline-analysis frame to
/// recover the callee body's own import view when the body was loaded
/// from SQLite (where the Arc on `CalleeSsaBody` is stripped by
/// `#[serde(skip)]`).
pub fn get_cross_package_imports(
    &self,
    namespace: &str,
) -> Option<&std::sync::Arc>> {
    self.cross_package_imports_by_namespace.get(namespace)
}

/// Count of files that contributed cross-package import maps.
pub fn cross_package_imports_len(&self) -> usize {
    self.cross_package_imports_by_namespace.len()
}

/// Insert a cross-file callee body.
///
/// See [`insert_ssa`](Self::insert_ssa) for the identity-safety rule.
/// Bodies additionally carry `param_count`, giving a hard structural
/// signal: a collision between bodies with different `param_count`
/// cannot be the same function and is always rekeyed.
pub fn insert_body(&mut self, key: FuncKey, body: crate::taint::ssa_transfer::CalleeSsaBody) {
    // Resolve the final key first; the insert then replaces whatever
    // body (if any) is stored under it.
    let key = self.reconcile_body_key(key, &body);
    self.bodies_by_key.insert(key, body);
}

/// Exact lookup of a cross-file callee body by fully-qualified key.
pub fn get_body(&self, key: &FuncKey) -> Option<&crate::taint::ssa_transfer::CalleeSsaBody> {
    self.bodies_by_key.get(key)
}

/// Direct access to the cross-file body map.
///
/// Returns `None` when no cross-file bodies were loaded (empty map).
/// The taint engine uses this to thread bodies through
/// [`crate::taint::ssa_transfer::SsaTaintTransfer::cross_file_bodies`]
/// and `resolve_callee` for context-sensitive cross-file inline
/// analysis.
pub fn bodies_by_key(
    &self,
) -> Option<&HashMap> {
    if self.bodies_by_key.is_empty() {
        None
    } else {
        Some(&self.bodies_by_key)
    }
}

/// Count of cross-file bodies currently loaded. Exposed for
/// `tracing::debug!` observability; lets callers distinguish "no
/// bodies available" from "bodies available but inline didn't fire".
pub fn bodies_len(&self) -> usize {
    self.bodies_by_key.len()
}

/// Resolve a bare callee name to a cross-file body.
///
/// Uses `resolve_callee_key()` for strict deterministic resolution,
/// then checks `bodies_by_key`. Returns `None` on `Ambiguous` or
/// `NotFound`, and also when the key resolves but no body is stored
/// under it.
pub fn resolve_callee_body(
    &self,
    lang: Lang,
    name: &str,
    arity_hint: Option,
    caller_namespace: &str,
) -> Option<&crate::taint::ssa_transfer::CalleeSsaBody> {
    match self.resolve_callee_key(name, lang, caller_namespace, arity_hint) {
        CalleeResolution::Resolved(key) => self.bodies_by_key.get(&key),
        CalleeResolution::NotFound | CalleeResolution::Ambiguous(_) => None,
    }
}

/// True when the coarse summaries, SSA summaries (and their secondary
/// index), auth summaries, router facts and cross-package import maps
/// are all empty.
///
/// NOTE(review): `bodies_by_key` and `hierarchy` are not consulted, so
/// an instance holding only callee bodies still reports empty. Confirm
/// that this is intentional.
#[allow(dead_code)] // used by tests and future call-graph consumers
pub fn is_empty(&self) -> bool {
    self.by_key.is_empty()
        && self.ssa_by_key.is_empty()
        && self.ssa_by_lang_ns_name.is_empty()
        && self.auth_by_key.is_empty()
        && self.router_facts_by_module.is_empty()
        && self.cross_package_imports_by_namespace.is_empty()
}

/// Iterate over all (key, summary) pairs.
pub fn iter(&self) -> impl Iterator {
    self.by_key.iter()
}

/// Snapshot the convergence-relevant fields of every summary.
///
/// Returns `(source_caps, sanitizer_caps, sink_caps, propagating_params)`
/// per key. Used by the SCC fixed-point loop to detect when an iteration
/// has not changed any summary, i.e. convergence.
///
/// The result is an owned copy: every key and `propagating_params`
/// vector is cloned.
pub fn snapshot_caps(&self) -> HashMap)> {
    self.by_key
        .iter()
        .map(|(k, s)| {
            (
                k.clone(),
                (
                    s.source_caps,
                    s.sanitizer_caps,
                    s.sink_caps,
                    s.propagating_params.clone(),
                ),
            )
        })
        .collect()
}

/// Snapshot the SSA summaries for convergence detection.
///
/// Used alongside [`Self::snapshot_caps`] in the SCC fixed-point loop so that
/// SSA-only refinements (e.g. a `StripBits` transform appearing after a
/// cross-file sanitizer is resolved) are not invisible to convergence.
///
/// Unlike [`Self::snapshot_caps`], this returns a borrow of the live map,
/// not a copy; a caller that needs a before/after comparison has to
/// clone it.
pub fn snapshot_ssa(&self) -> &HashMap {
    &self.ssa_by_key
}

/// Rust-only resolution that consults the caller's `use` map before
/// falling back to generic resolution.
///
/// The caller passes the callee's leaf name plus the (optional)
/// structured qualifier that `CalleeSite.qualifier` carries for Rust
/// call sites (e.g. `"crate::auth::token"` for `crate::auth::token::validate()`).
/// The `use` map and wildcard list come from the caller's own
/// [`FuncSummary`].
///
/// Resolution order:
///
/// 1. If the caller has a `use_map` and (qualifier, name) resolves to a
///    fully qualified path, split it into `(module_path, name)` and look
///    that pair up in the Rust module index. If arity filtering leaves
///    exactly one candidate → resolved. (Any stripping of a leading
///    `crate::` for this step is presumably done inside the
///    `rust_resolve` helpers — verify.)
/// 2. Otherwise, for each wildcard prefix in scope (with a leading
///    `crate::` stripped), try `(wildcard_prefix, name)` in the module
///    index. If across all wildcards exactly one arity-filtered
///    candidate appears → resolved.
/// 3. Otherwise fall through to [`Self::resolve_callee_key_with_container`]
///    with no `container_hint`, meaning only the existing namespace /
///    arity disambiguation applies.
///
/// A `None` use_map (non-Rust file or no `use` declarations) makes this
/// equivalent to the generic path.
pub fn resolve_callee_key_rust(
    &self,
    callee: &str,
    qualifier: Option<&str>,
    arity_hint: Option,
    caller_namespace: &str,
    use_map: Option<&crate::rust_resolve::RustUseMap>,
) -> CalleeResolution {
    use crate::rust_resolve::{resolve_with_use_map, split_module_and_name};

    // 1) Try direct use-map resolution. Anything other than exactly one
    //    arity-matched candidate falls through to the wildcard step.
    if let Some(um) = use_map
        && let Some(full) = resolve_with_use_map(um, qualifier, callee)
    {
        let (module_path, name) = split_module_and_name(&full);
        if !module_path.is_empty() {
            let candidates = self.lookup_rust_module(&module_path, &name);
            // With no arity hint, every candidate is kept.
            let filtered: Vec<&FuncKey> = match arity_hint {
                Some(a) => candidates
                    .iter()
                    .filter(|(k, _)| k.arity == Some(a))
                    .map(|(k, _)| *k)
                    .collect(),
                None => candidates.iter().map(|(k, _)| *k).collect(),
            };
            if filtered.len() == 1 {
                return CalleeResolution::Resolved(filtered[0].clone());
            }
        }
    }

    // 2) Try wildcards. Each wildcard expands `use prefix::*;` into an
    //    implicit `(prefix, name)` candidate set; we union across all
    //    wildcards and only resolve when exactly one matches under the
    //    arity filter.
    if let Some(um) = use_map
        && !um.wildcards.is_empty()
    {
        let mut collected: Vec = Vec::new();
        for w in &um.wildcards {
            // The module index is probed without a leading `crate::`;
            // a wildcard that is empty after stripping is skipped.
            let prefix = w.strip_prefix("crate::").unwrap_or(w);
            if prefix.is_empty() {
                continue;
            }
            for (k, _) in self.lookup_rust_module(prefix, callee) {
                if let Some(a) = arity_hint
                    && k.arity != Some(a)
                {
                    continue;
                }
                // The same key reached through two wildcards counts once.
                if !collected.contains(k) {
                    collected.push(k.clone());
                }
            }
        }
        if collected.len() == 1 {
            return CalleeResolution::Resolved(collected.remove(0));
        }
    }

    // 3) Fall back to generic same-language resolution.
    self.resolve_callee_key_with_container(
        callee,
        Lang::Rust,
        caller_namespace,
        None,
        arity_hint,
    )
}

/// Resolve a bare (already-normalized) callee name to a [`FuncKey`].
///
/// Thin wrapper around [`Self::resolve_callee`] that constructs a minimal
/// [`CalleeQuery`] with no qualified hints.
/// Kept for call sites that
/// only hold a string callee and an arity; prefer [`Self::resolve_callee`]
/// whenever receiver / qualifier / container information is available.
pub fn resolve_callee_key(
    &self,
    callee: &str,
    caller_lang: Lang,
    caller_namespace: &str,
    arity_hint: Option,
) -> CalleeResolution {
    // Every hint except the arity is left unset.
    self.resolve_callee(&CalleeQuery {
        name: callee,
        caller_lang,
        caller_namespace,
        caller_container: None,
        receiver_type: None,
        namespace_qualifier: None,
        receiver_var: None,
        arity: arity_hint,
    })
}

/// Resolve a callee name with an optional container hint.
///
/// Legacy entry point, kept so tests and older callers compile
/// unchanged. `container_hint` is interpreted as a syntactic
/// container qualifier (not an authoritative receiver type), so a
/// miss is allowed to fall through to leaf-name lookup. New
/// callers should route through [`Self::resolve_callee`] and classify
/// their hint as `receiver_type` vs `namespace_qualifier` vs
/// `receiver_var` so the resolver can apply the correct policy.
pub fn resolve_callee_key_with_container(
    &self,
    callee: &str,
    caller_lang: Lang,
    caller_namespace: &str,
    container_hint: Option<&str>,
    arity_hint: Option,
) -> CalleeResolution {
    // The hint is forwarded as the non-authoritative
    // `namespace_qualifier`; `receiver_type` stays unset.
    self.resolve_callee(&CalleeQuery {
        name: callee,
        caller_lang,
        caller_namespace,
        caller_container: None,
        receiver_type: None,
        namespace_qualifier: container_hint,
        receiver_var: None,
        arity: arity_hint,
    })
}

/// Resolve a callee with full structured hints.
///
/// **New resolution order** (qualified identity primary, leaf name
/// fallback):
///
/// 1. **Receiver-type qualified** — if `receiver_type` is set,
///    consult `by_lang_qualified[{receiver_type}::{name}]` with the
///    arity filter. Exactly-one → resolved; same-namespace
///    tie-breaker if multiple. *Receiver types are authoritative*:
///    a miss does not fall back to bare leaf lookup (that would be
///    a silent reinterpretation). A miss returns `Ambiguous` with the
///    arity-matched leaf candidates when any exist, else `NotFound`.
/// 2. **Namespace-qualifier qualified** — if `namespace_qualifier`
///    is set, try the qualified index with that container.
///    Non-authoritative: a miss falls through.
/// 3. **Caller-self-container** — when the caller lives inside a
///    container (method body), try the qualified index against the
///    caller's own container. Resolves bare `foo()` self-calls
///    inside a class without collapsing into an unrelated same-leaf
///    definition in another file.
/// 4. **Same-namespace unique leaf** — intra-file bare-leaf call:
///    if the caller's namespace contains exactly one arity-matched
///    candidate with this leaf, resolve to it. `NotFound` is returned
///    here when the leaf name has no arity-matched candidate at all.
/// 5. **Receiver-variable tie-break** — if the same-namespace
///    lookup misses but the raw call came with a receiver variable,
///    try `{receiver_var}::{name}` as a last qualified attempt.
///
/// 5.5. **Bare-call free-function preference** — for a truly bare
///    call (no receiver type, no namespace qualifier, no receiver
///    variable), if exactly one same-namespace arity-matched
///    candidate has an empty container, resolve to it. A class
///    method cannot be invoked with bare-call syntax from outside
///    its class, so this disambiguation is safe even when same-name
///    methods exist elsewhere in the file.
/// 6. **Leaf-name fallback** — arity-filtered same-language lookup.
///    Unique → resolved. Multiple + we had any qualified hint →
///    Ambiguous (refuse to guess when a qualifier exists but
///    missed). Multiple + no qualified hint → Ambiguous, narrowed to
///    the same-namespace candidates when there are any.
///
/// NOTE(review): earlier prose described the last case as "narrow by
/// namespace, then container"; the body narrows by namespace only.
pub fn resolve_callee(&self, q: &CalleeQuery<'_>) -> CalleeResolution {
    // ── Helpers ─────────────────────────────────────────────────
    // A missing arity hint matches every candidate.
    let arity_matches = |k: &FuncKey| match q.arity {
        Some(a) => k.arity == Some(a),
        None => true,
    };

    // Look up `{container}::{name}` and return a single arity-matched
    // candidate if one exists (using same-namespace to break ties).
    // An empty container never matches.
    let try_qualified = |container: &str| -> Option {
        if container.is_empty() {
            return None;
        }
        let qual = format!("{container}::{}", q.name);
        let candidates: Vec<&FuncKey> = self
            .lookup_qualified(q.caller_lang, &qual)
            .into_iter()
            .map(|(k, _)| k)
            .filter(|k| arity_matches(k))
            .collect();
        match candidates.len() {
            0 => None,
            1 => Some(candidates[0].clone()),
            _ => {
                let same_ns: Vec<&FuncKey> = candidates
                    .iter()
                    .copied()
                    .filter(|k| k.namespace == q.caller_namespace)
                    .collect();
                if same_ns.len() == 1 {
                    Some(same_ns[0].clone())
                } else {
                    None
                }
            }
        }
    };

    // ── Step 1: receiver_type (authoritative) ───────────────────
    if let Some(rt) = q.receiver_type {
        if let Some(key) = try_qualified(rt) {
            return CalleeResolution::Resolved(key);
        }
        // Authoritative miss: before returning, check whether any
        // candidate exists at all for the leaf name. If there are
        // some, report Ambiguous with the leaf candidates (so the
        // caller knows we saw the name but refused to pick the
        // wrong container). If there are none, return NotFound.
        let bare: Vec<&FuncKey> = self
            .lookup_same_lang(q.caller_lang, q.name)
            .into_iter()
            .map(|(k, _)| k)
            .filter(|k| arity_matches(k))
            .collect();
        return if bare.is_empty() {
            CalleeResolution::NotFound
        } else {
            CalleeResolution::Ambiguous(bare.into_iter().cloned().collect())
        };
    }

    // ── Step 2: namespace_qualifier (non-authoritative) ─────────
    if let Some(nq) = q.namespace_qualifier
        && let Some(key) = try_qualified(nq)
    {
        return CalleeResolution::Resolved(key);
    }

    // ── Step 3: caller self-container ───────────────────────────
    if let Some(cc) = q.caller_container
        && let Some(key) = try_qualified(cc)
    {
        return CalleeResolution::Resolved(key);
    }

    // ── Step 4: same-namespace unique leaf ──────────────────────
    let all_candidates: Vec<&FuncKey> = self
        .lookup_same_lang(q.caller_lang, q.name)
        .into_iter()
        .map(|(k, _)| k)
        .collect();
    if all_candidates.is_empty() {
        return CalleeResolution::NotFound;
    }

    let arity_filtered: Vec<&FuncKey> = all_candidates
        .iter()
        .copied()
        .filter(|k| arity_matches(k))
        .collect();
    if arity_filtered.is_empty() {
        return CalleeResolution::NotFound;
    }

    let same_ns: Vec<&FuncKey> = arity_filtered
        .iter()
        .copied()
        .filter(|k| k.namespace == q.caller_namespace)
        .collect();
    if same_ns.len() == 1 {
        return CalleeResolution::Resolved(same_ns[0].clone());
    }

    // ── Step 5: receiver_var tie-break (soft) ───────────────────
    if let Some(rv) = q.receiver_var
        && let Some(key) = try_qualified(rv)
    {
        return CalleeResolution::Resolved(key);
    }

    // ── Step 5.5: bare-call free-function preference ────────────
    // A call with no receiver, no namespace qualifier, and no
    // authoritative receiver type is syntactically a free-function
    // invocation: a class method cannot be invoked that way from
    // outside its own class (intra-class self-calls were already
    // resolved by step 3). When the same-namespace candidate set
    // contains exactly one empty-container entry, it is the
    // unambiguous target; returning Ambiguous here would be a
    // silent false negative whenever a top-level helper happens to
    // share a name with some method elsewhere in the file.
    let syntactic_bare = q.receiver_type.is_none()
        && q.namespace_qualifier.is_none()
        && q.receiver_var.is_none();
    if syntactic_bare {
        let empty_container_same_ns: Vec<&FuncKey> = same_ns
            .iter()
            .copied()
            .filter(|k| k.container.is_empty())
            .collect();
        if empty_container_same_ns.len() == 1 {
            return CalleeResolution::Resolved(empty_container_same_ns[0].clone());
        }
    }

    // ── Step 6: leaf fallback ───────────────────────────────────
    if arity_filtered.len() == 1 {
        return CalleeResolution::Resolved(arity_filtered[0].clone());
    }

    // Multiple arity-matched candidates remain. When a qualified
    // hint was supplied but missed, refuse to guess: a silent
    // leaf-name pick would defeat the point of qualified-first
    // resolution. (`receiver_type` is handled in Step 1 and never
    // reaches here; `namespace_qualifier` / `caller_container`
    // missing their target flow through as a soft miss.)
    if q.has_qualified_hint() {
        return CalleeResolution::Ambiguous(arity_filtered.into_iter().cloned().collect());
    }

    // No qualified hints whatsoever: tolerate namespace narrowing.
    // The `1` arm cannot be reached in practice, because a unique
    // same-namespace candidate already returned in Step 4.
    match same_ns.len() {
        1 => CalleeResolution::Resolved(same_ns[0].clone()),
        0 => CalleeResolution::Ambiguous(arity_filtered.into_iter().cloned().collect()),
        _ => CalleeResolution::Ambiguous(same_ns.into_iter().cloned().collect()),
    }
}

/// Install / refresh the type-hierarchy index from the currently
/// loaded summaries. Idempotent: calling twice rebuilds.
///
/// Call this once after pass-1 merge (and again whenever
/// summary state changes in a way that could affect virtual
/// dispatch, typically: after the call-graph is rebuilt mid-fixed-point).
/// `merge()` automatically invalidates so a forgotten reinstall
    /// degrades to today's behaviour rather than a stale lookup.
    pub fn install_hierarchy(&mut self) {
        // The index is built from the current summaries first and then
        // stored; a repeated call simply overwrites the previous index.
        self.hierarchy = Some(crate::callgraph::TypeHierarchyIndex::build(self));
    }

    /// Shared reference to the hierarchy index, or `None` when no index
    /// is currently installed.
    pub fn hierarchy(&self) -> Option<&crate::callgraph::TypeHierarchyIndex> {
        self.hierarchy.as_ref()
    }

    /// Upper bound on the number of keys a single call site may fan out
    /// to through the type hierarchy; see
    /// [`Self::resolve_callee_widened`] for the rationale.  Exposed
    /// publicly so tests can assert cap behaviour without hard-coding
    /// the value.
    pub const MAX_HIERARCHY_FANOUT: usize = 8;

    /// Resolve a call site to *every* candidate FuncKey reachable
    /// through type-hierarchy fan-out.  This is the runtime counterpart
    /// of the [`crate::callgraph::TypeHierarchyIndex::resolve_with_hierarchy`]
    /// step that the call-graph builder applies at edge-construction time.
    ///
    /// Behaviour:
    ///
    /// * `receiver_type = None` → falls through to
    ///   [`Self::resolve_callee`]; returns `[k]` on `Resolved`, `[]`
    ///   otherwise.
    /// * `receiver_type = Some(rt)` and either no hierarchy is installed
    ///   or `rt` has no recorded sub-types → identical fall-through;
    ///   the hierarchy lookup is a no-op.
    /// * `receiver_type = Some(rt)` with sub-types `s1, s2, …` →
    ///   union of `lookup_qualified` for `(rt, s1, s2, …)` after arity
    ///   filtering.  Result is dedup'd in insertion order
    ///   (direct-receiver match first, then each sub-type's match).
    ///
    /// Hard cap: at most [`Self::MAX_HIERARCHY_FANOUT`] keys are
    /// returned.  When the cap fires, the cap-hit is logged at `debug`
    /// and the tail impls are silently dropped, over-fanning is a
    /// precision-tax knob, not a soundness one.
/// /// Empty result + non-empty `subs` triggers a /// secondary fall-through to [`Self::resolve_callee`] so a /// type-fact misclassification (receiver typed as a super-class /// that has no method by this name on any sub) does not silently /// regress to "no resolution at all", the leaf-name path can still /// pick up a match. This preserves the /// "subset of today's targets, never a superset" rule under /// hierarchy-aware resolution failure. pub fn resolve_callee_widened(&self, q: &CalleeQuery<'_>) -> Vec { let arity_matches = |k: &FuncKey| match q.arity { Some(a) => k.arity == Some(a), None => true, }; let single_fallback = || -> Vec { match self.resolve_callee(q) { CalleeResolution::Resolved(k) => vec![k], _ => Vec::new(), } }; // Hierarchy fan-out only fires when the call has an // authoritative receiver type AND the index is installed AND // the type has recorded sub-types. Every other case collapses // to today's resolver. let Some(rt) = q.receiver_type.filter(|s| !s.is_empty()) else { return single_fallback(); }; let Some(h) = self.hierarchy.as_ref() else { return single_fallback(); }; let subs = h.subs_of(q.caller_lang, rt); if subs.is_empty() { return single_fallback(); } // Union direct + sub-type matches in insertion order. Dedup is // O(n²) over the cap (n ≤ 8) so a HashSet would be wasted // overhead; linear scan is faster and order-preserving. 
let mut out: Vec = Vec::new(); let push_unique = |out: &mut Vec, k: FuncKey| -> bool { if !out.iter().any(|e| e == &k) { out.push(k); true } else { false } }; let qualified_lookup = |container: &str| -> Vec { let qual = format!("{container}::{}", q.name); self.lookup_qualified(q.caller_lang, &qual) .into_iter() .map(|(k, _)| k.clone()) .filter(|k| arity_matches(k)) .collect() }; for k in qualified_lookup(rt) { push_unique(&mut out, k); if out.len() >= Self::MAX_HIERARCHY_FANOUT { tracing::debug!( receiver = rt, method = q.name, cap = Self::MAX_HIERARCHY_FANOUT, "hierarchy fan-out cap reached on direct receiver match" ); return out; } } for sub in subs { for k in qualified_lookup(sub.as_str()) { push_unique(&mut out, k); if out.len() >= Self::MAX_HIERARCHY_FANOUT { tracing::debug!( receiver = rt, method = q.name, cap = Self::MAX_HIERARCHY_FANOUT, "hierarchy fan-out cap reached; tail impls dropped" ); return out; } } } if out.is_empty() { // Hierarchy widening produced nothing (e.g., none of the // recorded sub-types declare this method). Fall back to // today's qualified-first resolver so the misclassified- // type case still finds a leaf match, the same // "preserve today's behaviour on miss" rule the call-graph // builder applies. return single_fallback(); } out } } impl std::fmt::Debug for GlobalSummaries { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("GlobalSummaries") .field("len", &self.by_key.len()) .field("ssa_len", &self.ssa_by_key.len()) .field("bodies_len", &self.bodies_by_key.len()) .field("auth_len", &self.auth_by_key.len()) .field("router_facts_len", &self.router_facts_by_module.len()) .field( "cross_package_imports_len", &self.cross_package_imports_by_namespace.len(), ) .finish() } } /// Return `true` iff two `FuncSummary`s can be safely union-merged at the /// same `FuncKey`. /// /// Only fields that a single function definition is guaranteed to agree on /// are compared. 
Behaviour fields (`source_caps`, `propagating_params`, /// `callees`, …) are deliberately ignored: merge is *allowed* to combine /// those. The test is symmetric. /// /// Comparison rules /// ──────────────── /// * **`param_count` / `kind` / `container`**, unconditional agreement. /// Any mismatch is a hard collision between distinct functions. /// * **`file_path`**, agree when both sides are populated. A blank path /// can come from synthetic summaries constructed in tests / interop /// configs and should not force a split. /// * **`param_names`**, agree when both sides are populated. Legacy /// summaries may persist with empty names; treating empty as "unknown" /// avoids gratuitous splits while still catching real divergence. /// * **`module_path`**, Rust-only. Agreed when both sides are `Some`. /// A missing module path on one side is legacy-compatible; two *distinct* /// `Some` values mean the two summaries belong to different crates' /// module trees. pub(crate) fn summaries_compatible(a: &FuncSummary, b: &FuncSummary) -> bool { if a.param_count != b.param_count { return false; } if a.kind != b.kind { return false; } if a.container != b.container { return false; } if !a.file_path.is_empty() && !b.file_path.is_empty() && a.file_path != b.file_path { return false; } if !a.param_names.is_empty() && !b.param_names.is_empty() && a.param_names != b.param_names { return false; } match (&a.module_path, &b.module_path) { (Some(l), Some(r)) if l != r => return false, _ => {} } true } /// Derive a deterministic synthetic disambiguator from the /// identity-relevant fields of a `FuncSummary`. /// /// The top bit is **not** set here, the caller composes the final value /// via `SYNTHETIC_DISAMBIG_BIT | (hash & !SYNTHETIC_DISAMBIG_BIT)` so that /// (a) the caller can safely bump the low bits to probe for a free slot, /// and (b) the synthetic namespace stays disjoint from byte-offset /// disambigs produced by `cfg.rs`. 
pub(crate) fn synthesize_disambig(summary: &FuncSummary) -> u32 {
    // NOTE(review): std documents `DefaultHasher`'s algorithm as
    // unspecified across releases; if these values are persisted across
    // toolchain upgrades, confirm that is acceptable.
    let mut hasher = std::collections::hash_map::DefaultHasher::new();

    // Feed order is part of the resulting value; keep it stable.
    let FuncSummary {
        param_count,
        param_names,
        container,
        kind,
        file_path,
        source_caps,
        sanitizer_caps,
        sink_caps,
        module_path,
        ..
    } = summary;
    param_count.hash(&mut hasher);
    param_names.hash(&mut hasher);
    container.hash(&mut hasher);
    kind.hash(&mut hasher);
    file_path.hash(&mut hasher);
    source_caps.hash(&mut hasher);
    sanitizer_caps.hash(&mut hasher);
    sink_caps.hash(&mut hasher);
    module_path.hash(&mut hasher);

    // Only the low 32 bits of the 64-bit digest are kept.
    hasher.finish() as u32
}

/// Return `true` iff both the existing and the new `SsaFuncSummary` are
/// consistent with the arity recorded on the shared `FuncKey`.
///
/// `SsaFuncSummary` carries no explicit `param_count`, so each summary
/// is checked on its own through [`ssa_summary_fits_arity`]: every
/// parameter index it references must fit inside `key_arity`.  When
/// `key_arity` is `None` both checks pass trivially.
///
/// The existing entry is checked first: if it is itself inconsistent
/// with the key, the pair is reported as incompatible rather than
/// letting that inconsistency mask a real collision with the new entry.
fn ssa_summaries_compatible(
    existing: &SsaFuncSummary,
    new: &SsaFuncSummary,
    key_arity: Option<usize>,
) -> bool {
    ssa_summary_fits_arity(existing, key_arity) && ssa_summary_fits_arity(new, key_arity)
}

/// Every parameter index referenced by `summary` must fit inside
/// `key_arity` when it is known.  `None` (unknown arity) accepts any
/// index.
fn ssa_summary_fits_arity(summary: &SsaFuncSummary, key_arity: Option) -> bool { let arity = match key_arity { Some(a) => a, None => return true, }; let refs = summary .param_to_return .iter() .map(|(i, _)| *i) .chain(summary.param_to_sink.iter().map(|(i, _)| *i)) .chain(summary.param_to_sink_param.iter().map(|(i, _, _)| *i)) .chain(summary.param_container_to_return.iter().copied()) .chain( summary .param_to_container_store .iter() .flat_map(|(a, b)| [*a, *b]), ) .chain(summary.source_to_callback.iter().map(|(i, _)| *i)) .chain(summary.abstract_transfer.iter().map(|(i, _)| *i)) .chain(summary.param_return_paths.iter().map(|(i, _)| *i)); for i in refs { if i >= arity { return false; } } // Every parameter referenced by a points-to edge must also fit the // key's arity. An overflow-flagged summary is conservative by // construction and can be kept as-is. if let Some(max) = summary.points_to.max_param_index() && (max as usize) >= arity { return false; } true } /// Derive a deterministic synthetic disambiguator for an /// `SsaFuncSummary`. Mirrors `synthesize_disambig` but restricted to /// SSA-level structural signals. 
fn synthesize_ssa_disambig(summary: &SsaFuncSummary) -> u32 { let mut h = std::collections::hash_map::DefaultHasher::new(); summary.param_to_return.len().hash(&mut h); summary.param_to_sink.len().hash(&mut h); summary.source_caps.bits().hash(&mut h); summary.param_to_sink_param.len().hash(&mut h); summary.param_container_to_return.len().hash(&mut h); summary.param_to_container_store.len().hash(&mut h); summary.receiver_to_sink.bits().hash(&mut h); summary.receiver_to_return.is_some().hash(&mut h); summary.return_type.is_some().hash(&mut h); summary.return_abstract.is_some().hash(&mut h); summary.source_to_callback.len().hash(&mut h); summary.abstract_transfer.len().hash(&mut h); summary.param_return_paths.len().hash(&mut h); summary.points_to.edges.len().hash(&mut h); summary.points_to.overflow.hash(&mut h); summary.points_to.returns_fresh_alloc.hash(&mut h); h.finish() as u32 } /// Merge a set of per‑file summaries into a single `GlobalSummaries` map. /// /// Merging only happens for exact `FuncKey` matches (same lang + namespace + /// name + arity). Functions with the same bare name but different languages /// or namespaces are stored separately. pub fn merge_summaries( per_file: impl IntoIterator, scan_root: Option<&str>, ) -> GlobalSummaries { let mut map = GlobalSummaries::new(); for fs in per_file { let key = fs.func_key(scan_root); map.insert(key, fs); } map } #[cfg(test)] mod tests;