Performance and precision pass (#64)

This commit is contained in:
Eli Peter 2026-05-04 19:58:04 -04:00 committed by GitHub
parent c7c5e0f3a1
commit fb698d2c27
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
97 changed files with 9932 additions and 517 deletions

View file

@ -21,6 +21,7 @@ pub mod ssa_summary;
use crate::labels::Cap;
use crate::summary::ssa_summary::SsaFuncSummary;
use crate::symbol::{FuncKey, FuncKind, Lang, normalize_namespace};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Deserializer, Serialize};
use smallvec::SmallVec;
use std::collections::{BTreeMap, HashMap};
@ -517,15 +518,20 @@ impl<'a> CalleeQuery<'a> {
/// for same-language resolution in the taint engine.
#[derive(Default)]
pub struct GlobalSummaries {
by_key: HashMap<FuncKey, FuncSummary>,
/// FxHashMap (rustc_hash) replaces stdlib SipHash. FuncKey carries 3
/// String fields, so any HashMap operation walks ≥30 bytes through the
/// hasher; FxHash is ~5x faster than SipHash on this workload. Seed
/// is fixed (no DoS hardening), which is fine for an in-process index
/// keyed by static program-derived names.
by_key: FxHashMap<FuncKey, FuncSummary>,
/// Bare leaf-name index, kept for compatibility with callers that only
/// see an unqualified call string. A single name may map to many keys
/// across containers / files / arities.
by_lang_name: HashMap<(Lang, String), Vec<FuncKey>>,
by_lang_name: FxHashMap<(Lang, String), Vec<FuncKey>>,
/// Container-qualified index: keyed on `"{container}::{name}"` (or just
/// `name` for free functions). Used to resolve calls when the call-site
/// can supply a receiver / container hint (e.g. `OrderService::process`).
by_lang_qualified: HashMap<(Lang, String), Vec<FuncKey>>,
by_lang_qualified: FxHashMap<(Lang, String), Vec<FuncKey>>,
/// Rust-only secondary index keyed on `(module_path, name)`.
///
/// Populated whenever a Rust [`FuncSummary`] is inserted with a
@ -533,7 +539,7 @@ pub struct GlobalSummaries {
/// candidates by their crate-relative module rather than their
/// filesystem path. Same name / module / arity overloads land on the
/// same vector, arity narrowing happens at resolution time.
by_rust_module: HashMap<(String, String), Vec<FuncKey>>,
by_rust_module: FxHashMap<(String, String), Vec<FuncKey>>,
/// Precise SSA-derived per-parameter summaries, keyed by `FuncKey`.
/// These take precedence over `FuncSummary` during callee resolution.
ssa_by_key: HashMap<FuncKey, SsaFuncSummary>,
@ -546,6 +552,18 @@ pub struct GlobalSummaries {
/// pass 1 and consumed by
/// [`crate::auth_analysis::run_auth_analysis`] during pass 2.
auth_by_key: HashMap<FuncKey, crate::auth_analysis::model::AuthCheckSummary>,
/// Per-Python-file router declarations + `include_router` edges,
/// keyed by `module_id_for_storage(file_path)` (basename without
/// `.py`, or `parent_dir::__init__` for `__init__.py`). Populated
/// during pass 1 and consumed by
/// [`Self::resolve_cross_file_router_deps`] at pass 2 entry to lift
/// FastAPI router-level `dependencies=[Security(...)]` declared in a
/// parent file (`__init__.py` calling
/// `<parent>.include_router(<child>.router, ...)`) onto the bare
/// child router declared in another file — closing the airflow
/// execution-API auth-recognition gap on routes attached to bare
/// child routers.
router_facts_by_module: HashMap<String, crate::auth_analysis::router_facts::PerFileRouterFacts>,
/// Type hierarchy index for runtime virtual-dispatch fan-out.
///
/// Installed by [`Self::install_hierarchy`] after pass 1 from the
@ -856,6 +874,11 @@ impl GlobalSummaries {
for (key, auth_sum) in other.auth_by_key {
self.auth_by_key.insert(key, auth_sum);
}
// Router facts: last-writer-wins per (module_id) key. Re-analysing
// a file produces a fresh snapshot of its router declarations + edges.
for (module_id, facts) in other.router_facts_by_module {
self.router_facts_by_module.insert(module_id, facts);
}
// Hierarchy index: invalidate after a merge so the next consumer
// sees a freshly-built view that includes `other`'s edges. The
// alternative, point-merging two indexes, is racy when the
@ -991,6 +1014,80 @@ impl GlobalSummaries {
self.auth_by_key.len()
}
/// Store the router facts extracted from one file under `module_id`.
///
/// The index keeps at most one snapshot per `module_id`: inserting again
/// for the same key replaces the earlier snapshot (last writer wins), so
/// re-analysing a file simply overwrites its router declarations and
/// `include_router` edges.
pub fn insert_router_facts(
    &mut self,
    module_id: String,
    facts: crate::auth_analysis::router_facts::PerFileRouterFacts,
) {
    // Any snapshot previously held for this key is discarded on purpose.
    let _replaced = self.router_facts_by_module.insert(module_id, facts);
}
/// Resolve cross-file router-level deps for the file identified by
/// `child_module_id`.
///
/// Scans every stored file's `include_router_edges`, keeps the edges
/// whose `child_module_id` matches, and for each one copies the parent
/// file's `local_router_deps[parent_var]` onto `child_var`. The result is
/// a `<child_var> → Vec<(CallSite, scoped_security)>` map ready to merge
/// into the active file's `AuthorizationModel.cross_file_router_deps`.
///
/// Deps are deduplicated per child var by `(callee name, scoped flag)`,
/// so several parents declaring the same dep contribute a single entry.
///
/// The output is deterministic: contributing parent files are processed
/// in ascending `module_id` order (edges within one file keep their
/// stored order). Without that ordering the dep order, and which
/// `CallSite` survives the dedup, would follow the hash map's arbitrary
/// iteration order and could differ between runs.
///
/// Single-hop only. Transitive lifts (`grandparent.include_router(parent);
/// parent.include_router(child)`) are not resolved here.
///
/// Returns an empty map when `child_module_id` matches no edges or when
/// the index is empty.
pub fn resolve_cross_file_router_deps(
    &self,
    child_module_id: &str,
) -> HashMap<String, Vec<(crate::auth_analysis::model::CallSite, bool)>> {
    // Phase 1: collect every lift that targets the requested child.
    // Only matching edges are retained, so the sort below is over the
    // (typically tiny) set of contributors rather than the whole index.
    let mut lifts = Vec::new();
    for (parent_module_id, facts) in &self.router_facts_by_module {
        for edge in &facts.include_router_edges {
            if edge.child_module_id != child_module_id {
                continue;
            }
            // Parent declarations and the include_router edge live in
            // the same file, so the deps are looked up in the SAME
            // file's `local_router_deps`.
            let Some(parent_deps) = facts.local_router_deps.get(&edge.parent_var) else {
                continue;
            };
            if parent_deps.is_empty() {
                continue;
            }
            lifts.push((parent_module_id, edge, parent_deps));
        }
    }

    // Phase 2: fix the processing order. The sort is stable, so edges
    // that came from the same file stay in their stored order.
    lifts.sort_by(|a, b| a.0.cmp(b.0));

    // Phase 3: fold the lifts into the per-child-var result.
    let mut out: HashMap<String, Vec<(crate::auth_analysis::model::CallSite, bool)>> =
        HashMap::new();
    for (_, edge, parent_deps) in lifts {
        let entry = out.entry(edge.child_var.clone()).or_default();
        for dep in parent_deps {
            // Dedup by (callee name, scoped flag) so multiple parents
            // declaring the same dep don't double-fire.
            let already = entry
                .iter()
                .any(|(call, scoped)| call.name == dep.0.name && *scoped == dep.1);
            if !already {
                entry.push(dep.clone());
            }
        }
    }
    out
}
/// Number of per-file router-fact snapshots currently held in the index
/// (one per `module_id` key). Exposed for `tracing::debug!` observability.
pub fn router_facts_len(&self) -> usize {
    let snapshots = &self.router_facts_by_module;
    snapshots.len()
}
/// Insert a cross-file callee body.
///
/// See [`insert_ssa`](Self::insert_ssa) for the identity-safety rule.
@ -1050,7 +1147,10 @@ impl GlobalSummaries {
/// Reports whether the index is empty: `true` only when the function
/// summaries (`by_key`), SSA summaries (`ssa_by_key`), auth summaries
/// (`auth_by_key`) and router facts (`router_facts_by_module`) are all
/// empty.
// NOTE(review): this span is rendered diff output. The single-line
// three-way conjunction is the pre-change body and the four-way
// conjunction after it (which adds `router_facts_by_module`) is its
// replacement; presumably only the latter exists in the real file.
// NOTE(review): `bodies_by_key` is not consulted here — confirm that is
// intentional.
#[allow(dead_code)] // used by tests and future call-graph consumers
pub fn is_empty(&self) -> bool {
self.by_key.is_empty() && self.ssa_by_key.is_empty() && self.auth_by_key.is_empty()
self.by_key.is_empty()
&& self.ssa_by_key.is_empty()
&& self.auth_by_key.is_empty()
&& self.router_facts_by_module.is_empty()
}
/// Iterate over all (key, summary) pairs.
@ -1582,6 +1682,7 @@ impl std::fmt::Debug for GlobalSummaries {
.field("ssa_len", &self.ssa_by_key.len())
.field("bodies_len", &self.bodies_by_key.len())
.field("auth_len", &self.auth_by_key.len())
.field("router_facts_len", &self.router_facts_by_module.len())
.finish()
}
}

View file

@ -3851,6 +3851,126 @@ fn cross_file_devirt_does_not_union_unrelated_findbyids() {
assert_eq!(cache_sum.tainted_sink_params, vec![0]);
}
/// A parent `__init__.py` that declares a scoped `Security(...)` dep on
/// a router and includes two child routers must lift that dep onto each
/// child: resolving a child's `module_id` yields a
/// `<child_var> → Vec<(CallSite, scoped)>` map, while a module that no
/// edge points at resolves to an empty map.
#[test]
fn resolve_cross_file_router_deps_lifts_parent_security_dep_onto_child_router() {
    use crate::auth_analysis::model::CallSite;
    use crate::auth_analysis::router_facts::{PerFileRouterFacts, RouterIncludeEdge};

    let mut summaries = GlobalSummaries::new();

    // Parent side: `authenticated_router` carries a scoped `require_auth`
    // dep and includes the routers of two child modules.
    let require_auth = CallSite {
        name: "require_auth".into(),
        args: Vec::new(),
        span: (0, 0),
        args_value_refs: Vec::new(),
    };
    let mut init_facts = PerFileRouterFacts::default();
    init_facts
        .local_router_deps
        .insert("authenticated_router".into(), vec![(require_auth, true)]);
    for child_module in ["task_instances", "dag_runs"] {
        init_facts.include_router_edges.push(RouterIncludeEdge {
            parent_var: "authenticated_router".into(),
            child_module_id: child_module.into(),
            child_var: "router".into(),
        });
    }
    summaries.insert_router_facts("routes::__init__".into(), init_facts);

    // Child side: `task_instances.py` only has a bare router, so its own
    // facts are empty and everything must come from the parent.
    summaries.insert_router_facts("task_instances".into(), PerFileRouterFacts::default());

    // `task_instances` inherits exactly the scoped `require_auth` dep.
    let task_lift = summaries.resolve_cross_file_router_deps("task_instances");
    let lifted = task_lift.get("router").expect("router child resolved");
    assert_eq!(lifted.len(), 1);
    let (call, scoped) = &lifted[0];
    assert_eq!(call.name, "require_auth");
    assert!(*scoped, "scoped flag preserved");

    // `dag_runs` hangs off the same parent router and gets the same lift.
    let dag_lift = summaries.resolve_cross_file_router_deps("dag_runs");
    assert_eq!(dag_lift.get("router").map(Vec::len), Some(1));

    // A module no edge refers to receives nothing.
    assert!(summaries
        .resolve_cross_file_router_deps("nonexistent")
        .is_empty());
}
/// An `include_router` edge whose parent var has no entry in the parent
/// file's `local_router_deps` contributes nothing: a lift needs both a
/// matching edge and a non-empty parent dep list.
#[test]
fn resolve_cross_file_router_deps_skips_edges_with_no_parent_deps() {
    use crate::auth_analysis::router_facts::{PerFileRouterFacts, RouterIncludeEdge};

    // The only edge names `ghost_router`, for which no deps were recorded.
    let dangling_edge = RouterIncludeEdge {
        parent_var: "ghost_router".into(),
        child_module_id: "child".into(),
        child_var: "router".into(),
    };
    let mut parent_facts = PerFileRouterFacts::default();
    parent_facts.include_router_edges.push(dangling_edge);

    let mut summaries = GlobalSummaries::new();
    summaries.insert_router_facts("parent".into(), parent_facts);

    assert!(summaries.resolve_cross_file_router_deps("child").is_empty());
}
/// Two different parent files that lift the same dep onto the same child
/// router must yield a single entry: deps are deduplicated by
/// `(callee.name, scoped)`.
#[test]
fn resolve_cross_file_router_deps_dedups_duplicate_parent_deps() {
    use crate::auth_analysis::model::CallSite;
    use crate::auth_analysis::router_facts::{PerFileRouterFacts, RouterIncludeEdge};

    let require_auth = CallSite {
        name: "require_auth".into(),
        args: Vec::new(),
        span: (0, 0),
        args_value_refs: Vec::new(),
    };

    let mut summaries = GlobalSummaries::new();

    // Each parent file declares the SAME `require_auth` dep on its own
    // router var and includes `child.router`.
    for (parent_module, parent_router) in [("parent_a", "router_a"), ("parent_b", "router_b")] {
        let mut parent_facts = PerFileRouterFacts::default();
        parent_facts
            .local_router_deps
            .insert(parent_router.into(), vec![(require_auth.clone(), true)]);
        parent_facts.include_router_edges.push(RouterIncludeEdge {
            parent_var: parent_router.into(),
            child_module_id: "child".into(),
            child_var: "router".into(),
        });
        summaries.insert_router_facts(parent_module.into(), parent_facts);
    }

    let lift = summaries.resolve_cross_file_router_deps("child");
    let lifted = lift.get("router").expect("router resolved");
    assert_eq!(lifted.len(), 1, "duplicate (callee, scoped) deduplicated");
}
// ── the analysis ────────────────────
//
// `GlobalSummaries::resolve_callee_widened` is the runtime counterpart of