diff --git a/src/callgraph.rs b/src/callgraph.rs index 68ff2a97..a179dfd3 100644 --- a/src/callgraph.rs +++ b/src/callgraph.rs @@ -863,6 +863,100 @@ pub fn callers_of(cg: &CallGraph, callee: &FuncKey) -> Vec { .collect() } +/// Reverse-edge BFS: return every [`FuncKey`] that *transitively* calls +/// `callee`, i.e. the union of [`callers_of`] applied recursively until +/// the reverse frontier is exhausted. +/// +/// Used by the chain composer to widen file-scoped reach: a sink inside +/// `internal_helper.py` whose enclosing function is reached only through +/// `routes.py` is *reachable* in the chain sense, but the file-local +/// match in [`crate::chain::edges::locate_reach`] / [`crate::chain::search::compose_chain`] +/// misses it. This helper produces the closure once so callers can +/// resolve reach in O(1) afterwards. +/// +/// Excludes `callee` itself from the returned set, even when it sits on +/// a call cycle, matching the "strictly upstream" semantics callers +/// want. Empty when `callee` is unknown to the graph. +/// +/// Cost: O(V + E) BFS from `callee`'s reverse frontier; bounded by the +/// connected component size. +pub fn callers_transitive(cg: &CallGraph, callee: &FuncKey) -> std::collections::HashSet { + let mut seen: std::collections::HashSet = std::collections::HashSet::new(); + let Some(&start) = cg.index.get(callee) else { + return seen; + }; + let mut frontier: Vec = cg + .graph + .neighbors_directed(start, petgraph::Direction::Incoming) + .collect(); + while let Some(node) = frontier.pop() { + // A call cycle can lead back to `callee`; never report or re-expand it. + if node == start || !seen.insert(cg.graph[node].clone()) { + continue; + } + for next in cg + .graph + .neighbors_directed(node, petgraph::Direction::Incoming) + { + if !seen.contains(&cg.graph[next]) { + frontier.push(next); + } + } + } + seen +} + +/// File-level transitive reach map built from a [`CallGraph`]. +/// +/// For each `namespace` (file path) in the graph, records every other +/// namespace that contains at least one transitive caller.
Built once +/// per scan so the chain composer can widen a finding's +/// `Reach::Reachable` decision beyond the file-local heuristic in +/// [`crate::chain::edges::locate_reach`] without re-running BFS per +/// finding. +/// +/// Map shape: `callee_namespace → { caller_namespace, … }`. A file +/// always appears in its own caller set so intra-file recursion stays +/// reachable. +#[derive(Debug, Default, Clone)] +pub struct FileReachMap { + by_callee_ns: HashMap>, +} + +impl FileReachMap { + /// Build the map from every function's reverse transitive closure. + /// + /// O(V × (V + E)) worst case, but the per-function BFS is sparse on + /// real call graphs (median in-degree < 4 on the eval corpus). + pub fn build(cg: &CallGraph) -> Self { + let mut by_callee_ns: HashMap> = HashMap::new(); + for callee in cg.index.keys() { + let entry = by_callee_ns.entry(callee.namespace.clone()).or_default(); + entry.insert(callee.namespace.clone()); + for caller in callers_transitive(cg, callee) { + entry.insert(caller.namespace); + } + } + FileReachMap { by_callee_ns } + } + + /// True when `caller_ns` transitively reaches at least one function + /// defined in `callee_ns`. False when either namespace is unknown + /// to the graph (conservative: chain composer falls back to the + /// file-local heuristic). + pub fn reaches(&self, caller_ns: &str, callee_ns: &str) -> bool { + self.by_callee_ns + .get(callee_ns) + .is_some_and(|set| set.contains(caller_ns)) + } + + /// Number of distinct callee namespaces tracked. Exposed for + /// diagnostics / tests. + pub fn callee_ns_len(&self) -> usize { + self.by_callee_ns.len() + } +} + /// Compute the set of file namespaces that must be re-analysed when a /// given set of callee [`FuncKey`]s have had their summaries refined. 
/// @@ -2799,4 +2893,73 @@ mod tests { assert!(cg.unresolved_not_found.is_empty()); assert!(cg.unresolved_ambiguous.is_empty()); } + + // ── callers_transitive + FileReachMap ─────────────────────────────── + + /// Three-hop chain across three files: + /// `routes.py::handle -> service.py::process -> helper.py::sink` + /// `callers_transitive(sink)` must return both `process` and `handle`. + /// `FileReachMap` must record `routes.py` and `service.py` as callers + /// of `helper.py`. + #[test] + fn callers_transitive_walks_multi_hop_chain() { + let handle = make_summary("handle", "routes.py", "python", 0, vec!["process"]); + let process = make_summary("process", "service.py", "python", 0, vec!["sink"]); + let sink = make_summary("sink", "helper.py", "python", 0, vec![]); + let gs = merge_summaries(vec![handle, process, sink], None); + let cg = build_call_graph(&gs, &[]); + + let sink_key = FuncKey { + lang: Lang::Python, + namespace: "helper.py".into(), + name: "sink".into(), + arity: Some(0), + ..Default::default() + }; + let transitive = callers_transitive(&cg, &sink_key); + let caller_names: std::collections::HashSet = + transitive.iter().map(|k| k.name.clone()).collect(); + assert!(caller_names.contains("process"), "process should reach sink"); + assert!(caller_names.contains("handle"), "handle should reach sink"); + assert_eq!(transitive.len(), 2, "sink itself must be excluded"); + + let reach = FileReachMap::build(&cg); + assert!(reach.reaches("routes.py", "helper.py")); + assert!(reach.reaches("service.py", "helper.py")); + assert!(reach.reaches("helper.py", "helper.py"), "self-reach"); + assert!(!reach.reaches("helper.py", "routes.py")); + } + + #[test] + fn callers_transitive_empty_for_unknown_key() { + let leaf = make_summary("leaf", "a.py", "python", 0, vec![]); + let gs = merge_summaries(vec![leaf], None); + let cg = build_call_graph(&gs, &[]); + let ghost = FuncKey { + lang: Lang::Python, + namespace: "nowhere.py".into(), + name: "ghost".into(), + 
arity: Some(0), + ..Default::default() + }; + assert!(callers_transitive(&cg, &ghost).is_empty()); + } + + #[test] + fn file_reach_map_handles_disconnected_components() { + let a_caller = make_summary("a_caller", "a.py", "python", 0, vec!["a_sink"]); + let a_sink = make_summary("a_sink", "a.py", "python", 0, vec![]); + let b_caller = make_summary("b_caller", "b.py", "python", 0, vec!["b_sink"]); + let b_sink = make_summary("b_sink", "b.py", "python", 0, vec![]); + let gs = merge_summaries(vec![a_caller, a_sink, b_caller, b_sink], None); + let cg = build_call_graph(&gs, &[]); + let reach = FileReachMap::build(&cg); + + assert!(reach.reaches("a.py", "a.py")); + assert!(reach.reaches("b.py", "b.py")); + // Disconnected: a.py does not reach b.py. + assert!(!reach.reaches("a.py", "b.py")); + assert!(!reach.reaches("b.py", "a.py")); + assert_eq!(reach.callee_ns_len(), 2); + } } diff --git a/src/chain/edges.rs b/src/chain/edges.rs index aa0bbe1e..3e4e47f4 100644 --- a/src/chain/edges.rs +++ b/src/chain/edges.rs @@ -13,6 +13,7 @@ //! search or do call-graph traversal: edges are emitted at finding //! granularity and carry only the file-local reach hint. +use crate::callgraph::FileReachMap; use crate::commands::scan::Diag; use crate::entry_points::HttpMethod; use crate::labels::Cap; @@ -94,13 +95,39 @@ pub struct ChainEdge { /// The output order mirrors `findings`; the caller is responsible for /// any further canonicalisation. pub fn findings_to_edges(findings: &[Diag], surface: &SurfaceMap) -> Vec { + findings_to_edges_with_reach(findings, surface, None) +} + +/// Like [`findings_to_edges`] but optionally consults a [`FileReachMap`] +/// to widen `Reach::Reachable` beyond the file-local match. +/// +/// When `reach` is `Some`, a finding's enclosing file is also considered +/// `Reachable` whenever any [`SurfaceNode::EntryPoint`]'s +/// `handler_location.file` transitively reaches the finding's file via +/// the call graph. 
The first matching entry-point (surface-canonical +/// order) is used to populate the `route` / `method` / `auth_required` +/// fields. +/// +/// `reach = None` is byte-identical to the legacy [`findings_to_edges`] +/// behaviour. Path strings on both sides must use the same convention +/// (project-relative POSIX) for the widening to fire; mismatched paths +/// silently fall through to the file-local heuristic. +pub fn findings_to_edges_with_reach( + findings: &[Diag], + surface: &SurfaceMap, + reach: Option<&FileReachMap>, +) -> Vec { findings .iter() - .filter_map(|d| build_edge(d, surface)) + .filter_map(|d| build_edge(d, surface, reach)) .collect() } -fn build_edge(diag: &Diag, surface: &SurfaceMap) -> Option { +fn build_edge( + diag: &Diag, + surface: &SurfaceMap, + reach: Option<&FileReachMap>, +) -> Option { let evidence = diag.evidence.as_ref()?; if evidence.sink_caps == 0 { return None; @@ -108,7 +135,7 @@ fn build_edge(diag: &Diag, surface: &SurfaceMap) -> Option { let cap_bits = evidence.sink_caps; let primary_cap = pick_chain_cap(cap_bits)?; let location = SourceLocation::new(diag.path.clone(), diag.line as u32, diag.col as u32); - let reach = locate_reach(&location, surface); + let reach_kind = locate_reach(&location, surface, reach); let feasibility = Feasibility::for_finding(diag); let finding = FindingRef { finding_id: diag.finding_id.clone(), @@ -120,7 +147,7 @@ fn build_edge(diag: &Diag, surface: &SurfaceMap) -> Option { Some(ChainEdge { finding, primary_cap, - reach, + reach: reach_kind, feasibility, }) } @@ -164,7 +191,12 @@ pub fn pick_chain_cap(bits: u32) -> Option { lowest_cap(bits) } -fn locate_reach(loc: &SourceLocation, surface: &SurfaceMap) -> Reach { +fn locate_reach( + loc: &SourceLocation, + surface: &SurfaceMap, + reach: Option<&FileReachMap>, +) -> Reach { + // Pass 1: file-local match (legacy behaviour, always applies). 
for node in &surface.nodes { if let SurfaceNode::EntryPoint(ep) = node { if ep.handler_location.file == loc.file { @@ -177,6 +209,23 @@ fn locate_reach(loc: &SourceLocation, surface: &SurfaceMap) -> Reach { } } } + // Pass 2: transitive caller match via the call graph. Only fires + // when `reach` is supplied — keeps the legacy file-local behaviour + // for callers that have not yet wired the call-graph reach map. + if let Some(reach) = reach { + for node in &surface.nodes { + if let SurfaceNode::EntryPoint(ep) = node { + if reach.reaches(&ep.handler_location.file, &loc.file) { + return Reach::Reachable { + location: ep.location.clone(), + method: ep.method, + route: ep.route.clone(), + auth_required: ep.auth_required, + }; + } + } + } + } Reach::Unreachable } @@ -247,4 +296,61 @@ mod tests { assert_eq!(edges.len(), 1); assert!(matches!(edges[0].reach, Reach::Unreachable)); } + + /// Cross-file finding becomes Reachable when the call-graph reach + /// map records a transitive caller in the entry-point's file. 
+ #[test] + fn reach_widens_with_file_reach_map() { + use crate::callgraph::{FileReachMap, build_call_graph}; + use crate::entry_points::HttpMethod; + use crate::summary::{FuncSummary, merge_summaries}; + use crate::surface::{EntryPoint, Framework, SourceLocation, SurfaceNode}; + + // routes.py::handle -> helper.py::sink + let handle = FuncSummary { + name: "handle".into(), + file_path: "routes.py".into(), + lang: "python".into(), + param_count: 0, + callees: vec![crate::summary::CalleeSite::bare("sink")], + ..Default::default() + }; + let sink = FuncSummary { + name: "sink".into(), + file_path: "helper.py".into(), + lang: "python".into(), + param_count: 0, + ..Default::default() + }; + let gs = merge_summaries(vec![handle, sink], None); + let cg = build_call_graph(&gs, &[]); + let reach = FileReachMap::build(&cg); + + let mut surface = SurfaceMap::new(); + surface.nodes.push(SurfaceNode::EntryPoint(EntryPoint { + location: SourceLocation::new("routes.py", 1, 1), + framework: Framework::Flask, + method: HttpMethod::GET, + route: "/".into(), + handler_name: "handle".into(), + handler_location: SourceLocation::new("routes.py", 2, 1), + auth_required: false, + })); + + let d = diag_with_cap("helper.py", 10, Cap::CODE_EXEC); + + // Without reach: file-local lookup leaves the finding Unreachable. + let edges = findings_to_edges(&[d.clone()], &surface); + assert!(matches!(edges[0].reach, Reach::Unreachable)); + + // With reach: transitive caller in `routes.py` lifts to Reachable. + let edges = findings_to_edges_with_reach(&[d], &surface, Some(&reach)); + match &edges[0].reach { + Reach::Reachable { route, method, .. 
} => { + assert_eq!(route, "/"); + assert_eq!(*method, HttpMethod::GET); + } + other => panic!("expected Reachable, got {other:?}"), + } + } } diff --git a/src/chain/mod.rs b/src/chain/mod.rs index 0e698e00..67bcd6b3 100644 --- a/src/chain/mod.rs +++ b/src/chain/mod.rs @@ -41,7 +41,7 @@ pub mod reverify; pub mod score; pub mod search; -pub use edges::{ChainEdge, FindingRef, findings_to_edges}; +pub use edges::{ChainEdge, FindingRef, findings_to_edges, findings_to_edges_with_reach}; pub use feasibility::Feasibility; pub use finding::{ChainFinding, ChainMember, ChainSeverity, ChainSink}; pub use impact::{IMPACT_LATTICE, ImpactCategory, ImpactRule, lookup_impact}; @@ -51,7 +51,7 @@ pub use reverify::{ reverify_chain_with, reverify_top_chains, reverify_top_chains_with, }; pub use score::{ChainScoreConfig, category_weight, min_score_default, score_path}; -pub use search::{ChainSearchConfig, find_chains}; +pub use search::{ChainSearchConfig, find_chains, find_chains_with_reach}; /// One node in a [`ChainGraph`]. /// diff --git a/src/chain/search.rs b/src/chain/search.rs index 870f0d62..98f08f42 100644 --- a/src/chain/search.rs +++ b/src/chain/search.rs @@ -43,6 +43,7 @@ //! adjacent when they share a source file, mirroring Phase 24's //! `findings_to_edges` reach resolver. +use crate::callgraph::FileReachMap; use crate::chain::edges::{ChainEdge, Reach}; use crate::chain::finding::{ChainFinding, ChainSink}; use crate::chain::impact::{ImpactCategory, lookup_impact}; @@ -75,6 +76,24 @@ pub fn find_chains( edges: &[ChainEdge], surface: &SurfaceMap, cfg: ChainSearchConfig, +) -> Vec { + find_chains_with_reach(edges, surface, cfg, None) +} + +/// Like [`find_chains`] but optionally consults a [`FileReachMap`] to +/// widen the per-entry-per-sink file-scope filter beyond literal +/// file-equality. 
+/// +/// When `reach` is `Some`, a candidate edge is also in scope for a +/// sink whenever `reach.reaches(finding_file, sink_file)` holds, i.e. +/// the finding's file transitively calls into the sink's file. +/// `reach = None` preserves the legacy file-local behaviour for +/// callers that have not yet wired the call-graph reach map. +pub fn find_chains_with_reach( + edges: &[ChainEdge], + surface: &SurfaceMap, + cfg: ChainSearchConfig, + reach: Option<&FileReachMap>, ) -> Vec { if cfg.max_depth == 0 || edges.is_empty() { return Vec::new(); @@ -96,18 +115,18 @@ pub fn find_chains( .cmp(&(b.finding.stable_hash, &b.finding.rule_id, &b.finding.location)) }); for sink in &sinks { - // Phase 25 limits per-entry-per-sink search to those - // candidates that share a file with the sink. Phase 25's - // deferred call-graph follow-up will widen this. + // Scope candidates to the sink: same-file match (legacy), + // optionally widened by a call-graph-derived reach map so + // a finding in `routes.py` still composes against a sink + // in `internal_helper.py` that `routes.py` transitively + // calls into. The reach lookup matches path strings exactly. let scoped: Vec<&ChainEdge> = candidates .iter() .filter(|e| { - // Surface DangerousLocal location uses POSIX path; - // the per-finding location is whatever the analyser - // recorded. Match on the trailing path segment so - // a project-relative vs absolute mismatch does not - // gate the chain. paths_overlap(&e.finding.location.file, &sink.location.file) + || reach.is_some_and(|r| { + r.reaches(&e.finding.location.file, &sink.location.file) + }) }) .copied() .collect(); @@ -651,4 +670,74 @@ mod tests { let chains = find_chains(&[e], &surface, cfg); assert!(chains.is_empty()); } + + /// Sink in a different file than the finding composes only when the + /// call-graph reach map records a transitive caller relationship.
+ #[test] + fn cross_file_chain_requires_reach_map() { + use crate::callgraph::{FileReachMap, build_call_graph}; + use crate::summary::{FuncSummary, merge_summaries}; + + let mut surface = SurfaceMap::new(); + surface.nodes.push(entry("routes.py", "/exec", false)); + // Sink lives in a helper file the entry handler transitively + // reaches, not the entry file itself. + surface.nodes.push(sink( + "helper.py", + 20, + "os.system", + Cap::CODE_EXEC, + )); + let e = edge_with( + "routes.py", + 10, + "taint-codeexec", + Cap::CODE_EXEC, + "/exec", + HttpMethod::POST, + Feasibility::Unverified, + ); + + let cfg = ChainSearchConfig { + max_depth: 4, + min_score: 0.0, + }; + + // No reach map: routes.py finding cannot compose against + // helper.py sink because `paths_overlap` rejects the pair. + let baseline = find_chains(std::slice::from_ref(&e), &surface, cfg); + assert!( + baseline.is_empty(), + "without reach map, cross-file chain must not compose" + ); + + // Reach map: routes.py::handle calls helper.py::sink so + // helper.py is reachable from routes.py. 
+ let handle = FuncSummary { + name: "handle".into(), + file_path: "routes.py".into(), + lang: "python".into(), + param_count: 0, + callees: vec![crate::summary::CalleeSite::bare("sink")], + ..Default::default() + }; + let sink_fn = FuncSummary { + name: "sink".into(), + file_path: "helper.py".into(), + lang: "python".into(), + param_count: 0, + ..Default::default() + }; + let gs = merge_summaries(vec![handle, sink_fn], None); + let cg = build_call_graph(&gs, &[]); + let reach = FileReachMap::build(&cg); + + let chains = find_chains_with_reach(&[e], &surface, cfg, Some(&reach)); + assert_eq!( + chains.len(), + 1, + "reach map should widen scope to include helper.py sink" + ); + assert_eq!(chains[0].implied_impact, ImpactCategory::Rce); + } } diff --git a/src/commands/scan.rs b/src/commands/scan.rs index df88eafb..108c9738 100644 --- a/src/commands/scan.rs +++ b/src/commands/scan.rs @@ -439,6 +439,13 @@ pub fn handle( // functions below. Set to true if any C / C++ file is enumerated. let preview_tier_seen = Arc::new(AtomicBool::new(false)); + // Call-graph-derived file reachability map. Populated by the inner + // observer once the call graph is built, then consumed by the chain + // composer below to widen cross-file Reach beyond the file-local + // heuristic in `findings_to_edges`. + let chain_reach_slot: std::sync::OnceLock = + std::sync::OnceLock::new(); + let (mut diags, surface_map): (Vec, crate::surface::SurfaceMap) = if index_mode == IndexMode::Off { @@ -450,6 +457,7 @@ pub fn handle( None, None, Some(&preview_tier_seen), + Some(&chain_reach_slot), )? 
} else { if index_mode == IndexMode::Rebuild || !db_path.exists() { @@ -484,6 +492,7 @@ pub fn handle( None, None, Some(&preview_tier_seen), + Some(&chain_reach_slot), )?; let surface_map = { let idx = Indexer::from_pool(&project_name, &pool)?; @@ -623,12 +632,25 @@ pub fn handle( }; // ── Phase 25: compose exploit chains from findings + SurfaceMap ──── - let chain_edges = crate::chain::findings_to_edges(&diags, &surface_map); + // When the inner scan populated the call-graph reach map, pass it + // to the chain layer so a finding in an internal helper reached only + // through a route handler is still marked Reachable, and a finding + // can compose against a sink in a file it transitively calls into. + // When the slot is empty (legacy / AST-only paths that never built + // a call graph), the chain layer falls back to file-local reach. + let chain_reach = chain_reach_slot.get(); + let chain_edges = + crate::chain::findings_to_edges_with_reach(&diags, &surface_map, chain_reach); let chain_search_cfg = crate::chain::ChainSearchConfig { max_depth: config.chain.max_depth, min_score: config.chain.min_score, }; - let chains = crate::chain::find_chains(&chain_edges, &surface_map, chain_search_cfg); + let chains = crate::chain::find_chains_with_reach( + &chain_edges, + &surface_map, + chain_search_cfg, + chain_reach, + ); let diags_for_output = crate::output::filter_constituents( diags.clone(), &chains, @@ -1806,7 +1828,7 @@ pub(crate) fn scan_filesystem( cfg: &Config, show_progress: bool, ) -> NyxResult> { - scan_filesystem_with_observer(root, cfg, show_progress, None, None, None, None) + scan_filesystem_with_observer(root, cfg, show_progress, None, None, None, None, None) .map(|(diags, _surface_map)| diags) } @@ -1820,7 +1842,7 @@ pub(crate) fn scan_filesystem_with_surface_map( cfg: &Config, show_progress: bool, ) -> NyxResult<(Vec, crate::surface::SurfaceMap)> { - scan_filesystem_with_observer(root, cfg,
show_progress, None, None, None, None, None) } /// Walk the filesystem and perform a two-pass scan, optionally reporting @@ -1838,6 +1860,7 @@ pub(crate) fn scan_filesystem_with_observer( metrics: Option<&Arc>, logs: Option<&Arc>, preview_tier_seen: Option<&Arc>, + chain_reach_out: Option<&std::sync::OnceLock>, ) -> NyxResult<(Vec, crate::surface::SurfaceMap)> { // Ensure framework context is available (handle sets it, but direct // callers like scan_no_index may not). @@ -2177,6 +2200,10 @@ pub(crate) fn scan_filesystem_with_observer( ); } + if let Some(out) = chain_reach_out { + let _ = out.set(crate::callgraph::FileReachMap::build(&call_graph)); + } + // ── Pass 2: re-run with cross-file global summaries ────────────────── if let Some(p) = progress { p.set_stage(ScanStage::Analyzing); @@ -2326,6 +2353,7 @@ pub fn scan_with_index_parallel( None, None, None, + None, ) } @@ -2341,6 +2369,7 @@ pub fn scan_with_index_parallel_observer( metrics: Option<&Arc>, logs: Option<&Arc>, preview_tier_seen: Option<&Arc>, + chain_reach_out: Option<&std::sync::OnceLock>, ) -> NyxResult> { // Match scan_filesystem_with_observer: auto-fill framework detection when // the caller didn't supply one. Without this, directly-invoked indexed @@ -2966,6 +2995,10 @@ pub fn scan_with_index_parallel_observer( ); } + if let Some(out) = chain_reach_out { + let _ = out.set(crate::callgraph::FileReachMap::build(&call_graph)); + } + let (batches, orphans) = crate::callgraph::scc_file_batches_with_metadata( &call_graph, &cg_analysis, diff --git a/src/server/jobs.rs b/src/server/jobs.rs index 2495749c..3e1a14d8 100644 --- a/src/server/jobs.rs +++ b/src/server/jobs.rs @@ -249,6 +249,7 @@ impl JobManager { Some(&metrics), Some(&log_collector), None, + None, ) }); let elapsed = start.elapsed().as_secs_f64();