nyx/src/callgraph.rs

600 lines
23 KiB
Rust
Raw Normal View History

use crate::interop::InteropEdge;
use crate::summary::{CalleeResolution, GlobalSummaries};
use crate::symbol::FuncKey;
use petgraph::graph::NodeIndex;
use petgraph::prelude::*;
use std::collections::HashMap;
// ─────────────────────────────────────────────────────────────────────────────
// Types
// ─────────────────────────────────────────────────────────────────────────────
/// Metadata attached to each call-graph edge.
#[derive(Debug, Clone)]
pub struct CallEdge {
/// The raw callee string as it appeared in source (e.g. `"env::var"`).
/// Preserved for diagnostics — **not** the normalized form used for resolution.
#[allow(dead_code)] // used for future diagnostics and path display
pub call_site: String,
}
/// A callee that could not be resolved to any known function definition.
#[derive(Debug, Clone)]
#[allow(dead_code)] // fields used for future diagnostics reporting
pub struct UnresolvedCallee {
pub caller: FuncKey,
pub callee_name: String,
}
/// A callee that matched multiple function definitions — ambiguous.
#[derive(Debug, Clone)]
#[allow(dead_code)] // fields used for future diagnostics reporting
pub struct AmbiguousCallee {
pub caller: FuncKey,
pub callee_name: String,
pub candidates: Vec<FuncKey>,
}
/// The whole-program call graph.
///
/// Nodes are [`FuncKey`]s (one per function definition across all files).
/// Edges represent call-site relationships resolved after pass 1.
pub struct CallGraph {
pub graph: DiGraph<FuncKey, CallEdge>,
/// `FuncKey → NodeIndex` for quick lookup.
#[allow(dead_code)] // used for future topo-ordered analysis and call-graph queries
pub index: HashMap<FuncKey, NodeIndex>,
/// Callee strings that could not be resolved to any [`FuncKey`].
pub unresolved_not_found: Vec<UnresolvedCallee>,
/// Callee strings that matched multiple candidates.
pub unresolved_ambiguous: Vec<AmbiguousCallee>,
}
/// Result of SCC / topological analysis on the call graph.
pub struct CallGraphAnalysis {
/// Strongly connected components.
pub sccs: Vec<Vec<NodeIndex>>,
/// Maps each `NodeIndex` to its SCC index in [`sccs`].
#[allow(dead_code)] // used for future topo-ordered taint propagation
pub node_to_scc: HashMap<NodeIndex, usize>,
/// SCC indices in **callee-first** (leaves-first) order.
///
/// Functions with no callees appear first; callers appear later.
/// Suitable for bottom-up taint propagation.
#[allow(dead_code)] // used for future topo-ordered taint propagation
pub topo_scc_callee_first: Vec<usize>,
}
// ─────────────────────────────────────────────────────────────────────────────
// Callee-name normalization
// ─────────────────────────────────────────────────────────────────────────────
/// Extract the last segment of a qualified callee name for resolution.
///
/// ```text
/// "env::var" → "var"
/// "std::process::Command" → "Command"
/// "obj.method" → "method"
/// "pkg.mod.func" → "func"
/// "foo" → "foo" (unchanged)
/// "" → "" (edge case)
/// ```
///
/// The original raw text is preserved on [`CallEdge::call_site`] for
/// diagnostics; this function only produces the lookup key.
pub(crate) fn normalize_callee_name(raw: &str) -> &str {
// Split on "::" first (Rust-style qualification), take last segment.
let after_colons = raw.rsplit("::").next().unwrap_or(raw);
// Then split on "." (method calls, Python/JS dotted paths), take last segment.
after_colons.rsplit('.').next().unwrap_or(after_colons)
}
// ─────────────────────────────────────────────────────────────────────────────
// Call-graph construction
// ─────────────────────────────────────────────────────────────────────────────
/// Build the whole-program call graph from merged summaries.
///
/// Resolution mirrors `GlobalSummaries::resolve_callee_key`:
/// 1. Normalize callee name (last segment after `::` or `.`)
/// 2. Same-language, arity-filtered, namespace-disambiguated lookup
/// 3. Interop edges (explicit cross-language bridges)
///
/// Unresolved and ambiguous callees are recorded for diagnostics but
/// do **not** create edges.
pub fn build_call_graph(summaries: &GlobalSummaries, interop_edges: &[InteropEdge]) -> CallGraph {
let mut graph = DiGraph::new();
let mut index = HashMap::new();
// 1. Create one node per FuncKey.
for (key, _) in summaries.iter() {
let idx = graph.add_node(key.clone());
index.insert(key.clone(), idx);
}
let mut unresolved_not_found = Vec::new();
let mut unresolved_ambiguous = Vec::new();
// 2. Resolve callees and add edges.
for (caller_key, summary) in summaries.iter() {
let caller_node = index[caller_key];
for raw_callee in &summary.callees {
let normalized = normalize_callee_name(raw_callee);
match summaries.resolve_callee_key(
normalized,
caller_key.lang,
&caller_key.namespace,
None,
) {
CalleeResolution::Resolved(target_key) => {
if let Some(&target_node) = index.get(&target_key) {
graph.add_edge(
caller_node,
target_node,
CallEdge {
call_site: raw_callee.clone(),
},
);
}
}
CalleeResolution::NotFound => {
// Try interop edges before recording as not-found.
if let Some(target_key) =
resolve_via_interop(raw_callee, caller_key, interop_edges)
&& let Some(&target_node) = index.get(&target_key)
{
graph.add_edge(
caller_node,
target_node,
CallEdge {
call_site: raw_callee.clone(),
},
);
continue;
}
unresolved_not_found.push(UnresolvedCallee {
caller: caller_key.clone(),
callee_name: raw_callee.clone(),
});
}
CalleeResolution::Ambiguous(candidates) => {
unresolved_ambiguous.push(AmbiguousCallee {
caller: caller_key.clone(),
callee_name: raw_callee.clone(),
candidates,
});
}
}
}
}
CallGraph {
graph,
index,
unresolved_not_found,
unresolved_ambiguous,
}
}
/// Check interop edges for a matching cross-language bridge.
fn resolve_via_interop(
raw_callee: &str,
caller_key: &FuncKey,
interop_edges: &[InteropEdge],
) -> Option<FuncKey> {
for edge in interop_edges {
if edge.from.caller_lang == caller_key.lang
&& edge.from.caller_namespace == caller_key.namespace
&& edge.from.callee_symbol == raw_callee
&& (edge.from.caller_func.is_empty() || edge.from.caller_func == caller_key.name)
{
return Some(edge.to.clone());
}
}
None
}
// ─────────────────────────────────────────────────────────────────────────────
// SCC / topological analysis
// ─────────────────────────────────────────────────────────────────────────────
/// Compute SCC decomposition and topological ordering of the call graph.
///
/// `petgraph::algo::tarjan_scc` returns SCCs in *reverse* topological order
/// of the condensation DAG — i.e. leaf SCCs (no outgoing cross-SCC edges)
/// come **first**. That is exactly the **callee-first** order suitable for
/// bottom-up taint propagation.
pub fn analyse(cg: &CallGraph) -> CallGraphAnalysis {
let sccs = petgraph::algo::tarjan_scc(&cg.graph);
let mut node_to_scc = HashMap::with_capacity(cg.graph.node_count());
for (scc_idx, scc) in sccs.iter().enumerate() {
for &node in scc {
node_to_scc.insert(node, scc_idx);
}
}
// tarjan_scc already gives callee-first ordering.
let topo_scc_callee_first: Vec<usize> = (0..sccs.len()).collect();
CallGraphAnalysis {
sccs,
node_to_scc,
topo_scc_callee_first,
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Tests
// ─────────────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
use super::*;
use crate::interop::CallSiteKey;
use crate::summary::{FuncSummary, merge_summaries};
use crate::symbol::Lang;
/// Helper to create a minimal FuncSummary.
fn make_summary(
name: &str,
file_path: &str,
lang: &str,
param_count: usize,
callees: Vec<&str>,
) -> FuncSummary {
FuncSummary {
name: name.into(),
file_path: file_path.into(),
lang: lang.into(),
param_count,
param_names: vec![],
source_caps: 0,
sanitizer_caps: 0,
sink_caps: 0,
propagates_taint: false,
tainted_sink_params: vec![],
callees: callees.into_iter().map(String::from).collect(),
}
}
// ── normalize_callee_name ────────────────────────────────────────────
#[test]
fn normalize_callee_basic() {
assert_eq!(normalize_callee_name("env::var"), "var");
assert_eq!(normalize_callee_name("std::process::Command"), "Command");
assert_eq!(normalize_callee_name("obj.method"), "method");
assert_eq!(normalize_callee_name("pkg.mod.func"), "func");
assert_eq!(normalize_callee_name("foo"), "foo");
assert_eq!(normalize_callee_name(""), "");
}
// ── same name, different Rust modules ────────────────────────────────
#[test]
fn same_name_different_rust_modules() {
let helper_a = make_summary("helper", "src/a.rs", "rust", 0, vec![]);
let helper_b = make_summary("helper", "src/b.rs", "rust", 0, vec![]);
let caller = make_summary("caller", "src/a.rs", "rust", 0, vec!["helper"]);
let gs = merge_summaries(vec![helper_a, helper_b, caller], None);
let cg = build_call_graph(&gs, &[]);
// Two helper nodes + one caller node = 3 nodes
assert_eq!(cg.graph.node_count(), 3);
// Caller is in src/a.rs, so "helper" resolves to src/a.rs::helper
let caller_key = FuncKey {
lang: Lang::Rust,
namespace: "src/a.rs".into(),
name: "caller".into(),
arity: Some(0),
};
let helper_a_key = FuncKey {
lang: Lang::Rust,
namespace: "src/a.rs".into(),
name: "helper".into(),
arity: Some(0),
};
let caller_node = cg.index[&caller_key];
let helper_a_node = cg.index[&helper_a_key];
// Exactly one edge: caller → helper_a
let edges: Vec<_> = cg
.graph
.edges(caller_node)
.filter(|e| e.target() == helper_a_node)
.collect();
assert_eq!(edges.len(), 1);
assert!(cg.unresolved_not_found.is_empty());
assert!(cg.unresolved_ambiguous.is_empty());
}
// ── same name, Python vs Rust ────────────────────────────────────────
#[test]
fn same_name_python_and_rust() {
let py_foo = make_summary("foo", "handler.py", "python", 0, vec![]);
let rs_foo = make_summary("foo", "handler.rs", "rust", 0, vec![]);
// Python caller calls "foo" — should only see the Python one
let py_caller = make_summary("main", "app.py", "python", 0, vec!["foo"]);
let gs = merge_summaries(vec![py_foo, rs_foo, py_caller], None);
let cg = build_call_graph(&gs, &[]);
assert_eq!(cg.graph.node_count(), 3);
let py_foo_key = FuncKey {
lang: Lang::Python,
namespace: "handler.py".into(),
name: "foo".into(),
arity: Some(0),
};
let caller_key = FuncKey {
lang: Lang::Python,
namespace: "app.py".into(),
name: "main".into(),
arity: Some(0),
};
let caller_node = cg.index[&caller_key];
let py_foo_node = cg.index[&py_foo_key];
// Edge goes to Python foo, not Rust foo
let edges: Vec<_> = cg.graph.edges(caller_node).collect();
assert_eq!(edges.len(), 1);
assert_eq!(edges[0].target(), py_foo_node);
}
// ── arity differences → separate nodes ───────────────────────────────
#[test]
fn arity_differences_separate_nodes() {
let helper1 = make_summary("helper", "lib.rs", "rust", 1, vec![]);
let helper2 = make_summary("helper", "lib.rs", "rust", 2, vec![]);
let gs = merge_summaries(vec![helper1, helper2], None);
let cg = build_call_graph(&gs, &[]);
// Two separate nodes (different arity → different FuncKey)
assert_eq!(cg.graph.node_count(), 2);
let key1 = FuncKey {
lang: Lang::Rust,
namespace: "lib.rs".into(),
name: "helper".into(),
arity: Some(1),
};
let key2 = FuncKey {
lang: Lang::Rust,
namespace: "lib.rs".into(),
name: "helper".into(),
arity: Some(2),
};
assert!(cg.index.contains_key(&key1));
assert!(cg.index.contains_key(&key2));
}
// ── recursive SCC detection ──────────────────────────────────────────
#[test]
fn recursive_scc_detection() {
let a = make_summary("a", "lib.rs", "rust", 0, vec!["b"]);
let b = make_summary("b", "lib.rs", "rust", 0, vec!["a"]);
let gs = merge_summaries(vec![a, b], None);
let cg = build_call_graph(&gs, &[]);
assert_eq!(cg.graph.edge_count(), 2); // a→b and b→a
let analysis = analyse(&cg);
// Both nodes should be in the same SCC
let key_a = FuncKey {
lang: Lang::Rust,
namespace: "lib.rs".into(),
name: "a".into(),
arity: Some(0),
};
let key_b = FuncKey {
lang: Lang::Rust,
namespace: "lib.rs".into(),
name: "b".into(),
arity: Some(0),
};
let scc_a = analysis.node_to_scc[&cg.index[&key_a]];
let scc_b = analysis.node_to_scc[&cg.index[&key_b]];
assert_eq!(scc_a, scc_b);
assert_eq!(analysis.sccs[scc_a].len(), 2);
}
// ── unresolved callee → recorded as not found ────────────────────────
#[test]
fn unresolved_callee_recorded_as_not_found() {
let caller = make_summary("caller", "lib.rs", "rust", 0, vec!["nonexistent"]);
let gs = merge_summaries(vec![caller], None);
let cg = build_call_graph(&gs, &[]);
assert_eq!(cg.graph.edge_count(), 0);
assert_eq!(cg.unresolved_not_found.len(), 1);
assert_eq!(cg.unresolved_not_found[0].callee_name, "nonexistent");
assert!(cg.unresolved_ambiguous.is_empty());
}
// ── ambiguous callee → recorded as ambiguous ─────────────────────────
#[test]
fn ambiguous_callee_recorded() {
// Two "helper" functions in different namespaces.
let helper_a = make_summary("helper", "a.rs", "rust", 0, vec![]);
let helper_b = make_summary("helper", "b.rs", "rust", 0, vec![]);
// Caller is in a THIRD namespace, so neither is preferred.
let caller = make_summary("caller", "c.rs", "rust", 0, vec!["helper"]);
let gs = merge_summaries(vec![helper_a, helper_b, caller], None);
let cg = build_call_graph(&gs, &[]);
assert_eq!(cg.graph.edge_count(), 0); // no edge — ambiguous
assert!(cg.unresolved_not_found.is_empty());
assert_eq!(cg.unresolved_ambiguous.len(), 1);
assert_eq!(cg.unresolved_ambiguous[0].callee_name, "helper");
assert_eq!(cg.unresolved_ambiguous[0].candidates.len(), 2);
}
// ── diamond topo order (callee-first) ────────────────────────────────
#[test]
fn diamond_topo_callee_first() {
// A → B, A → C, B → D, C → D
let d = make_summary("d", "lib.rs", "rust", 0, vec![]);
let b = make_summary("b", "lib.rs", "rust", 0, vec!["d"]);
let c = make_summary("c", "lib.rs", "rust", 0, vec!["d"]);
let a = make_summary("a", "lib.rs", "rust", 0, vec!["b", "c"]);
let gs = merge_summaries(vec![a, b, c, d], None);
let cg = build_call_graph(&gs, &[]);
assert_eq!(cg.graph.node_count(), 4);
let analysis = analyse(&cg);
let key = |name: &str| FuncKey {
lang: Lang::Rust,
namespace: "lib.rs".into(),
name: name.into(),
arity: Some(0),
};
let scc_of = |name: &str| analysis.node_to_scc[&cg.index[&key(name)]];
let topo_pos = |name: &str| {
analysis
.topo_scc_callee_first
.iter()
.position(|&s| s == scc_of(name))
.unwrap()
};
// D (leaf) must come before B and C, which must come before A (root).
assert!(topo_pos("d") < topo_pos("b"));
assert!(topo_pos("d") < topo_pos("c"));
assert!(topo_pos("b") < topo_pos("a"));
assert!(topo_pos("c") < topo_pos("a"));
}
// ── interop edge resolution ──────────────────────────────────────────
#[test]
fn interop_edge_resolution() {
let py_caller = make_summary("process", "handler.py", "python", 0, vec!["js_func"]);
let js_target = make_summary("js_func", "util.js", "javascript", 1, vec![]);
let gs = merge_summaries(vec![py_caller, js_target], None);
let interop = vec![InteropEdge {
from: CallSiteKey {
caller_lang: Lang::Python,
caller_namespace: "handler.py".into(),
caller_func: String::new(), // wildcard
callee_symbol: "js_func".into(),
ordinal: 0,
},
to: FuncKey {
lang: Lang::JavaScript,
namespace: "util.js".into(),
name: "js_func".into(),
arity: Some(1),
},
arg_map: vec![],
ret_taints: false,
}];
let cg = build_call_graph(&gs, &interop);
let caller_key = FuncKey {
lang: Lang::Python,
namespace: "handler.py".into(),
name: "process".into(),
arity: Some(0),
};
let target_key = FuncKey {
lang: Lang::JavaScript,
namespace: "util.js".into(),
name: "js_func".into(),
arity: Some(1),
};
let caller_node = cg.index[&caller_key];
let target_node = cg.index[&target_key];
let edges: Vec<_> = cg
.graph
.edges(caller_node)
.filter(|e| e.target() == target_node)
.collect();
assert_eq!(edges.len(), 1);
assert!(cg.unresolved_not_found.is_empty());
}
// ── namespace normalization consistency ───────────────────────────────
#[test]
fn namespace_normalization_consistency() {
// FuncSummary::func_key with a scan root produces the same namespace
// string that would be used as caller_namespace in resolution.
let summary = FuncSummary {
name: "my_func".into(),
file_path: "/home/user/proj/src/lib.rs".into(),
lang: "rust".into(),
param_count: 0,
param_names: vec![],
source_caps: 0,
sanitizer_caps: 0,
sink_caps: 0,
propagates_taint: false,
tainted_sink_params: vec![],
callees: vec![],
};
let root = "/home/user/proj";
let key = summary.func_key(Some(root));
// The namespace in the key must be the same as what normalize_namespace produces
let expected_ns = crate::symbol::normalize_namespace(&summary.file_path, Some(root));
assert_eq!(key.namespace, expected_ns);
assert_eq!(key.namespace, "src/lib.rs");
}
// ── raw call_site preserved on edge ──────────────────────────────────
#[test]
fn raw_call_site_preserved_on_edge() {
// Callee "env::var" normalizes to "var" for resolution, but
// the edge should retain the original raw text.
let source = make_summary("var", "util.rs", "rust", 0, vec![]);
let caller = make_summary("main", "util.rs", "rust", 0, vec!["env::var"]);
let gs = merge_summaries(vec![source, caller], None);
let cg = build_call_graph(&gs, &[]);
let caller_key = FuncKey {
lang: Lang::Rust,
namespace: "util.rs".into(),
name: "main".into(),
arity: Some(0),
};
let caller_node = cg.index[&caller_key];
let edges: Vec<_> = cg.graph.edges(caller_node).collect();
assert_eq!(edges.len(), 1);
// Raw call_site preserved, not the normalized "var"
assert_eq!(edges[0].weight().call_site, "env::var");
}
}