diff --git a/src/cfg/cfg_tests.rs b/src/cfg/cfg_tests.rs index 4154a5c6..911a71e5 100644 --- a/src/cfg/cfg_tests.rs +++ b/src/cfg/cfg_tests.rs @@ -3958,3 +3958,42 @@ fn rhs_array_literal_elements_recognise_per_language_shapes() { // Non-array-shape node returns empty (defensive guard). assert!(run("javascript", b"const x = tainted;\n", &["identifier"]).is_empty()); } + +/// `CalleeSite.span` should carry the 1-based (line, col) of each call's +/// node span so downstream consumers (surface map, datastore/external +/// detectors) can render real coordinates instead of `line: 0`. +#[test] +fn callee_site_span_carries_line_and_column() { + // Three calls on three different lines. The leading newline puts + // line 1 at the blank line; `helper(x, y);` is on line 3, etc. + let src = b" +function outer(obj, x, y) { + helper(x, y); + obj.method(x); +} +"; + let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE); + let file_cfg = parse_to_file_cfg(src, "javascript", ts_lang); + let (_key, outer) = file_cfg + .summaries + .iter() + .find(|(k, _)| k.name == "outer") + .expect("outer summary should exist"); + + let helper_site = outer + .callees + .iter() + .find(|c| c.name == "helper") + .expect("helper call should be recorded"); + let (line, col) = helper_site.span.expect("span populated at CFG-build time"); + assert_eq!(line, 3, "helper(...) sits on the 3rd source line"); + assert!(col >= 5, "indented 4 spaces — column is 1-based and > 4"); + + let method_site = outer + .callees + .iter() + .find(|c| c.name.ends_with("method")) + .expect("method call should be recorded"); + let (mline, _) = method_site.span.expect("method span populated"); + assert_eq!(mline, 4, "obj.method(x) on line 4"); +} diff --git a/src/cfg/mod.rs b/src/cfg/mod.rs index c6f69d42..7c20df9e 100644 --- a/src/cfg/mod.rs +++ b/src/cfg/mod.rs @@ -5664,7 +5664,7 @@ pub(super) fn build_sub<'a>( for idx in fn_graph.node_indices() { let info = &fn_graph[idx]; if let Some(callee) = &info.call.callee { - let site = build_callee_site(callee, info, lang); + let site = build_callee_site(callee, info, lang, code); // Dedup by (name, arity, receiver, qualifier, ordinal). A // single function may legitimately contain multiple distinct // calls to the same callee (e.g. different ordinals or @@ -6632,7 +6632,12 @@ fn apply_gated_label_rules( /// remains the single segment immediately before the leaf (back-compat /// with the legacy heuristic). For method calls the qualifier is /// redundant with `receiver` and is left `None`. -fn build_callee_site(callee: &str, info: &NodeInfo, lang: &str) -> crate::summary::CalleeSite { +fn build_callee_site( + callee: &str, + info: &NodeInfo, + lang: &str, + code: &[u8], +) -> crate::summary::CalleeSite { use crate::summary::CalleeSite; let receiver = info.call.receiver.clone(); @@ -6661,15 +6666,39 @@ fn build_callee_site(callee: &str, info: &NodeInfo, lang: &str) -> crate::summar None }; + let span = callee_span_line_col(code, info.ast.span.0); + CalleeSite { name: callee.to_string(), arity, receiver, qualifier, ordinal: info.call.call_ordinal, + span, } } +/// Convert a byte offset into a 1-based `(line, col)` pair against `code`. +/// +/// Returns `None` only when `code` is empty (no source to resolve against); +/// out-of-range offsets are clamped to `code.len()` so a synthetic node +/// whose span overshoots the file still produces the last-line coordinate +/// rather than `None`. +fn callee_span_line_col(code: &[u8], offset: usize) -> Option<(u32, u32)> { + if code.is_empty() { + return None; + } + let clamped = offset.min(code.len()); + let prefix = &code[..clamped]; + let line = prefix.iter().filter(|&&b| b == b'\n').count() as u32 + 1; + let col_bytes = match prefix.iter().rposition(|&b| b == b'\n') { + Some(idx) => clamped - idx - 1, + None => clamped, + } as u32 + + 1; + Some((line, col_bytes)) +} + /// Convert the graph‑local `FuncSummaries` into serialisable [`FuncSummary`] /// values suitable for cross‑file persistence. pub(crate) fn export_summaries( diff --git a/src/dynamic/policy.rs b/src/dynamic/policy.rs index 6432baea..a406f98a 100644 --- a/src/dynamic/policy.rs +++ b/src/dynamic/policy.rs @@ -277,6 +277,12 @@ pub enum PolicyDecision { /// Stable rule identifier — one of [`DenyRule::CREDENTIALS`], /// [`DenyRule::PRIVATE_KEY`], [`DenyRule::PRODUCTION_ENDPOINT`]. rule: &'static str, + /// Logical name of the diag field that produced the matched text + /// (e.g. `path`, `message`, `evidence.notes[2]`, + /// `flow_steps[1].snippet`). Lets operators triage *where* the + /// rule fired without having to re-derive the match from the + /// scrubbed excerpt alone. + field: String, /// Short text excerpt (max 120 chars, scrubbed via /// [`Scrubber::scrub_string`]) of the offending field so an /// operator can identify *why* the deny fired without having to @@ -377,10 +383,11 @@ const PROD_ENDPOINT_REGEXES: &[&str] = &[ /// the leak shape. pub fn evaluate(diag: &crate::commands::scan::Diag) -> PolicyDecision { let texts = collect_diag_texts(diag); - for text in &texts { + for (field, text) in &texts { if let Some(hit) = match_text(text) { return PolicyDecision::Deny { rule: hit.0, + field: field.clone(), excerpt: excerpt_with_scrubber(hit.1), }; } @@ -388,46 +395,56 @@ pub fn evaluate(diag: &crate::commands::scan::Diag) -> PolicyDecision { PolicyDecision::Allow } -fn collect_diag_texts(diag: &crate::commands::scan::Diag) -> Vec { - let mut out: Vec = Vec::new(); +/// Collect every text fragment from `diag` paired with a stable name for +/// the source field. The returned field names are intentionally +/// human-readable (e.g. `evidence.notes[2]`, `flow_steps[1].snippet`) +/// rather than enum variants so they read identically in audit logs and +/// in `Display` output. +fn collect_diag_texts(diag: &crate::commands::scan::Diag) -> Vec<(String, String)> { + let mut out: Vec<(String, String)> = Vec::new(); if !diag.id.is_empty() { - out.push(diag.id.clone()); + out.push(("id".into(), diag.id.clone())); } if !diag.path.is_empty() { - out.push(diag.path.clone()); + out.push(("path".into(), diag.path.clone())); } if let Some(msg) = diag.message.as_ref() { - out.push(msg.clone()); + out.push(("message".into(), msg.clone())); } if let Some(ev) = diag.evidence.as_ref() { - for note in &ev.notes { - out.push(note.clone()); + for (i, note) in ev.notes.iter().enumerate() { + out.push((format!("evidence.notes[{i}]"), note.clone())); } if let Some(exp) = ev.explanation.as_ref() { - out.push(exp.clone()); + out.push(("evidence.explanation".into(), exp.clone())); } - for s in [&ev.source, &ev.sink] { + for (label, s) in [("source", &ev.source), ("sink", &ev.sink)] { if let Some(span) = s.as_ref() { - out.push(span.path.clone()); + out.push((format!("evidence.{label}.path"), span.path.clone())); if let Some(sn) = span.snippet.as_ref() { - out.push(sn.clone()); + out.push((format!("evidence.{label}.snippet"), sn.clone())); } } } - for span in ev.guards.iter().chain(ev.sanitizers.iter()) { + for (i, span) in ev.guards.iter().enumerate() { if let Some(sn) = span.snippet.as_ref() { - out.push(sn.clone()); + out.push((format!("evidence.guards[{i}].snippet"), sn.clone())); } } - for step in &ev.flow_steps { + for (i, span) in ev.sanitizers.iter().enumerate() { + if let Some(sn) = span.snippet.as_ref() { + out.push((format!("evidence.sanitizers[{i}].snippet"), sn.clone())); + } + } + for (i, step) in ev.flow_steps.iter().enumerate() { if !step.file.is_empty() { - out.push(step.file.clone()); + out.push((format!("flow_steps[{i}].file"), step.file.clone())); } if let Some(sn) = step.snippet.as_ref() { - out.push(sn.clone()); + out.push((format!("flow_steps[{i}].snippet"), sn.clone())); } if let Some(callee) = step.callee.as_ref() { - out.push(callee.clone()); + out.push((format!("flow_steps[{i}].callee"), callee.clone())); } } } diff --git a/src/dynamic/verify.rs b/src/dynamic/verify.rs index 3c7e7b0f..65c3a3bf 100644 --- a/src/dynamic/verify.rs +++ b/src/dynamic/verify.rs @@ -410,18 +410,22 @@ pub fn verify_finding(diag: &Diag, opts: &VerifyOptions) -> VerifyResult { // The verifier returns `Inconclusive(PolicyDeniedDynamic)` so the // operator sees *why* dynamic execution was skipped without losing // the static finding from the report. - if let crate::dynamic::policy::PolicyDecision::Deny { rule, excerpt } = - crate::dynamic::policy::evaluate(diag) + if let crate::dynamic::policy::PolicyDecision::Deny { + rule, + field, + excerpt, + } = crate::dynamic::policy::evaluate(diag) { trace.record( crate::dynamic::trace::TraceStage::Verdict, - Some(format!("policy_denied rule={rule}")), + Some(format!("policy_denied rule={rule} field={field}")), ); if opts.trace_verbose { trace.print_to_stderr(); } let inconclusive_reason = InconclusiveReason::PolicyDeniedDynamic { rule: rule.to_owned(), + field: field.clone(), excerpt: excerpt.clone(), }; // Emit telemetry so the Phase 27 events log records the deny — diff --git a/src/evidence.rs b/src/evidence.rs index 682b2503..ae47374f 100644 --- a/src/evidence.rs +++ b/src/evidence.rs @@ -337,6 +337,12 @@ pub enum InconclusiveReason { /// `production-endpoint`) and an evidence excerpt for triage. PolicyDeniedDynamic { rule: String, + /// Logical name of the diag field that matched the deny rule + /// (e.g. `path`, `evidence.notes[2]`, `flow_steps[1].snippet`). + /// Empty string for verdicts loaded from older telemetry that + /// did not capture this field. + #[serde(default)] + field: String, excerpt: String, }, } @@ -399,10 +405,23 @@ impl fmt::Display for InconclusiveReason { f, "{backend} backend cannot enforce isolation for {oracle_kind} oracle" ), - Self::PolicyDeniedDynamic { rule, excerpt } => write!( - f, - "dynamic execution refused by policy rule {rule} (matched: {excerpt})" - ), + Self::PolicyDeniedDynamic { + rule, + field, + excerpt, + } => { + if field.is_empty() { + write!( + f, + "dynamic execution refused by policy rule {rule} (matched: {excerpt})" + ) + } else { + write!( + f, + "dynamic execution refused by policy rule {rule} (matched {field}: {excerpt})" + ) + } + } } } } diff --git a/src/server/debug.rs b/src/server/debug.rs index c118fcb5..ac5b9cb6 100644 --- a/src/server/debug.rs +++ b/src/server/debug.rs @@ -809,6 +809,8 @@ pub struct CalleeSiteView { pub qualifier: Option, #[serde(skip_serializing_if = "is_zero_u32")] pub ordinal: u32, + #[serde(skip_serializing_if = "Option::is_none")] + pub span: Option<(u32, u32)>, } fn is_zero_u32(n: &u32) -> bool { @@ -884,6 +886,7 @@ impl FuncSummaryView { receiver: c.receiver.clone(), qualifier: c.qualifier.clone(), ordinal: c.ordinal, + span: c.span, }) .collect(), ssa_summary: ssa_view, diff --git a/src/summary/mod.rs b/src/summary/mod.rs index 0c89c46d..c4f008f3 100644 --- a/src/summary/mod.rs +++ b/src/summary/mod.rs @@ -191,6 +191,11 @@ const SYNTHETIC_DISAMBIG_BIT: u32 = 0x8000_0000; /// * `ordinal`, the per-function call ordinal matching /// `CallMeta.call_ordinal`, allowing cross-file consumers to address a /// specific call site rather than just a callee name. +/// * `span`, optional 1-based `(line, col)` source coordinate of the call +/// expression, populated at CFG-build time when source bytes are +/// available. `None` for legacy summaries loaded from SQLite that +/// pre-date the span field, and for synthetic test fixtures that build +/// `CalleeSite` values directly. #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Hash)] pub struct CalleeSite { pub name: String, @@ -202,6 +207,8 @@ pub struct CalleeSite { pub qualifier: Option, #[serde(default, skip_serializing_if = "is_zero_u32")] pub ordinal: u32, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub span: Option<(u32, u32)>, } fn is_zero_u32(n: &u32) -> bool { diff --git a/src/summary/tests.rs b/src/summary/tests.rs index d33d7e84..e4e943f9 100644 --- a/src/summary/tests.rs +++ b/src/summary/tests.rs @@ -1791,6 +1791,7 @@ fn callee_site_structured_roundtrip() { receiver: Some("obj".into()), qualifier: None, ordinal: 1, + ..Default::default() }, CalleeSite { name: "env::var".into(), @@ -1798,6 +1799,7 @@ fn callee_site_structured_roundtrip() { receiver: None, qualifier: Some("env".into()), ordinal: 2, + ..Default::default() }, ], ..Default::default() diff --git a/src/surface/datastore.rs b/src/surface/datastore.rs index 7675db4b..7a72f050 100644 --- a/src/surface/datastore.rs +++ b/src/surface/datastore.rs @@ -13,7 +13,7 @@ //! that fires on its own. use super::{DataStore, DataStoreKind, SourceLocation, SurfaceNode}; -use crate::summary::{FuncSummary, GlobalSummaries}; +use crate::summary::{CalleeSite, FuncSummary, GlobalSummaries}; /// One detection rule: leaf-name pattern → store kind + label. Stored /// as a flat list so adding a new ORM / driver is a one-line edit. @@ -108,7 +108,7 @@ pub fn detect_data_stores(summaries: &GlobalSummaries) -> Vec { let Some(rule) = match_rule(&callee.name) else { continue; }; - let location = call_site_location(summary, callee.ordinal); + let location = call_site_location(summary, callee); let dedup = ( location.file.clone(), location.line, @@ -148,22 +148,23 @@ fn match_rule(callee: &str) -> Option<&'static DriverRule> { }) } -/// Best-effort source location for a call site. We only have file + -/// (sometimes) sink-attribution metadata on `FuncSummary`, so the -/// location falls back to the function's file with line 0 when no -/// finer-grained data is available. -fn call_site_location(summary: &FuncSummary, _ordinal: u32) -> SourceLocation { +/// Source location of a call site. Reads the 1-based `(line, col)` +/// recorded on the [`CalleeSite`] at CFG-build time (populated for every +/// summary produced after the span field landed); for legacy summaries +/// loaded from SQLite with no span, falls back to the function's host +/// file with line 0. +fn call_site_location(summary: &FuncSummary, callee: &CalleeSite) -> SourceLocation { + let (line, col) = callee.span.unwrap_or((0, 0)); SourceLocation { file: summary.file_path.clone(), - line: 0, - col: 0, + line, + col, } } #[cfg(test)] mod tests { use super::*; - use crate::summary::CalleeSite; use crate::symbol::{FuncKey, Lang}; fn summary_with_callees(name: &str, file: &str, callees: &[&str]) -> (FuncKey, FuncSummary) { @@ -182,6 +183,33 @@ mod tests { (key, summary) } + #[test] + fn datastore_carries_callee_span_when_present() { + // When the CFG populates `CalleeSite.span`, the detected datastore + // node's `SourceLocation` must reflect that 1-based `(line, col)` + // — not the legacy `(0, 0)` fallback. + let mut gs = GlobalSummaries::new(); + let key = FuncKey::new_function(Lang::Python, "app.py", "init", None); + let mut callee = CalleeSite::bare("psycopg2.connect"); + callee.span = Some((42, 13)); + let summary = FuncSummary { + name: "init".into(), + file_path: "app.py".into(), + lang: "python".into(), + param_count: 0, + callees: vec![callee], + ..Default::default() + }; + gs.insert(key, summary); + let nodes = detect_data_stores(&gs); + assert_eq!(nodes.len(), 1); + let SurfaceNode::DataStore(ds) = &nodes[0] else { + panic!() + }; + assert_eq!(ds.location.line, 42); + assert_eq!(ds.location.col, 13); + } + #[test] fn detects_psycopg2_connect() { let mut gs = GlobalSummaries::new(); diff --git a/src/surface/external.rs b/src/surface/external.rs index 6700c108..1bba2fbc 100644 --- a/src/surface/external.rs +++ b/src/surface/external.rs @@ -9,7 +9,7 @@ use super::{ExternalService, ExternalServiceKind, SourceLocation, SurfaceNode}; use crate::labels::Cap; -use crate::summary::{FuncSummary, GlobalSummaries}; +use crate::summary::{CalleeSite, FuncSummary, GlobalSummaries}; struct ClientRule { leaf: &'static str, @@ -87,7 +87,7 @@ pub fn detect_external_services(summaries: &GlobalSummaries) -> Vec let Some(rule) = match_rule(&callee.name) else { continue; }; - let location = call_site_location(summary); + let location = call_site_location(summary, Some(callee)); if !seen.insert((location.file.clone(), rule.label.to_string())) { continue; } @@ -104,7 +104,7 @@ pub fn detect_external_services(summaries: &GlobalSummaries) -> Vec // file as the location and synthesise a generic label. for (_key, summary) in summaries.iter() { if summary.sink_caps().contains(Cap::SSRF) { - let loc = call_site_location(summary); + let loc = call_site_location(summary, None); let dedup = (loc.file.clone(), "Outbound HTTP".to_string()); if seen.insert(dedup) { out.push(SurfaceNode::ExternalService(ExternalService { @@ -134,11 +134,16 @@ fn match_rule(callee: &str) -> Option<&'static ClientRule> { }) } -fn call_site_location(summary: &FuncSummary) -> SourceLocation { +/// Source location of an external-service call site. Reads the 1-based +/// `(line, col)` recorded on the [`CalleeSite`] at CFG-build time when +/// available; otherwise (sink-cap–only fallback path, or legacy summaries +/// loaded from SQLite) returns the function's host file with line 0. +fn call_site_location(summary: &FuncSummary, callee: Option<&CalleeSite>) -> SourceLocation { + let (line, col) = callee.and_then(|c| c.span).unwrap_or((0, 0)); SourceLocation { file: summary.file_path.clone(), - line: 0, - col: 0, + line, + col, } } diff --git a/tests/policy_deny.rs b/tests/policy_deny.rs index b0b656a2..5962de51 100644 --- a/tests/policy_deny.rs +++ b/tests/policy_deny.rs @@ -82,8 +82,16 @@ fn credentials_rule_fires_on_aws_key_in_flow_step_snippet() { )]; diag.evidence = Some(ev); match policy::evaluate(&diag) { - PolicyDecision::Deny { rule, excerpt } => { + PolicyDecision::Deny { + rule, + field, + excerpt, + } => { assert_eq!(rule, DenyRule::CREDENTIALS); + assert!( + field.starts_with("flow_steps[") && field.ends_with(".snippet"), + "deny must record the source field, got {field:?}" + ); assert!( !excerpt.contains("AKIAFAKETEST00000000"), "excerpt must scrub the raw token, got {excerpt:?}" @@ -209,8 +217,16 @@ fn verify_finding_short_circuits_without_sandbox() { .inconclusive_reason .expect("PolicyDeniedDynamic must populate inconclusive_reason"); match reason { - InconclusiveReason::PolicyDeniedDynamic { rule, excerpt } => { + InconclusiveReason::PolicyDeniedDynamic { + rule, + field, + excerpt, + } => { assert_eq!(rule, DenyRule::CREDENTIALS); + assert!( + field.starts_with("evidence.notes["), + "deny must record the source field, got {field:?}" + ); assert!( !excerpt.contains("hunter2-supersecret-test"), "excerpt must scrub the raw secret, got {excerpt:?}"