#![allow(clippy::only_used_in_recursion, clippy::type_complexity)] use crate::auth_analysis; use crate::cfg::{Cfg, FileCfg, FuncSummaries, build_cfg, export_summaries}; use crate::cfg_analysis; use crate::commands::scan::Diag; use crate::errors::{NyxError, NyxResult}; use crate::evidence::{Evidence, FlowStep, SpanEvidence, StateEvidence}; use crate::labels::{ Cap, DataLabel, LangAnalysisRules, build_lang_rules, severity_for_source_kind, }; use crate::patterns::{FindingCategory, Severity}; use crate::state; use crate::summary::ssa_summary::SsaFuncSummary; use crate::summary::{FuncSummary, GlobalSummaries}; use crate::symbol::{Lang, normalize_namespace}; use crate::taint::analyse_file; use crate::utils::config::AnalysisMode; use crate::utils::ext::lowercase_ext; use crate::utils::{Config, query_cache}; use petgraph::graph::NodeIndex; use std::borrow::Cow; use std::cell::RefCell; use std::collections::{HashMap, HashSet}; use std::ops::ControlFlow; use std::path::Path; use std::time::Instant; use tree_sitter::{Language, QueryCursor, StreamingIterator}; thread_local! { static PARSER: RefCell = RefCell::new(tree_sitter::Parser::new()); /// Records the timeout budget (in ms) when a tree-sitter parse is /// aborted due to [`parse_timeout_ms`]. Callers that want to surface /// the event as a synthetic informational [`Diag`] read this slot /// immediately after [`ParsedSource::try_new`] returns `Ok(None)` /// and clear it with `take_last_parse_timeout_ms`. static LAST_PARSE_TIMEOUT_MS: std::cell::Cell> = const { std::cell::Cell::new(None) }; } /// Consume and return the most recent parse-timeout event on this thread /// (set by [`ParsedSource::try_new`]). Used to lift the event into a /// synthetic [`Diag`] carrying an [`crate::engine_notes::EngineNote::ParseTimeout`]. pub fn take_last_parse_timeout_ms() -> Option { LAST_PARSE_TIMEOUT_MS.with(|c| c.take()) } /// Synthesize an informational diagnostic surfacing a parse-timeout event /// for `path`. 
The diag carries an [`crate::engine_notes::EngineNote::ParseTimeout`] /// in its evidence so downstream tooling can distinguish "found nothing" /// from "parse was aborted before we could look". fn parse_timeout_diag(path: &Path, timeout_ms: u64) -> Diag { let mut evidence = Evidence::default(); evidence.notes.push(format!( "tree-sitter parse exceeded timeout budget ({timeout_ms} ms); file skipped" )); evidence .engine_notes .push(crate::engine_notes::EngineNote::ParseTimeout { timeout_ms: timeout_ms.min(u32::MAX as u64) as u32, }); Diag { path: path.to_string_lossy().into_owned(), line: 0, col: 0, severity: Severity::Low, id: "engine.parse_timeout".into(), category: FindingCategory::Quality, path_validated: false, guard_kind: None, message: Some(format!( "tree-sitter parse exceeded timeout budget ({timeout_ms} ms); file skipped" )), labels: vec![], confidence: None, evidence: Some(evidence), rank_score: None, rank_reason: None, suppressed: false, suppression: None, rollup: None, finding_id: String::new(), alternative_finding_ids: Vec::new(), } } /// Resolve the effective parse-timeout budget in milliseconds. Tree-sitter /// is generally fast, but adversarially-crafted inputs (deeply ambiguous /// grammar constructs, pathological backtracking) can drive it into slow /// parses; the default 10 s ceiling lets a 10 000-file scan survive even if /// every file is hostile. Configured via `analysis.engine.parse_timeout_ms` /// in `nyx.conf` (or `--parse-timeout-ms` on the CLI); `0` disables the cap. fn parse_timeout_ms() -> u64 { crate::utils::analysis_options::current().parse_timeout_ms } /// Test-only: when the `NYX_TEST_FORCE_PANIC_PATH` env var is set, any file /// path containing that substring triggers a deterministic panic here. Used /// by `tests/panic_recovery_tests.rs` to exercise per-file panic behaviour in /// the scan pipeline. 
The env var is re-read each call so successive tests /// in the same process can toggle injection; `std::env::var` is an in-memory /// lookup on supported platforms so the overhead is negligible. fn maybe_inject_test_panic(path: &Path) { if let Ok(marker) = std::env::var("NYX_TEST_FORCE_PANIC_PATH") && !marker.is_empty() && path.to_string_lossy().contains(marker.as_str()) { panic!( "NYX_TEST_FORCE_PANIC_PATH injection: {} matches {:?}", path.display(), marker ); } } /// Convenience alias for node indices. fn byte_offset_to_point(tree: &tree_sitter::Tree, byte: usize) -> tree_sitter::Point { tree.root_node() .descendant_for_byte_range(byte, byte) .map(|n| n.start_position()) .unwrap_or_else(|| tree_sitter::Point { row: 0, column: 0 }) } /// Extract the source line containing `byte_offset`, trimmed and capped at 120 chars. fn extract_line_snippet(src: &[u8], byte_offset: usize) -> Option { if byte_offset >= src.len() { return None; } let line_start = src[..byte_offset] .iter() .rposition(|&b| b == b'\n') .map_or(0, |p| p + 1); let line_end = src[byte_offset..] .iter() .position(|&b| b == b'\n') .map_or(src.len(), |p| byte_offset + p); let line = std::str::from_utf8(&src[line_start..line_end]).ok()?; let trimmed = line.trim(); if trimmed.is_empty() { return None; } if trimmed.len() > 120 { Some(format!("{}...", &trimmed[..120])) } else { Some(trimmed.to_string()) } } /// Resolve a `file_rel` (relative to `scan_root` per /// [`normalize_namespace`] convention) back to the absolute path the /// diagnostic pipeline expects. /// /// * Empty `file_rel` — single-file scans normalize every namespace to /// `""`; treat that as "the file under analysis" and return /// `fallback.to_string_lossy()`. /// * `scan_root` absent — we have no workspace root to resolve against; /// return `file_rel` verbatim (it may already be absolute). /// * Otherwise — join `scan_root` with `file_rel`. 
fn resolve_file_rel(file_rel: &str, scan_root: Option<&Path>, fallback: &Path) -> String { if file_rel.is_empty() { return fallback.to_string_lossy().into_owned(); } match scan_root { Some(root) => root.join(file_rel).to_string_lossy().into_owned(), None => file_rel.to_string(), } } /// Build a [`Diag`] from a taint [`Finding`], the CFG that produced it, /// the parsed tree (for byte→line/col conversion) and the file path. fn build_taint_diag( finding: &crate::taint::Finding, cfg_graph: &crate::cfg::Cfg, tree: &tree_sitter::Tree, path: &Path, src: &[u8], scan_root: Option<&Path>, ) -> Diag { let call_site_byte = cfg_graph[finding.sink].classification_span().0; let call_site_point = byte_offset_to_point(tree, call_site_byte); // `finding.source` should be a NodeIndex valid in this body's CFG, but // cross-body / cross-file inline analysis has historically leaked // callee-NodeIndex origins (see `extract_inline_return_taint`). Guard // the lookup so a stray out-of-bounds index degrades the diagnostic // rather than panicking the worker thread. let source_info = cfg_graph.node_weight(finding.source); // The reconstructed flow path is the authoritative view of where the // taint started *in this body*. When present, prefer its first step's // CFG span over `finding.source_span` — which can be stale across // multi-hop cross-body remaps (e.g. JS two-level solve where a // callee-interior source gets its span rewritten to the enclosing // body's entry node). Fall back to `source_span`, then to the source // NodeIndex, then finally to the sink byte. let source_byte = finding .flow_steps .first() .and_then(|s| { cfg_graph .node_weight(s.cfg_node) .map(|i| i.classification_span().0) }) .or(finding.source_span) .or_else(|| source_info.map(|i| i.classification_span().0)) .unwrap_or(call_site_byte); let source_point = byte_offset_to_point(tree, source_byte); // Prefer the source CFG node's callee string when it's a call expression // (e.g. `os.getenv("X")`). 
For property-access sources like // `navigator.userAgent` there is no callee — fall back to the first flow // step's `variable` (the SSA var name, e.g. "userAgent"), then to the // source node's `taint.defines` / first `taint.uses` entry, before // finally giving up and rendering "(unknown)". let source_callee = source_info .and_then(|i| i.call.callee.as_deref()) .map(sanitize_desc) .or_else(|| { finding .flow_steps .first() .and_then(|s| s.var_name.as_deref()) .map(sanitize_desc) }) .or_else(|| { source_info .and_then(|i| i.taint.defines.as_deref()) .map(sanitize_desc) }) .or_else(|| { source_info .and_then(|i| i.taint.uses.first().map(String::as_str)) .map(sanitize_desc) }) .unwrap_or_else(|| "(unknown)".into()); let call_site_callee = cfg_graph[finding.sink] .call .callee .as_deref() .map(sanitize_desc) .unwrap_or_else(|| "(unknown)".into()); let kind_label = source_kind_label(finding.source_kind); let file_path_owned = path.to_string_lossy().into_owned(); // Primary-location attribution: when the sink was resolved via a // callee summary that carried a [`SinkSite`], `finding.primary_location` // names the dangerous instruction inside the callee body. Use those // coordinates as the diag's primary (file, line, col); otherwise fall // back to the caller's call-site position. 
let (primary_path, primary_line, primary_col, primary_snippet_hint) = if let Some(loc) = finding.primary_location.as_ref() { let abs = resolve_file_rel(&loc.file_rel, scan_root, path); if abs != file_path_owned { tracing::debug!( caller_file = %file_path_owned, primary_file = %abs, primary_line = loc.line, "taint finding attributed to a cross-file primary sink location", ); } let snippet = if loc.snippet.is_empty() { None } else { Some(loc.snippet.clone()) }; (abs, loc.line as usize, loc.col as usize, snippet) } else { ( file_path_owned.clone(), call_site_point.row + 1, call_site_point.column + 1, None, ) }; let short_source = crate::fmt::shorten_callee(&source_callee); let short_call_site = crate::fmt::shorten_callee(&call_site_callee); let sink_display = primary_snippet_hint .as_deref() .map(crate::fmt::shorten_callee) .unwrap_or_else(|| short_call_site.clone()); let sink_label_display = if finding.primary_location.is_some() { format!("{call_site_callee} \u{2192} {sink_display}") } else { call_site_callee.clone() }; let mut labels = vec![ ( "Source".into(), format!( "{source_callee} ({}:{})", source_point.row + 1, source_point.column + 1 ), ), ("Sink".into(), sink_label_display), ]; if let Some(guard) = finding.guard_kind { labels.push(("Path guard".into(), format!("{guard:?}"))); } let mut evidence_notes = Vec::new(); if finding.path_validated { evidence_notes.push("path_validated".into()); } evidence_notes.push(format!("source_kind:{:?}", finding.source_kind)); evidence_notes.push(format!("hop_count:{}", finding.hop_count)); evidence_notes.push(format!("cap_specificity:{}", finding.cap_specificity)); if finding.uses_summary { evidence_notes.push("uses_summary".into()); } // Convert raw flow steps to display FlowSteps. 
When the finding has a // primary_location distinct from the call site, the last raw step is // really the Call — reclassify it and append a synthetic Sink step // pointing at the callee-internal dangerous instruction so analysts // see both the call site and the final sink in the trace. let mut flow_steps: Vec = finding .flow_steps .iter() .enumerate() .map(|(i, raw)| { let step_byte = cfg_graph[raw.cfg_node].classification_span().0; let point = byte_offset_to_point(tree, step_byte); let snippet = extract_line_snippet(src, step_byte); let callee = cfg_graph[raw.cfg_node].call.callee.clone(); let function = cfg_graph[raw.cfg_node].ast.enclosing_func.clone(); FlowStep { step: (i + 1) as u32, kind: raw.op_kind.clone(), file: file_path_owned.clone(), line: (point.row + 1) as u32, col: (point.column + 1) as u32, snippet, variable: raw.var_name.clone(), callee, function, is_cross_file: false, } }) .collect(); if let Some(loc) = finding.primary_location.as_ref() { if let Some(last) = flow_steps.last_mut() && matches!(last.kind, crate::evidence::FlowStepKind::Sink) { last.kind = crate::evidence::FlowStepKind::Call; } let is_cross_file = primary_path != file_path_owned; let synthetic_snippet = if loc.snippet.is_empty() { None } else { Some(loc.snippet.clone()) }; let next_step = (flow_steps.len() + 1) as u32; flow_steps.push(FlowStep { step: next_step, kind: crate::evidence::FlowStepKind::Sink, file: primary_path.clone(), line: loc.line, col: loc.col, snippet: synthetic_snippet, variable: None, callee: None, function: None, is_cross_file, }); } let sink_evidence_snippet = primary_snippet_hint .clone() .or_else(|| Some(short_call_site.clone())); // Resolved sink capability bits — used by deduplication to distinguish // sinks with different cap types on the same source line (e.g. // `sink_sql(x); sink_shell(x);`). 
let sink_caps_bits: u16 = cfg_graph[finding.sink] .taint .labels .iter() .filter_map(|l| match l { crate::labels::DataLabel::Sink(c) => Some(c.bits()), _ => None, }) .fold(0u16, |acc, b| acc | b); // Phase C: when the sink's required caps include UNAUTHORIZED_ID — and // the finding actually reached that sink via the taint engine — use a // dedicated auth rule id so the finding is namespaced alongside the // standalone `auth_analysis` subsystem's output instead of being folded // into the generic `taint-unsanitised-flow` bucket. let diag_id = if sink_caps_bits & crate::labels::Cap::UNAUTHORIZED_ID.bits() != 0 { "rs.auth.missing_ownership_check.taint".to_string() } else { format!( "taint-unsanitised-flow (source {}:{})", source_point.row + 1, source_point.column + 1 ) }; let mut diag = Diag { path: primary_path.clone(), line: primary_line, col: primary_col, severity: severity_for_source_kind(finding.source_kind), id: diag_id, category: FindingCategory::Security, path_validated: finding.path_validated, guard_kind: finding.guard_kind.map(|k| format!("{k:?}")), message: Some(format!( "unsanitised {kind_label} flows from {short_source} \u{2192} {sink_display}" )), labels, confidence: None, evidence: Some(Evidence { source: Some(SpanEvidence { path: file_path_owned.clone(), line: (source_point.row + 1) as u32, col: (source_point.column + 1) as u32, kind: "source".into(), snippet: Some(short_source.clone()), }), sink: Some(SpanEvidence { path: primary_path.clone(), line: primary_line as u32, col: primary_col as u32, kind: "sink".into(), snippet: sink_evidence_snippet, }), guards: finding .guard_kind .map(|g| { vec![SpanEvidence { path: primary_path.clone(), line: primary_line as u32, col: 0, kind: "guard".into(), snippet: Some(format!("{g:?}")), }] }) .unwrap_or_default(), sanitizers: vec![], state: None, notes: evidence_notes, source_kind: Some(finding.source_kind), hop_count: Some(finding.hop_count), uses_summary: finding.uses_summary, cap_specificity: 
Some(finding.cap_specificity), flow_steps, symbolic: finding.symbolic.clone(), sink_caps: sink_caps_bits, engine_notes: finding.engine_notes.clone(), ..Default::default() }), rank_score: None, rank_reason: None, suppressed: false, suppression: None, rollup: None, finding_id: finding.finding_id.clone(), alternative_finding_ids: finding.alternative_finding_ids.to_vec(), }; // Post-fill explanation and confidence limiters let explanation = crate::evidence::generate_explanation(&diag); let limiters = crate::evidence::compute_confidence_limiters(&diag); if let Some(ref mut ev) = diag.evidence { ev.explanation = explanation; ev.confidence_limiters = limiters; } diag } /// Resolve a file extension to a (tree‑sitter Language, slug) pair. fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> { match lowercase_ext(path) { Some("rs") => Some((Language::from(tree_sitter_rust::LANGUAGE), "rust")), Some("c") => Some((Language::from(tree_sitter_c::LANGUAGE), "c")), Some("cpp") => Some((Language::from(tree_sitter_cpp::LANGUAGE), "cpp")), Some("java") => Some((Language::from(tree_sitter_java::LANGUAGE), "java")), Some("go") => Some((Language::from(tree_sitter_go::LANGUAGE), "go")), Some("php") => Some((Language::from(tree_sitter_php::LANGUAGE_PHP), "php")), Some("py") => Some((Language::from(tree_sitter_python::LANGUAGE), "python")), Some("ts") => Some(( Language::from(tree_sitter_typescript::LANGUAGE_TYPESCRIPT), "typescript", )), // TSX grammar is a superset of TypeScript plus JSX element/attribute // nodes — all TypeScript KINDS / RULES / PARAM_CONFIG entries apply, // and JSX-specific sinks (e.g. `dangerouslySetInnerHTML`) layer on top // via the same `typescript` slug. 
Some("tsx") => Some(( Language::from(tree_sitter_typescript::LANGUAGE_TSX), "typescript", )), Some("js") => Some(( Language::from(tree_sitter_javascript::LANGUAGE), "javascript", )), // JSX uses the same JavaScript grammar (tree-sitter-javascript handles // JSX natively) — slug "javascript" so all JS rules apply. Some("jsx") => Some(( Language::from(tree_sitter_javascript::LANGUAGE), "javascript", )), Some("rb") => Some((Language::from(tree_sitter_ruby::LANGUAGE), "ruby")), _ => None, } } /// Fast binary-file guard: skip if >1% NUL bytes. fn is_binary(bytes: &[u8]) -> bool { bytes.iter().filter(|b| **b == 0).count() * 100 / bytes.len().max(1) > 1 } /// Check if a file path indicates a test file. Matches filename-based /// conventions (`.test.js`, `.spec.ts`) and the `__tests__` directory /// convention. Directory-only checks (`test/`, `tests/`, `fixtures/`) /// are intentionally excluded because they're too broad when scanning /// absolute paths. fn is_test_file(path: &Path) -> bool { static TEST_SUFFIXES: &[&str] = &[ ".test.js", ".test.ts", ".test.jsx", ".test.tsx", ".spec.js", ".spec.ts", ".spec.jsx", ".spec.tsx", ]; if let Some(name) = path.file_name().and_then(|n| n.to_str()) { for suffix in TEST_SUFFIXES { if name.ends_with(suffix) { return true; } } } // __tests__ is specific enough (React/Jest convention) to match on directory for component in path.components() { if let std::path::Component::Normal(c) = component && c == "__tests__" { return true; } } false } /// Pattern IDs that are noise-prone in test files (fixture credentials, /// non-crypto randomness, plain HTTP in test harnesses). fn is_test_suppressible_pattern(id: &str) -> bool { // Suffix-match to handle both js. and ts. prefixes id.ends_with(".secrets.hardcoded_secret") || id.ends_with(".crypto.math_random") || id.ends_with(".transport.fetch_http") } /// Check if a file path belongs to a non-production context (tests, vendor, /// benchmarks, etc.). 
Used to downgrade severity for findings in paths that /// are unlikely to represent attack surface. fn is_nonprod_path(path: &Path) -> bool { static NONPROD_DIRS: &[&str] = &[ "tests", "test", "__tests__", "benches", "benchmarks", "examples", "build", "scripts", "docs", "js_tests", "fixtures", "vendor", ]; static NONPROD_FILES: &[&str] = &["build.rs"]; if let Some(name) = path.file_name().and_then(|n| n.to_str()) && (NONPROD_FILES.contains(&name) || name.ends_with(".min.js")) { return true; } for component in path.components() { if let std::path::Component::Normal(c) = component && let Some(s) = c.to_str() && NONPROD_DIRS.contains(&s) { return true; } } false } /// Normalize a callee description for display. fn sanitize_desc(s: &str) -> String { crate::fmt::normalize_snippet(s) } /// Human-readable label for a `SourceKind`. fn source_kind_label(sk: crate::labels::SourceKind) -> &'static str { use crate::labels::SourceKind; match sk { SourceKind::UserInput => "user input", SourceKind::EnvironmentConfig => "environment config", SourceKind::FileSystem => "file system data", SourceKind::Database => "database result", SourceKind::CaughtException => "caught exception", SourceKind::Unknown => "tainted data", } } /// Downgrade severity by one tier: High→Medium, Medium→Low, Low→Low. fn downgrade_severity(s: Severity) -> Severity { match s { Severity::High => Severity::Medium, Severity::Medium => Severity::Low, Severity::Low => Severity::Low, } } // ───────────────────────────────────────────────────────────────────────────── // ParsedSource + ParsedFile: shared parse/CFG pipeline // ───────────────────────────────────────────────────────────────────────────── /// Level 1: parsed tree + lang info. No CFG construction. struct ParsedSource<'a> { tree: tree_sitter::Tree, ts_lang: Language, lang_slug: &'static str, bytes: &'a [u8], path: &'a Path, file_path_str: Cow<'a, str>, } impl<'a> ParsedSource<'a> { /// Parse bytes into a tree-sitter AST. 
Returns `None` for binary files, /// parse timeouts, or unsupported languages. File-size filtering is /// handled at the walker boundary via /// [`ScannerConfig::max_file_size_mb`]; the timeout check here defends /// against hostile inputs (pathological grammar ambiguities) that could /// tie up a worker indefinitely even for files within the size cap. fn try_new(bytes: &'a [u8], path: &'a Path) -> NyxResult> { // Clear any stale parse-timeout signal from a prior `try_new` on // this thread that the caller did not consume. Ensures the slot // always reflects "this parse" by the time we return. LAST_PARSE_TIMEOUT_MS.with(|c| c.set(None)); if is_binary(bytes) { return Ok(None); } let Some((ts_lang, lang_slug)) = lang_for_path(path) else { return Ok(None); }; let timeout_ms = parse_timeout_ms(); let start = Instant::now(); let mut timed_out = false; let parsed = PARSER.with(|cell| -> NyxResult> { let mut parser = cell.borrow_mut(); parser.set_language(&ts_lang)?; if timeout_ms == 0 { return Ok(parser.parse(bytes, None)); } let len = bytes.len(); let mut input = |i: usize, _pt: tree_sitter::Point| -> &[u8] { if i < len { &bytes[i..] } else { &[] } }; let mut progress = |_state: &tree_sitter::ParseState| -> ControlFlow<()> { if start.elapsed().as_millis() as u64 >= timeout_ms { timed_out = true; ControlFlow::Break(()) } else { ControlFlow::Continue(()) } }; let options = tree_sitter::ParseOptions::new().progress_callback(&mut progress); Ok(parser.parse_with_options(&mut input, None, Some(options))) })?; let Some(tree) = parsed else { if timed_out { tracing::warn!( file = %path.display(), timeout_ms, "tree-sitter parse timed out; skipping file", ); LAST_PARSE_TIMEOUT_MS.with(|c| c.set(Some(timeout_ms))); return Ok(None); } return Err(NyxError::Other("tree-sitter failed".into())); }; let file_path_str = path.to_string_lossy(); Ok(Some(Self { tree, ts_lang, lang_slug, bytes, path, file_path_str, })) } /// Run AST pattern queries and return diagnostics. 
fn run_ast_queries(&self, cfg: &Config) -> Vec { let root = self.tree.root_node(); let compiled = query_cache::for_lang(self.lang_slug, self.ts_lang.clone()); let mut cursor = QueryCursor::new(); let mut out = Vec::new(); let in_test_file = is_test_file(self.path); for cq in compiled.iter() { if cq.meta.severity > cfg.scanner.min_severity { continue; } // Suppress noise-prone patterns in test files if in_test_file && is_test_suppressible_pattern(cq.meta.id) { continue; } let mut matches = cursor.matches(&cq.query, root, self.bytes); while let Some(m) = matches.next() { if let Some(cap) = m.captures.iter().find(|c| c.index == 0) { // Layer A: suppress Security findings on calls with all-literal args if cq.meta.category.finding_category() == FindingCategory::Security && is_call_all_args_literal(cap.node, self.bytes) { continue; } let point = cap.node.start_position(); out.push(Diag { path: self.path.to_string_lossy().into_owned(), line: point.row + 1, col: point.column + 1, severity: cq.meta.severity, id: cq.meta.id.to_owned(), category: cq.meta.category.finding_category(), path_validated: false, guard_kind: None, message: Some(cq.meta.description.to_owned()), labels: vec![], confidence: Some(cq.meta.confidence), evidence: Some(Evidence { source: None, sink: Some(SpanEvidence { path: self.path.to_string_lossy().into_owned(), line: (point.row + 1) as u32, col: (point.column + 1) as u32, kind: "sink".into(), snippet: None, }), guards: vec![], sanitizers: vec![], state: None, notes: vec![], ..Default::default() }), rank_score: None, rank_reason: None, suppressed: false, suppression: None, rollup: None, finding_id: String::new(), alternative_finding_ids: Vec::new(), }); } } } out } /// Sort, dedup, and optionally downgrade severity for non-production paths. /// /// Dedup key matches the `issues` table PRIMARY KEY `(file_id, rule_id, /// line, col)` — severity is NOT part of the key. Two diags that agree /// on (line, col, id) but differ in severity (e.g. 
a pattern-rule finding /// plus a taint-pipeline finding on the same call) would otherwise survive /// dedup here and crash the indexer with a UNIQUE constraint violation. /// Sorting severity ascending (Severity::High < Medium < Low) means /// `dedup_by` keeps the first occurrence, preserving the highest severity. fn finalize_diags(&self, out: &mut Vec, cfg: &Config) { out.sort_by(|a, b| { (a.line, a.col, &a.id, a.severity).cmp(&(b.line, b.col, &b.id, b.severity)) }); out.dedup_by(|a, b| a.line == b.line && a.col == b.col && a.id == b.id); if !cfg.scanner.include_nonprod && is_nonprod_path(self.path) { for d in out.iter_mut() { d.severity = downgrade_severity(d.severity); } } } } /// Level 2: adds CFG graph, summaries, lang rules on top of ParsedSource. struct ParsedFile<'a> { source: ParsedSource<'a>, file_cfg: FileCfg, lang_rules: LangAnalysisRules, has_lang_rules: bool, } impl<'a> ParsedFile<'a> { /// Build CFG + lang rules from a parsed source. fn from_source(source: ParsedSource<'a>, cfg: &Config) -> Self { let mut lang_rules = build_lang_rules(cfg, source.lang_slug); // Single-file scans rarely have a nearby package.json, so the // project-level `FrameworkContext` misses frameworks the file // obviously imports. Augment the per-file rule set with any // framework-conditional rules keyed off in-file import specifiers // (e.g. `import fastify from 'fastify'`). Idempotent — skips // frameworks already active from the manifest pass. 
let in_file_fws = crate::utils::project::detect_in_file_frameworks(source.bytes, source.lang_slug); let missing: Vec<_> = in_file_fws .into_iter() .filter(|fw| !lang_rules.frameworks.contains(fw)) .collect(); if !missing.is_empty() { let aug_ctx = crate::utils::project::FrameworkContext { frameworks: missing.clone(), }; lang_rules .extra_labels .extend(crate::labels::framework_rules_for_lang_pub( source.lang_slug, &aug_ctx, )); lang_rules.frameworks.extend(missing); } let has_lang_rules = !lang_rules.extra_labels.is_empty() || !lang_rules.terminators.is_empty() || !lang_rules.event_handlers.is_empty(); let rules_ref = if has_lang_rules { Some(&lang_rules) } else { None }; let file_cfg = build_cfg( &source.tree, source.bytes, source.lang_slug, &source.file_path_str, rules_ref, ); Self { source, file_cfg, lang_rules, has_lang_rules, } } /// The top-level body's CFG graph (for backward-compatible access). fn cfg_graph(&self) -> &Cfg { &self.file_cfg.toplevel().graph } /// The top-level body's entry node. #[allow(dead_code)] fn entry(&self) -> NodeIndex { self.file_cfg.toplevel().entry } fn local_summaries(&self) -> &FuncSummaries { &self.file_cfg.summaries } fn rules_ref(&self) -> Option<&LangAnalysisRules> { if self.has_lang_rules { Some(&self.lang_rules) } else { None } } fn export_summaries(&self) -> Vec { self.export_summaries_with_root(None) } fn export_summaries_with_root(&self, scan_root: Option<&Path>) -> Vec { let mut out = export_summaries( self.local_summaries(), &self.source.file_path_str, self.source.lang_slug, ); // Rust-specific enrichment: derive the crate-relative module path for // this file and parse every top-level `use` declaration into an alias // map. The information lets the call graph resolve same-name functions // across modules and is cheap enough to compute once per file and // duplicate across the file's summaries. Non-Rust files skip all of // this and keep the new fields at `None`. 
if self.source.lang_slug == "rust" && !out.is_empty() {
    let module_path = crate::rust_resolve::derive_module_path(self.source.path, scan_root);
    let use_map = crate::rust_resolve::parse_rust_use_map(self.source.bytes, &self.source.tree);
    let aliases = if use_map.aliases.is_empty() {
        None
    } else {
        Some(use_map.aliases)
    };
    let wildcards = if use_map.wildcards.is_empty() {
        None
    } else {
        Some(use_map.wildcards)
    };
    for s in &mut out {
        s.module_path = module_path.clone();
        s.rust_use_map = aliases.clone();
        s.rust_wildcards = wildcards.clone();
    }
}
out
}

/// Extract SSA function summaries and eligible callee bodies for every
/// function in this file in a single lowering pass.
///
/// Returns two vectors keyed by canonical [`crate::symbol::FuncKey`].
/// The `FuncKey` identity preserves `(lang, namespace, container, name,
/// arity, disambig, kind)` — so two same-name definitions in this file
/// (e.g. a free `process` and a `Worker::process`, or overloads with
/// different arities) land on distinct entries instead of the later one
/// shadowing the earlier one.
///
/// * `global_summaries` — cross-file summaries forwarded unchanged to the
///   extractor; `None` is accepted.
/// * `scan_root` — when present, its lossy string form is passed to
///   [`normalize_namespace`] together with the file path.
fn extract_ssa_artifacts(
    &self,
    global_summaries: Option<&GlobalSummaries>,
    scan_root: Option<&Path>,
) -> (
    Vec<(crate::symbol::FuncKey, SsaFuncSummary)>,
    Vec<(
        crate::symbol::FuncKey,
        crate::taint::ssa_transfer::CalleeSsaBody,
    )>,
) {
    // A slug that `Lang::from_slug` does not recognise falls back to `Lang::Rust`.
    let caller_lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust);
    let scan_root_str = scan_root.map(|p| p.to_string_lossy());
    let namespace = normalize_namespace(&self.source.file_path_str, scan_root_str.as_deref());
    // Use the FileCfg path (same one `analyse_file` uses at taint time) so
    // the SSA summaries stored cross-file match exactly what pass 2 will
    // resolve against — no NodeIndex-space or entry-detection drift.
    let locator = crate::summary::SinkSiteLocator {
        tree: &self.source.tree,
        bytes: self.source.bytes,
        file_rel: &namespace,
    };
    let (summaries, bodies) = crate::taint::extract_ssa_artifacts_from_file_cfg(
        &self.file_cfg,
        caller_lang,
        &namespace,
        self.local_summaries(),
        global_summaries,
        Some(&locator),
    );
    // The summaries are re-collected into the `Vec` of pairs the signature
    // promises; the bodies are passed through untouched.
    (summaries.into_iter().collect(), bodies)
}

/// Run taint analysis, CFG structural analyses, and state-model analysis.
///
/// Diagnostics are appended in that order: taint findings, then the
/// per-body `cfg_analysis` findings, then — only when
/// `cfg.scanner.enable_state_analysis` is set — the per-body state-model
/// findings. When the state pass produced at least one finding, any
/// `cfg-resource-leak` / `cfg-auth-gap` diagnostic on a line that also
/// carries a state finding is removed again.
///
/// * `cfg` — scanner configuration; this method reads
///   `scanner.enable_state_analysis` and `scanner.enable_auth_analysis`.
/// * `global_summaries` — cross-file summaries forwarded to every engine.
/// * `scan_root` — used to normalise the namespace and forwarded to
///   `build_taint_diag`.
fn run_cfg_analyses(
    &self,
    cfg: &Config,
    global_summaries: Option<&GlobalSummaries>,
    scan_root: Option<&Path>,
) -> Vec {
    let mut out = Vec::new();
    // A slug that `Lang::from_slug` does not recognise falls back to `Lang::Rust`.
    let caller_lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust);
    // ── Taint analysis ──────────────────────────────────────────────
    tracing::debug!("Running taint analysis on: {}", self.source.path.display());
    tracing::debug!("Func summaries: {:?}", self.local_summaries());
    let scan_root_str = scan_root.map(|p| p.to_string_lossy());
    let namespace = normalize_namespace(&self.source.file_path_str, scan_root_str.as_deref());
    // Pass the language rules' extra labels only when there are any.
    let extra = if self.lang_rules.extra_labels.is_empty() {
        None
    } else {
        Some(self.lang_rules.extra_labels.as_slice())
    };
    let taint_results = analyse_file(
        &self.file_cfg,
        self.local_summaries(),
        global_summaries,
        caller_lang,
        &namespace,
        &[],
        extra,
    );
    // Drain the path-safe-suppressed sink-span set published by the
    // SSA taint engine. Used below by the state-analysis pass to
    // suppress `state-unauthed-access` on sinks the taint engine has
    // already proved cannot reach a privileged location.
    let path_safe_suppressed_spans = crate::taint::ssa_transfer::take_path_safe_suppressed_spans();
    for finding in &taint_results {
        let body_cfg = &self.file_cfg.body(finding.body_id).graph;
        // Suppress internal redirect taint findings: res.redirect(`/path/...`)
        // with a path-prefix argument is server-relative, not an open redirect.
        let sink_info = &body_cfg[finding.sink];
        let sink_has_ssrf = sink_info
            .taint
            .labels
            .iter()
            .any(|l| matches!(l, DataLabel::Sink(c) if c.contains(Cap::SSRF)));
        // All four conditions must hold: SSRF-capable sink, a known callee,
        // a callee name ending in `redirect`/`Redirect`, and a path-prefix
        // argument as judged by `has_redirect_path_prefix`.
        if sink_has_ssrf
            && let Some(ref callee) = sink_info.call.callee
            && (callee.ends_with("redirect") || callee.ends_with("Redirect"))
            && crate::cfg_analysis::guards::has_redirect_path_prefix(
                self.source.bytes,
                sink_info.ast.span,
            )
        {
            continue;
        }
        out.push(build_taint_diag(
            finding,
            body_cfg,
            &self.source.tree,
            self.source.path,
            self.source.bytes,
            scan_root,
        ));
    }
    // ── CFG structural analyses (per body) ─────────────────────────
    // Taint counts as "active" when cross-file summaries were supplied or
    // the taint pass reported anything for this file.
    let taint_active = global_summaries.is_some() || !taint_results.is_empty();
    for body in &self.file_cfg.bodies {
        // Only the taint findings that belong to this body are handed to
        // the structural analyses.
        let body_taint: Vec<_> = taint_results
            .iter()
            .filter(|f| f.body_id == body.meta.id)
            .cloned()
            .collect();
        let body_const_facts = cfg_analysis::build_body_const_facts(body, caller_lang);
        let cfg_ctx = cfg_analysis::AnalysisContext {
            cfg: &body.graph,
            entry: body.entry,
            lang: caller_lang,
            file_path: &self.source.file_path_str,
            source_bytes: self.source.bytes,
            func_summaries: self.local_summaries(),
            global_summaries,
            taint_findings: &body_taint,
            analysis_rules: self.rules_ref(),
            taint_active,
            body_const_facts: body_const_facts.as_ref(),
            type_facts: body_const_facts.as_ref().map(|f| &f.type_facts),
            auth_decorators: &body.meta.auth_decorators,
        };
        for cf in cfg_analysis::run_all(&cfg_ctx) {
            // Diagnostics are reported 1-based; the tree-sitter point is 0-based.
            let point = byte_offset_to_point(&self.source.tree, cf.span.0);
            // Translate the analysis-local confidence enum into the
            // evidence-level one, variant for variant.
            let cfg_confidence = Some(match cf.confidence {
                cfg_analysis::Confidence::High => crate::evidence::Confidence::High,
                cfg_analysis::Confidence::Medium => crate::evidence::Confidence::Medium,
                cfg_analysis::Confidence::Low => crate::evidence::Confidence::Low,
            });
            out.push(Diag {
                path: self.source.path.to_string_lossy().into_owned(),
                line: point.row + 1,
                col: point.column + 1,
                severity: cf.severity,
                id: cf.rule_id,
                category: FindingCategory::Security,
                path_validated: false,
                guard_kind: None,
                message: Some(cf.message),
                labels: vec![],
                confidence: cfg_confidence,
                evidence: Some(Evidence {
                    source: None,
                    sink: Some(SpanEvidence {
                        path: self.source.path.to_string_lossy().into_owned(),
                        line: (point.row + 1) as u32,
                        col: (point.column + 1) as u32,
                        kind: "sink".into(),
                        snippet: None,
                    }),
                    guards: vec![],
                    sanitizers: vec![],
                    state: None,
                    notes: vec![],
                    ..Default::default()
                }),
                rank_score: None,
                rank_reason: None,
                suppressed: false,
                suppression: None,
                rollup: None,
                finding_id: String::new(),
                alternative_finding_ids: Vec::new(),
            });
        }
    } // end for body in bodies (CFG structural analyses)
    // ── State-model dataflow analysis (per body) ─────────────────────
    if cfg.scanner.enable_state_analysis {
        // Built once from all bodies and shared by every per-body run below.
        let resource_method_summaries =
            state::build_resource_method_summaries(&self.file_cfg.bodies, caller_lang);
        let mut all_state_findings = Vec::new();
        for body in &self.file_cfg.bodies {
            let state_findings = state::run_state_analysis(
                &body.graph,
                body.entry,
                caller_lang,
                self.source.bytes,
                self.local_summaries(),
                global_summaries,
                cfg.scanner.enable_auth_analysis,
                &resource_method_summaries,
                &body.meta.auth_decorators,
                &path_safe_suppressed_spans,
            );
            for sf in &state_findings {
                let point = byte_offset_to_point(&self.source.tree, sf.span.0);
                out.push(Diag {
                    path: self.source.path.to_string_lossy().into_owned(),
                    line: point.row + 1,
                    col: point.column + 1,
                    severity: sf.severity,
                    id: sf.rule_id.clone(),
                    category: FindingCategory::Security,
                    path_validated: false,
                    guard_kind: None,
                    message: Some(sf.message.clone()),
                    labels: vec![],
                    // State findings carry no confidence value.
                    confidence: None,
                    evidence: Some(Evidence {
                        source: None,
                        sink: Some(SpanEvidence {
                            path: self.source.path.to_string_lossy().into_owned(),
                            line: (point.row + 1) as u32,
                            col: (point.column + 1) as u32,
                            kind: "sink".into(),
                            snippet: None,
                        }),
                        guards: vec![],
                        sanitizers: vec![],
                        state: Some(StateEvidence {
                            machine: sf.machine.into(),
                            subject: sf.subject.clone(),
                            from_state: sf.from_state.into(),
                            to_state: sf.to_state.into(),
                        }),
                        notes: vec![],
                        ..Default::default()
                    }),
                    rank_score: None,
                    rank_reason: None,
                    suppressed: false,
                    suppression: None,
                    rollup: None,
                    finding_id: String::new(),
                    alternative_finding_ids: Vec::new(),
                });
            }
            all_state_findings.extend(state_findings);
        } // end for body in bodies (state analysis)
        // Suppress cfg-resource-leak / cfg-auth-gap when state analysis
        // already covers the same line (state analysis is more precise).
        // `state_lines` holds 1-based line numbers, matching `Diag::line`.
        let state_lines: std::collections::HashSet = all_state_findings
            .iter()
            .map(|sf| byte_offset_to_point(&self.source.tree, sf.span.0).row + 1)
            .collect();
        if !all_state_findings.is_empty() {
            out.retain(|d| {
                !((d.id == "cfg-resource-leak" || d.id == "cfg-auth-gap")
                    && state_lines.contains(&d.line))
            });
        }
    }
    out
}

/// Run the authorization analyses for this file.
///
/// `auth_analysis::run_auth_analysis` itself is given only the syntax
/// tree, source bytes, language slug and path, but the receiver-type map
/// passed alongside comes from `collect_file_var_types`, which walks
/// `self.file_cfg.bodies`.
fn run_auth_analyses(
    &self,
    cfg: &Config,
    global_summaries: Option<&GlobalSummaries>,
    scan_root: Option<&Path>,
) -> Vec {
    // Harvest SSA-derived variable types across every body in the
    // file so `run_auth_analysis` can refine sink classification by
    // receiver type (e.g. `HttpClient::send` → `OutboundNetwork`,
    // `HashMap::new`-bound var → `InMemoryLocal`).
    let var_types = self.collect_file_var_types();
    auth_analysis::run_auth_analysis(
        &self.source.tree,
        self.source.bytes,
        self.source.lang_slug,
        self.source.path,
        cfg,
        var_types.as_ref(),
        global_summaries,
        scan_root,
    )
}

/// Build a per-file `var_name → TypeKind` map by running SSA + type
/// facts on each body and copying type facts for SSA values whose
/// definition recorded a source-level variable name. When the same
/// name resolves to different non-`Unknown` types across bodies the
/// entry is dropped — absence is safe because the auth analysis
/// sink gate simply falls back to its syntactic heuristics. Returns
/// `None` when no body produces any typed variable (non-Rust files
/// currently emit few `LocalCollection` / security-typed facts, but
/// this path is language-agnostic).
fn collect_file_var_types(
    &self,
) -> Option<HashMap<String, crate::ssa::type_facts::TypeKind>> {
    use crate::ssa::type_facts::TypeKind;

    let lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust);
    // `agreed` holds every name seen with exactly one concrete type so far;
    // `conflicted` remembers names evicted after a type clash so that a
    // later body cannot re-introduce them.
    let mut agreed: HashMap<String, TypeKind> = HashMap::new();
    let mut conflicted: HashSet<String> = HashSet::new();

    // Bodies for which no const facts can be built contribute nothing.
    let per_body_facts = self
        .file_cfg
        .bodies
        .iter()
        .filter_map(|body| cfg_analysis::build_body_const_facts(body, lang));

    for facts in per_body_facts {
        for (idx, def) in facts.ssa.value_defs.iter().enumerate() {
            // Only SSA values that map back to a named source variable matter.
            let Some(name) = def.var_name.as_ref() else {
                continue;
            };
            let Some(ty) = facts.type_facts.get_type(crate::ssa::SsaValue(idx as u32)) else {
                continue;
            };
            // `Unknown` carries no information; evicted names stay evicted.
            if matches!(ty, TypeKind::Unknown) || conflicted.contains(name) {
                continue;
            }
            match agreed.get(name).map(|seen| seen == ty) {
                // First sighting: record the type.
                None => {
                    agreed.insert(name.clone(), ty.clone());
                }
                // Same type again: nothing to do.
                Some(true) => {}
                // Clash: drop the entry and blacklist the name.
                Some(false) => {
                    agreed.remove(name);
                    conflicted.insert(name.clone());
                }
            }
        }
    }

    (!agreed.is_empty()).then_some(agreed)
}
}

// ─────────────────────────────────────────────────────────────────────────────
//  Pass 1: Extract function summaries (no taint analysis)
// ─────────────────────────────────────────────────────────────────────────────

/// Pass-1 entry point for callers that already hold the file contents.
///
/// Parses `bytes` as the file at `path` and exports its function summaries.
/// Yields an empty vector when `ParsedSource::try_new` produces no source
/// for the file; parse errors are propagated. Taking the bytes directly
/// spares the caller a second `fs::read`.
pub fn extract_summaries_from_bytes(
    bytes: &[u8],
    path: &Path,
    cfg: &Config,
) -> NyxResult<Vec<FuncSummary>> {
    let _span = tracing::debug_span!("extract_summaries", file = %path.display()).entered();
    match ParsedSource::try_new(bytes, path)? {
        Some(source) => {
            let parsed = ParsedFile::from_source(source, cfg);
            Ok(parsed.export_summaries())
        }
        None => Ok(vec![]),
    }
}

/// Like [`extract_summaries_from_bytes`] but forwards `scan_root` so Rust
/// summaries carry their crate-relative module path.
pub fn extract_summaries_from_bytes_with_root( bytes: &[u8], path: &Path, cfg: &Config, scan_root: Option<&Path>, ) -> NyxResult> { let _span = tracing::debug_span!("extract_summaries", file = %path.display()).entered(); let Some(source) = ParsedSource::try_new(bytes, path)? else { return Ok(vec![]); }; let parsed = ParsedFile::from_source(source, cfg); Ok(parsed.export_summaries_with_root(scan_root)) } /// Convenience wrapper that reads the file then delegates to /// [`extract_summaries_from_bytes`]. #[allow(dead_code)] // used by benchmarks and lib consumers pub fn extract_summaries_from_file(path: &Path, cfg: &Config) -> NyxResult> { let bytes = std::fs::read(path)?; extract_summaries_from_bytes(&bytes, path, cfg) } /// Build a CFG from a file and return the graph, entry node, function summaries, /// and language. /// /// Returns `None` for binary files or unsupported languages. /// Intended for benchmarks and isolated testing of state analysis. pub fn build_cfg_for_file(path: &Path, cfg: &Config) -> NyxResult> { let bytes = std::fs::read(path)?; let Some(source) = ParsedSource::try_new(&bytes, path)? else { return Ok(None); }; let lang = Lang::from_slug(source.lang_slug).unwrap_or(Lang::C); let parsed = ParsedFile::from_source(source, cfg); Ok(Some((parsed.file_cfg, lang))) } /// Extract both `FuncSummary` and `SsaFuncSummary` from pre-read bytes. /// /// This is the shared pass-1 pipeline for indexed scans: parses once, builds /// CFG once, and returns both summary types. Uses the same `ParsedFile` /// pipeline as `analyse_file_fused` — no divergent extraction path. 
pub fn extract_all_summaries_from_bytes( bytes: &[u8], path: &Path, cfg: &Config, scan_root: Option<&Path>, ) -> NyxResult<( Vec, Vec<(crate::symbol::FuncKey, SsaFuncSummary)>, Vec<( crate::symbol::FuncKey, crate::taint::ssa_transfer::CalleeSsaBody, )>, Vec<( crate::symbol::FuncKey, auth_analysis::model::AuthCheckSummary, )>, )> { let _span = tracing::debug_span!("extract_all_summaries", file = %path.display()).entered(); let Some(source) = ParsedSource::try_new(bytes, path)? else { return Ok((vec![], vec![], vec![], vec![])); }; let lang_slug = source.lang_slug; let parsed = ParsedFile::from_source(source, cfg); let func_summaries = parsed.export_summaries_with_root(scan_root); let (ssa_summaries, ssa_bodies) = parsed.extract_ssa_artifacts(None, scan_root); let auth_summaries = auth_analysis::extract_auth_summaries_by_key( &parsed.source.tree, parsed.source.bytes, lang_slug, parsed.source.path, cfg, scan_root, ); Ok((func_summaries, ssa_summaries, ssa_bodies, auth_summaries)) } // ───────────────────────────────────────────────────────────────────────────── // Constant-argument suppression helper // ───────────────────────────────────────────────────────────────────────────── /// Returns `true` when the captured call node has only literal arguments /// (string, number, boolean, null/nil/none). Used to suppress AST pattern /// findings on provably-constant calls like `os.system("echo health-ok")`. /// /// Conservative: returns `false` whenever the tree structure is unclear or /// any argument is non-literal (including interpolated strings). fn is_call_all_args_literal(node: tree_sitter::Node, bytes: &[u8]) -> bool { // Walk upwards from the captured node to find the closest call_expression // (or similar) ancestor, then locate its argument list child. let call_node = find_enclosing_call(node); let call_node = match call_node { Some(n) => n, None => return false, }; // Find the argument_list / arguments child of the call node. 
let arg_list = find_arg_list(call_node); let arg_list = match arg_list { Some(n) => n, None => return false, }; let mut has_any_arg = false; for i in 0..arg_list.named_child_count() as u32 { let child = match arg_list.named_child(i) { Some(c) => c, None => continue, }; has_any_arg = true; if !is_literal_node(child, bytes) { return false; } } // If the argument list is empty (no args), we conservatively do NOT // suppress — the danger may come from side effects, not arguments. has_any_arg } /// Walk up to find a call-expression-like ancestor of the captured node. /// Stops at statement/block boundaries to avoid matching unrelated outer calls. fn find_enclosing_call(mut node: tree_sitter::Node) -> Option { // The captured node may already be the call, or it could be the callee // identifier inside a call_expression. Walk up a few levels. for _ in 0..4 { let kind = node.kind(); if kind.contains("call") && !kind.contains("callee") { return Some(node); } // PHP: function_call_expression if kind == "function_call_expression" { return Some(node); } // Stop at scope/statement boundaries — don't cross into outer calls if kind.contains("block") || kind.contains("body") || kind == "program" || kind == "module" || kind == "expression_statement" { return None; } node = node.parent()?; } None } /// Find the argument-list child of a call node across languages. fn find_arg_list(call: tree_sitter::Node) -> Option { for i in 0..call.child_count() as u32 { if let Some(child) = call.child(i) { let kind = child.kind(); // Common argument list node kinds across languages: // Python/JS/TS/Java/Go/C/C++/Rust: argument_list / arguments // PHP: arguments // Ruby: argument_list if kind == "argument_list" || kind == "arguments" || kind == "actual_parameters" { return Some(child); } } } None } /// Check if a tree-sitter node represents a literal value. 
fn is_literal_node(node: tree_sitter::Node, bytes: &[u8]) -> bool { let kind = node.kind(); match kind { // String literals (most languages) "string" | "string_literal" | "interpreted_string_literal" | "raw_string_literal" | "string_content" | "string_fragment" => true, // Numeric literals "integer" | "integer_literal" | "int_literal" | "float" | "float_literal" | "number" => { true } // Boolean / null / nil / none "true" | "false" | "null" | "nil" | "none" | "null_literal" | "boolean" | "boolean_literal" => true, // PHP encapsed_string: safe only if it has no variable interpolation "encapsed_string" => { // If it contains `$` variable interpolation nodes, it's not literal !has_interpolation(node) } // Wrapper nodes: PHP wraps each arg in an `argument` node, // Go uses `argument` too. Unwrap and check the inner value. "argument" => { node.named_child_count() == 1 && node .named_child(0) .is_some_and(|c| is_literal_node(c, bytes)) } // Unary minus on a number literal: `-42` "unary_expression" | "unary_op" => { node.named_child_count() == 1 && node .named_child(0) .is_some_and(|c| is_literal_node(c, bytes)) } // String concatenation of literals: `"a" + "b"` or `"a" . "b"` "binary_expression" | "concatenated_string" => { node.named_child_count() >= 2 && (0..node.named_child_count() as u32).all(|i| { node.named_child(i) .is_some_and(|c| is_literal_node(c, bytes)) }) } _ => false, } } /// Check if a string node contains interpolation (e.g., PHP `"Hello $name"`). 
fn has_interpolation(node: tree_sitter::Node) -> bool { for i in 0..node.child_count() as u32 { if let Some(child) = node.child(i) { let kind = child.kind(); if kind == "variable_name" || kind == "simple_variable" || kind.contains("interpolation") { return true; } } } false } // ───────────────────────────────────────────────────────────────────────────── // Layer B: AST pattern suppression when taint confirms safety // ───────────────────────────────────────────────────────────────────────────── /// Map the second segment of a pattern ID (e.g. "cmdi" from "py.cmdi.os_system") /// to the `Cap` that taint analysis models. Returns `None` for categories taint /// cannot subsume (memory safety, crypto, etc.), so those patterns are never suppressed. fn pattern_category_cap(pattern_id: &str) -> Option { let category = pattern_id.split('.').nth(1)?; match category { "cmdi" => Some(Cap::SHELL_ESCAPE), "xss" => Some(Cap::HTML_ESCAPE), "sqli" => Some(Cap::SQL_QUERY), "code_exec" => Some(Cap::CODE_EXEC), "ssrf" => Some(Cap::SSRF), "path" => Some(Cap::FILE_IO), // deser/memory/crypto: taint cannot fully subsume these structural patterns _ => None, } } /// Suppression context built from CFG + taint results. Used to decide whether /// an AST pattern finding can be safely suppressed because taint analysis /// evaluated the data flow and found it safe. struct TaintSuppressionCtx { /// For each function scope, the set of lines containing Source-labeled nodes. source_lines_by_func: HashMap, HashSet>, /// For each sink node line, its enclosing function scope. sink_func_at_line: HashMap>, /// Lines where taint emitted a `taint-unsanitised-flow` finding. taint_finding_lines: HashSet, } impl TaintSuppressionCtx { /// Build suppression context from ALL per-body CFG graphs, tree (for /// byte→line mapping), and existing taint findings. /// /// Scans every body's graph (not just top-level) so that Source/Sink /// nodes inside function bodies are visible for suppression decisions. 
fn build(file_cfg: &FileCfg, tree: &tree_sitter::Tree, taint_diags: &[Diag]) -> Self { let mut source_lines_by_func: HashMap, HashSet> = HashMap::new(); let mut sink_func_at_line: HashMap> = HashMap::new(); for body in &file_cfg.bodies { for idx in body.graph.node_indices() { let info = &body.graph[idx]; let mut has_source = false; let mut has_sink = false; for label in &info.taint.labels { match label { DataLabel::Source(_) => has_source = true, DataLabel::Sink(_) => has_sink = true, _ => {} } } let byte = info.classification_span().0; let point = byte_offset_to_point(tree, byte); let line = point.row + 1; if has_source { source_lines_by_func .entry(info.ast.enclosing_func.clone()) .or_default() .insert(line); } if has_sink { sink_func_at_line.insert(line, info.ast.enclosing_func.clone()); } } } let taint_finding_lines: HashSet = taint_diags .iter() .filter(|d| d.id.starts_with("taint-unsanitised-flow")) .map(|d| d.line) .collect(); Self { source_lines_by_func, sink_func_at_line, taint_finding_lines, } } /// Returns `true` if this AST pattern finding should be suppressed. fn should_suppress(&self, pattern_id: &str, line: usize) -> bool { // Condition 1: pattern category maps to a Cap taint models if pattern_category_cap(pattern_id).is_none() { return false; } // Condition 2: at least one Source exists in the same function scope // at an EARLIER line (upstream in control flow). This prevents suppression // when the only Source is co-located (dual-label) or downstream from the // sink, since taint couldn't have evaluated a flow that doesn't exist. 
if let Some(func) = self.sink_func_at_line.get(&line) { match self.source_lines_by_func.get(func) { Some(source_lines) => { if !source_lines.iter().any(|&sl| sl < line) { return false; } } None => return false, } } else { // No CFG sink at this line — taint had no opportunity to evaluate return false; } // Condition 3: no taint finding at this line (taint found it safe) !self.taint_finding_lines.contains(&line) } } // ───────────────────────────────────────────────────────────────────────────── // Pass 2 / single‑file: Full rule execution (AST queries + taint) // ───────────────────────────────────────────────────────────────────────────── /// Run all enabled analyses on pre-read bytes and return diagnostics. /// /// This is the core **pass 2** implementation. Callers that already hold the /// file contents should use this variant to avoid a redundant `fs::read`. pub fn run_rules_on_bytes( bytes: &[u8], path: &Path, cfg: &Config, global_summaries: Option<&GlobalSummaries>, scan_root: Option<&Path>, ) -> NyxResult> { let _span = tracing::debug_span!("run_rules", file = %path.display()).entered(); maybe_inject_test_panic(path); let Some(source) = ParsedSource::try_new(bytes, path)? else { // Not a recognized tree-sitter language — try text-based patterns, // but first surface a parse-timeout synthetic diag if that's what // caused try_new to return None. let mut out = scan_text_based_patterns(bytes, path, cfg); if let Some(timeout_ms) = take_last_parse_timeout_ms() { out.push(parse_timeout_diag(path, timeout_ms)); } return Ok(out); }; let mut out = Vec::new(); // CFG construction + taint + cfg_analysis only needed for CFG-capable modes. 
let needs_cfg = matches!( cfg.scanner.mode, AnalysisMode::Full | AnalysisMode::Cfg | AnalysisMode::Taint ); if needs_cfg { let parsed = ParsedFile::from_source(source, cfg); out.extend(parsed.run_cfg_analyses(cfg, global_summaries, scan_root)); if cfg.scanner.mode == AnalysisMode::Full { // Layer B: suppress AST findings where taint confirmed safety let suppression = TaintSuppressionCtx::build(&parsed.file_cfg, &parsed.source.tree, &out); let ast_findings = parsed.source.run_ast_queries(cfg); out.extend( ast_findings .into_iter() .filter(|d| !suppression.should_suppress(&d.id, d.line)), ); } if cfg.scanner.mode == AnalysisMode::Full { out.extend(parsed.run_auth_analyses(cfg, global_summaries, scan_root)); } parsed.source.finalize_diags(&mut out, cfg); } else { // AST-only: no CFG construction (fast path preserved) out.extend(source.run_ast_queries(cfg)); let parsed = ParsedFile::from_source(source, cfg); out.extend(parsed.run_auth_analyses(cfg, global_summaries, scan_root)); parsed.source.finalize_diags(&mut out, cfg); } Ok(out) } /// Convenience wrapper that reads the file then delegates to /// [`run_rules_on_bytes`]. pub fn run_rules_on_file( path: &Path, cfg: &Config, global_summaries: Option<&GlobalSummaries>, scan_root: Option<&Path>, ) -> NyxResult> { let bytes = std::fs::read(path)?; run_rules_on_bytes(&bytes, path, cfg, global_summaries, scan_root) } // ───────────────────────────────────────────────────────────────────────────── // Fused single-pass: extract summaries + run full analysis in one parse/CFG // ───────────────────────────────────────────────────────────────────────────── /// Result of a fused analysis pass: both function summaries and diagnostics. pub struct FusedResult { pub summaries: Vec, pub diags: Vec, /// SSA-derived per-parameter summaries keyed by canonical /// [`crate::symbol::FuncKey`]. Keys preserve `(lang, namespace, /// container, name, arity, disambig, kind)` so two same-name definitions /// in the same file never collide. 
pub ssa_summaries: Vec<(crate::symbol::FuncKey, SsaFuncSummary)>,
/// Node count of the graph returned by `ParsedFile::cfg_graph`; `0` when
/// the file produced no parsed source.
pub cfg_nodes: usize,
/// Eligible callee bodies for cross-file symex, keyed by
/// canonical [`crate::symbol::FuncKey`] (same identity model as
/// `ssa_summaries`).
pub ssa_bodies: Vec<(
    crate::symbol::FuncKey,
    crate::taint::ssa_transfer::CalleeSsaBody,
)>,
/// Per-function auth-check summaries for cross-file helper
/// lifting.  One entry per analysis unit whose body proves at
/// least one positional parameter under an ownership / membership
/// / admin / authorization check; empty for files with no such
/// helpers.
pub auth_summaries: Vec<(
    crate::symbol::FuncKey,
    auth_analysis::model::AuthCheckSummary,
)>,
}

/// Parse the file once, build the CFG once, and produce both function
/// summaries (for cross-file resolution) and full diagnostics (AST analyses +
/// taint + CFG structural analyses).
///
/// When `global_summaries` is `None`, the taint engine runs with local
/// context only (equivalent to pass 1 + partial pass 2).  A second call
/// to [`run_taint_only`] can refine findings with the full cross-file view
/// without re-parsing or re-building the CFG.
///
/// What runs depends on `cfg.scanner.mode`:
/// * `Full` / `Cfg` / `Taint` — CFG analyses, followed by SSA artifact
///   extraction;
/// * `Full` / `Ast` — AST queries and auth analyses (in `Full`, AST findings
///   first pass through [`TaintSuppressionCtx`]);
/// * `Full` only — per-function auth-check summaries.
///
/// Function summaries and the CFG node count are produced in every mode.
/// Errors from `ParsedSource::try_new` are propagated.
pub fn analyse_file_fused(
    bytes: &[u8],
    path: &Path,
    cfg: &Config,
    global_summaries: Option<&GlobalSummaries>,
    scan_root: Option<&Path>,
) -> NyxResult {
    let _span = tracing::debug_span!("analyse_fused", file = %path.display()).entered();
    maybe_inject_test_panic(path);
    let Some(source) = ParsedSource::try_new(bytes, path)? else {
        // Not a recognized tree-sitter language — try text-based patterns,
        // and surface a parse-timeout synthetic diag if that's what caused
        // try_new to return None.
        let mut diags = scan_text_based_patterns(bytes, path, cfg);
        if let Some(timeout_ms) = take_last_parse_timeout_ms() {
            diags.push(parse_timeout_diag(path, timeout_ms));
        }
        // Only diagnostics can be reported for an unparsed file; every
        // summary list stays empty.
        return Ok(FusedResult {
            summaries: vec![],
            diags,
            ssa_summaries: vec![],
            cfg_nodes: 0,
            ssa_bodies: vec![],
            auth_summaries: vec![],
        });
    };
    let parsed = ParsedFile::from_source(source, cfg);
    let cfg_nodes = parsed.cfg_graph().node_count();
    let summaries = parsed.export_summaries_with_root(scan_root);
    let mut out = Vec::new();
    let needs_cfg = matches!(
        cfg.scanner.mode,
        AnalysisMode::Full | AnalysisMode::Cfg | AnalysisMode::Taint
    );
    // The CFG analyses run before the SSA artifacts are extracted; both
    // receive the same `global_summaries`.
    let (ssa_summaries, ssa_bodies) = if needs_cfg {
        out.extend(parsed.run_cfg_analyses(cfg, global_summaries, scan_root));
        parsed.extract_ssa_artifacts(global_summaries, scan_root)
    } else {
        (vec![], vec![])
    };
    if cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Ast {
        let ast_findings = parsed.source.run_ast_queries(cfg);
        // Layer B only applies when taint had the opportunity to evaluate
        // (`Full` already implies `needs_cfg`, so the mode test decides).
        if needs_cfg && cfg.scanner.mode == AnalysisMode::Full {
            // At this point `out` holds only the CFG-pass diagnostics.
            let suppression =
                TaintSuppressionCtx::build(&parsed.file_cfg, &parsed.source.tree, &out);
            out.extend(
                ast_findings
                    .into_iter()
                    .filter(|d| !suppression.should_suppress(&d.id, d.line)),
            );
        } else {
            out.extend(ast_findings);
        }
        out.extend(parsed.run_auth_analyses(cfg, global_summaries, scan_root));
    }
    // Finalisation runs in every mode, over whatever was collected above.
    parsed.source.finalize_diags(&mut out, cfg);
    let auth_summaries = if cfg.scanner.mode == AnalysisMode::Full {
        auth_analysis::extract_auth_summaries_by_key(
            &parsed.source.tree,
            parsed.source.bytes,
            parsed.source.lang_slug,
            parsed.source.path,
            cfg,
            scan_root,
        )
    } else {
        Vec::new()
    };
    Ok(FusedResult {
        summaries,
        diags: out,
        ssa_summaries,
        cfg_nodes,
        ssa_bodies,
        auth_summaries,
    })
}

// ─────────────────────────────────────────────────────────────────────────────
//  Text-based pattern scanning (non-tree-sitter files)
// ─────────────────────────────────────────────────────────────────────────────

/// Run text-based
/// pattern scanners on files whose extension is not supported
/// by tree-sitter.  Currently handles `.ejs` templates.
///
/// Dispatches on the lower-cased extension of `path`; every extension other
/// than `ejs` (and a missing extension) yields an empty vector. EJS findings
/// are filtered against `cfg.scanner.min_severity` before being returned.
fn scan_text_based_patterns(bytes: &[u8], path: &Path, cfg: &Config) -> Vec {
    let ext = lowercase_ext(path);
    match ext {
        Some("ejs") => {
            let mut diags = crate::patterns::ejs::scan_ejs_file(path, bytes);
            // Respect severity filter
            // NOTE(review): `<=` keeps a finding when its severity compares
            // less-or-equal to the threshold — presumably more severe values
            // order lower; confirm against the `Severity` definition.
            diags.retain(|d| d.severity <= cfg.scanner.min_severity);
            diags
        }
        _ => vec![],
    }
}

/// A file with an extension no scanner handles produces no diagnostics and
/// no error.
#[test]
fn unknown_extension_returns_empty() {
    let dir = tempfile::tempdir().unwrap();
    let txt = dir.path().join("notes.txt");
    std::fs::write(&txt, "just some text").unwrap();

    let diags = run_rules_on_file(&txt, &Config::default(), None, None)
        .expect("function should never error on plain text");
    assert!(diags.is_empty());
}

/// A 2 KiB all-zero `.bin` file produces no diagnostics.
#[test]
fn binary_file_guard_triggers() {
    let dir = tempfile::tempdir().unwrap();
    let bin = dir.path().join("junk.bin");
    let mut data = vec![0_u8; 2048];
    // NOTE(review): the buffer is already zero-filled, so this loop does not
    // change its contents.
    for i in (0..data.len()).step_by(3) {
        data[i] = 0;
    }
    std::fs::write(&bin, &data).unwrap();

    let diags = run_rules_on_file(&bin, &Config::default(), None, None).unwrap();
    assert!(diags.is_empty(), "binary files are skipped");
}

#[test]
fn nonprod_path_detection() {
    // Test that is_nonprod_path recognises common non-production paths
    assert!(is_nonprod_path(Path::new("project/tests/test_main.py")));
    assert!(is_nonprod_path(Path::new("src/__tests__/foo.js")));
    assert!(is_nonprod_path(Path::new("benches/bench.rs")));
    assert!(is_nonprod_path(Path::new("vendor/lib/foo.py")));
    assert!(is_nonprod_path(Path::new("src/build.rs")));
    assert!(is_nonprod_path(Path::new("dist/app.min.js")));
    assert!(is_nonprod_path(Path::new("examples/demo.py")));
    assert!(is_nonprod_path(Path::new("fixtures/data.json")));

    // Should NOT match production paths
    assert!(!is_nonprod_path(Path::new("src/main.rs")));
    assert!(!is_nonprod_path(Path::new("lib/handler.py")));
    assert!(!is_nonprod_path(Path::new("app/views.py")));
}

/// Each severity drops by one step; `Low` is the floor.
#[test]
fn severity_downgrade_works() {
    assert_eq!(downgrade_severity(Severity::High), Severity::Medium);
    assert_eq!(downgrade_severity(Severity::Medium), Severity::Low);
    assert_eq!(downgrade_severity(Severity::Low), Severity::Low);
}

#[test]
fn nonprod_path_downgrades_findings() {
    let dir = tempfile::tempdir().unwrap();
    // Create a file under a "tests" directory
    let test_dir = dir.path().join("tests");
    std::fs::create_dir_all(&test_dir).unwrap();
    let test_file = test_dir.join("test_cmd.py");
    std::fs::write(
        &test_file,
        b"import os\ndef test():\n cmd = os.environ['X']\n os.system(cmd)\n",
    )
    .unwrap();

    let default_cfg = Config::default();
    let diags = run_rules_on_file(&test_file, &default_cfg, None, None).unwrap();
    // All findings in tests/ should be downgraded (no HIGH)
    let high: Vec<_> = diags
        .iter()
        .filter(|d| d.severity == Severity::High)
        .collect();
    assert!(
        high.is_empty(),
        "Findings in tests/ should be downgraded from HIGH; got {:?}",
        high
    );

    // With include_nonprod=true, original severity preserved
    let mut prod_cfg = Config::default();
    prod_cfg.scanner.include_nonprod = true;
    let diags_prod = run_rules_on_file(&test_file, &prod_cfg, None, None).unwrap();
    // Not all diagnostics are necessarily high, but include_nonprod should not downgrade
    // Just verify that if there are findings, they weren't downgraded by the nonprod logic
    // NOTE(review): nothing is asserted on `diags_prod`; this half of the
    // test only checks that the scan completes without error.
    let _ = diags_prod;
}

#[test]
fn constant_arg_suppression_works() {
    use tree_sitter::StreamingIterator;

    // PHP: system("echo health-ok") should be suppressed
    {
        let mut parser = tree_sitter::Parser::new();
        let lang = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP);
        parser.set_language(&lang).unwrap();
        let code = b"