use crate::cfg::{build_cfg, export_summaries}; use crate::cfg_analysis; use crate::commands::scan::Diag; use crate::errors::{NyxError, NyxResult}; use crate::labels::{build_lang_rules, severity_for_source_kind}; use crate::patterns::Severity; use crate::summary::{FuncSummary, GlobalSummaries}; use crate::symbol::{Lang, normalize_namespace}; use crate::taint::analyse_file; use crate::utils::config::AnalysisMode; use crate::utils::ext::lowercase_ext; use crate::utils::{Config, query_cache}; use std::cell::RefCell; use std::path::Path; use tree_sitter::{Language, QueryCursor, StreamingIterator}; thread_local! { static PARSER: RefCell = RefCell::new(tree_sitter::Parser::new()); } /// Convenience alias for node indices. fn byte_offset_to_point(tree: &tree_sitter::Tree, byte: usize) -> tree_sitter::Point { tree.root_node() .descendant_for_byte_range(byte, byte) .map(|n| n.start_position()) .unwrap_or_else(|| tree_sitter::Point { row: 0, column: 0 }) } /// Resolve a file extension to a (tree‑sitter Language, slug) pair. fn lang_for_path(path: &Path) -> Option<(Language, &'static str)> { match lowercase_ext(path) { Some("rs") => Some((Language::from(tree_sitter_rust::LANGUAGE), "rust")), Some("c") => Some((Language::from(tree_sitter_c::LANGUAGE), "c")), Some("cpp") => Some((Language::from(tree_sitter_cpp::LANGUAGE), "cpp")), Some("java") => Some((Language::from(tree_sitter_java::LANGUAGE), "java")), Some("go") => Some((Language::from(tree_sitter_go::LANGUAGE), "go")), Some("php") => Some((Language::from(tree_sitter_php::LANGUAGE_PHP), "php")), Some("py") => Some((Language::from(tree_sitter_python::LANGUAGE), "python")), Some("ts") => Some(( Language::from(tree_sitter_typescript::LANGUAGE_TYPESCRIPT), "typescript", )), Some("js") => Some(( Language::from(tree_sitter_javascript::LANGUAGE), "javascript", )), Some("rb") => Some((Language::from(tree_sitter_ruby::LANGUAGE), "ruby")), _ => None, } } /// Fast binary-file guard: skip if >1% NUL bytes. fn is_binary(bytes: &[u8]) -> bool { bytes.iter().filter(|b| **b == 0).count() * 100 / bytes.len().max(1) > 1 } /// Check if a file path belongs to a non-production context (tests, vendor, /// benchmarks, etc.). Used to downgrade severity for findings in paths that /// are unlikely to represent attack surface. fn is_nonprod_path(path: &Path) -> bool { static NONPROD_DIRS: &[&str] = &[ "tests", "test", "__tests__", "benches", "benchmarks", "examples", "build", "scripts", "docs", "js_tests", "fixtures", "vendor", ]; static NONPROD_FILES: &[&str] = &["build.rs"]; if let Some(name) = path.file_name().and_then(|n| n.to_str()) && (NONPROD_FILES.contains(&name) || name.ends_with(".min.js")) { return true; } for component in path.components() { if let std::path::Component::Normal(c) = component && let Some(s) = c.to_str() && NONPROD_DIRS.contains(&s) { return true; } } false } /// Downgrade severity by one tier: High→Medium, Medium→Low, Low→Low. fn downgrade_severity(s: Severity) -> Severity { match s { Severity::High => Severity::Medium, Severity::Medium => Severity::Low, Severity::Low => Severity::Low, } } // ───────────────────────────────────────────────────────────────────────────── // Pass 1: Extract function summaries (no taint analysis) // ───────────────────────────────────────────────────────────────────────────── /// Extract function summaries from pre-read bytes. /// /// This is the core **pass 1** implementation. Callers that already hold the /// file contents should use this variant to avoid a redundant `fs::read`. pub fn extract_summaries_from_bytes( bytes: &[u8], path: &Path, _cfg: &Config, ) -> NyxResult> { let _span = tracing::debug_span!("extract_summaries", file = %path.display()).entered(); if is_binary(bytes) { return Ok(vec![]); } let Some((ts_lang, lang_slug)) = lang_for_path(path) else { return Ok(vec![]); }; let tree = PARSER.with(|cell| { let mut parser = cell.borrow_mut(); parser.set_language(&ts_lang)?; parser .parse(bytes, None) .ok_or_else(|| NyxError::Other("tree-sitter failed".into())) })?; let file_path_str = path.to_string_lossy(); let lang_rules = build_lang_rules(_cfg, lang_slug); let rules_ref = if lang_rules.extra_labels.is_empty() && lang_rules.terminators.is_empty() && lang_rules.event_handlers.is_empty() { None } else { Some(&lang_rules) }; let (_cfg_graph, _entry, local_summaries) = build_cfg(&tree, bytes, lang_slug, &file_path_str, rules_ref); Ok(export_summaries( &local_summaries, &file_path_str, lang_slug, )) } /// Convenience wrapper that reads the file then delegates to /// [`extract_summaries_from_bytes`]. #[allow(dead_code)] // used by benchmarks and lib consumers pub fn extract_summaries_from_file(path: &Path, cfg: &Config) -> NyxResult> { let bytes = std::fs::read(path)?; extract_summaries_from_bytes(&bytes, path, cfg) } // ───────────────────────────────────────────────────────────────────────────── // Pass 2 / single‑file: Full rule execution (AST queries + taint) // ───────────────────────────────────────────────────────────────────────────── /// Run all enabled analyses on pre-read bytes and return diagnostics. /// /// This is the core **pass 2** implementation. Callers that already hold the /// file contents should use this variant to avoid a redundant `fs::read`. pub fn run_rules_on_bytes( bytes: &[u8], path: &Path, cfg: &Config, global_summaries: Option<&GlobalSummaries>, scan_root: Option<&Path>, ) -> NyxResult> { let _span = tracing::debug_span!("run_rules", file = %path.display()).entered(); if is_binary(bytes) { return Ok(vec![]); } let Some((ts_lang, lang_slug)) = lang_for_path(path) else { return Ok(vec![]); }; let _tree = PARSER.with(|cell| { let mut parser = cell.borrow_mut(); parser.set_language(&ts_lang)?; parser .parse(bytes, None) .ok_or_else(|| NyxError::Other("tree-sitter failed".into())) })?; let mut out = Vec::new(); let file_path_str = path.to_string_lossy(); // CFG construction + taint + cfg_analysis only needed for Full/Taint modes. let needs_cfg = cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Taint; if needs_cfg { // Build CFG — needed for both taint analysis and CFG structural analyses. let lang_rules = build_lang_rules(cfg, lang_slug); let rules_ref = if lang_rules.extra_labels.is_empty() && lang_rules.terminators.is_empty() && lang_rules.event_handlers.is_empty() { None } else { Some(&lang_rules) }; let (cfg_graph, entry, summaries) = build_cfg(&_tree, bytes, lang_slug, &file_path_str, rules_ref); let caller_lang = Lang::from_slug(lang_slug).unwrap_or(Lang::Rust); // ── Taint analysis ────────────────────────────────────────────── tracing::debug!("Running taint analysis on: {}", path.display()); tracing::debug!("Func summaries: {:?}", summaries); let scan_root_str = scan_root.map(|p| p.to_string_lossy()); let namespace = normalize_namespace(&file_path_str, scan_root_str.as_deref()); let taint_results = analyse_file( &cfg_graph, entry, &summaries, global_summaries, caller_lang, &namespace, &[], ); for finding in &taint_results { // Report the SINK location — where the vulnerability manifests. let sink_byte = cfg_graph[finding.sink].span.0; let sink_point = byte_offset_to_point(&_tree, sink_byte); // Include source location in the ID so distinct flows through // the same sink (or different sinks at the same line) don't // get collapsed by dedup. let source_byte = cfg_graph[finding.source].span.0; let source_point = byte_offset_to_point(&_tree, source_byte); out.push(Diag { path: path.to_string_lossy().into_owned(), line: sink_point.row + 1, col: sink_point.column + 1, severity: severity_for_source_kind(finding.source_kind), id: format!( "taint-unsanitised-flow (source {}:{})", source_point.row + 1, source_point.column + 1 ), }); } // ── CFG structural analyses ───────────────────────────────────── let taint_active = global_summaries.is_some() || !taint_results.is_empty(); let cfg_ctx = cfg_analysis::AnalysisContext { cfg: &cfg_graph, entry, lang: caller_lang, file_path: &file_path_str, source_bytes: bytes, func_summaries: &summaries, global_summaries, taint_findings: &taint_results, analysis_rules: rules_ref, taint_active, }; for cf in cfg_analysis::run_all(&cfg_ctx) { let point = byte_offset_to_point(&_tree, cf.span.0); out.push(Diag { path: path.to_string_lossy().into_owned(), line: point.row + 1, col: point.column + 1, severity: cf.severity, id: cf.rule_id, }); } } if cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Ast { let root = _tree.root_node(); let compiled = query_cache::for_lang(lang_slug, ts_lang); let mut cursor = QueryCursor::new(); for cq in compiled.iter() { if cfg.scanner.min_severity <= cq.meta.severity { continue; } let mut matches = cursor.matches(&cq.query, root, bytes); while let Some(m) = matches.next() { if let Some(cap) = m.captures.iter().find(|c| c.index == 0) { let point = cap.node.start_position(); out.push(Diag { path: path.to_string_lossy().into_owned(), line: point.row + 1, col: point.column + 1, severity: cq.meta.severity, id: cq.meta.id.to_owned(), }); } } } } // Check to ensure no duplicates out.sort_by(|a, b| (a.line, a.col, &a.id, a.severity).cmp(&(b.line, b.col, &b.id, b.severity))); out.dedup_by(|a, b| { a.line == b.line && a.col == b.col && a.id == b.id && a.severity == b.severity }); // Downgrade severity for non-production paths unless opted out if !cfg.scanner.include_nonprod && is_nonprod_path(path) { for d in &mut out { d.severity = downgrade_severity(d.severity); } } Ok(out) } /// Convenience wrapper that reads the file then delegates to /// [`run_rules_on_bytes`]. pub fn run_rules_on_file( path: &Path, cfg: &Config, global_summaries: Option<&GlobalSummaries>, scan_root: Option<&Path>, ) -> NyxResult> { let bytes = std::fs::read(path)?; run_rules_on_bytes(&bytes, path, cfg, global_summaries, scan_root) } // ───────────────────────────────────────────────────────────────────────────── // Fused single-pass: extract summaries + run full analysis in one parse/CFG // ───────────────────────────────────────────────────────────────────────────── /// Result of a fused analysis pass: both function summaries and diagnostics. pub struct FusedResult { pub summaries: Vec, pub diags: Vec, } /// Parse the file once, build the CFG once, and produce both function /// summaries (for cross-file resolution) and full diagnostics (AST queries + /// taint + CFG structural analyses). /// /// When `global_summaries` is `None`, the taint engine runs with local /// context only (equivalent to pass 1 + partial pass 2). A second call /// to [`run_taint_only`] can refine findings with the full cross-file view /// without re-parsing or re-building the CFG. pub fn analyse_file_fused( bytes: &[u8], path: &Path, cfg: &Config, global_summaries: Option<&GlobalSummaries>, scan_root: Option<&Path>, ) -> NyxResult { let _span = tracing::debug_span!("analyse_fused", file = %path.display()).entered(); if is_binary(bytes) { return Ok(FusedResult { summaries: vec![], diags: vec![], }); } let Some((ts_lang, lang_slug)) = lang_for_path(path) else { return Ok(FusedResult { summaries: vec![], diags: vec![], }); }; let tree = PARSER.with(|cell| { let mut parser = cell.borrow_mut(); parser.set_language(&ts_lang)?; parser .parse(bytes, None) .ok_or_else(|| NyxError::Other("tree-sitter failed".into())) })?; let file_path_str = path.to_string_lossy(); // Build language-specific analysis rules once let lang_rules = build_lang_rules(cfg, lang_slug); let rules_ref = if lang_rules.extra_labels.is_empty() && lang_rules.terminators.is_empty() && lang_rules.event_handlers.is_empty() { None } else { Some(&lang_rules) }; // Build CFG once — used for both summary extraction AND analysis let (cfg_graph, entry, local_summaries) = build_cfg(&tree, bytes, lang_slug, &file_path_str, rules_ref); // Export summaries (always — needed for cross-file merging) let summaries = export_summaries(&local_summaries, &file_path_str, lang_slug); let mut out = Vec::new(); // Taint + CFG structural analyses let needs_cfg = cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Taint; if needs_cfg { let caller_lang = Lang::from_slug(lang_slug).unwrap_or(Lang::Rust); let scan_root_str = scan_root.map(|p| p.to_string_lossy()); let namespace = normalize_namespace(&file_path_str, scan_root_str.as_deref()); let taint_results = analyse_file( &cfg_graph, entry, &local_summaries, global_summaries, caller_lang, &namespace, &[], ); for finding in &taint_results { let sink_byte = cfg_graph[finding.sink].span.0; let sink_point = byte_offset_to_point(&tree, sink_byte); let source_byte = cfg_graph[finding.source].span.0; let source_point = byte_offset_to_point(&tree, source_byte); out.push(Diag { path: path.to_string_lossy().into_owned(), line: sink_point.row + 1, col: sink_point.column + 1, severity: severity_for_source_kind(finding.source_kind), id: format!( "taint-unsanitised-flow (source {}:{})", source_point.row + 1, source_point.column + 1 ), }); } let taint_active = global_summaries.is_some() || !taint_results.is_empty(); let cfg_ctx = cfg_analysis::AnalysisContext { cfg: &cfg_graph, entry, lang: caller_lang, file_path: &file_path_str, source_bytes: bytes, func_summaries: &local_summaries, global_summaries, taint_findings: &taint_results, analysis_rules: rules_ref, taint_active, }; for cf in cfg_analysis::run_all(&cfg_ctx) { let point = byte_offset_to_point(&tree, cf.span.0); out.push(Diag { path: path.to_string_lossy().into_owned(), line: point.row + 1, col: point.column + 1, severity: cf.severity, id: cf.rule_id, }); } } // AST pattern queries if cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Ast { let root = tree.root_node(); let compiled = query_cache::for_lang(lang_slug, ts_lang); let mut cursor = QueryCursor::new(); for cq in compiled.iter() { if cfg.scanner.min_severity <= cq.meta.severity { continue; } let mut matches = cursor.matches(&cq.query, root, bytes); while let Some(m) = matches.next() { if let Some(cap) = m.captures.iter().find(|c| c.index == 0) { let point = cap.node.start_position(); out.push(Diag { path: path.to_string_lossy().into_owned(), line: point.row + 1, col: point.column + 1, severity: cq.meta.severity, id: cq.meta.id.to_owned(), }); } } } } // Dedup out.sort_by(|a, b| (a.line, a.col, &a.id, a.severity).cmp(&(b.line, b.col, &b.id, b.severity))); out.dedup_by(|a, b| { a.line == b.line && a.col == b.col && a.id == b.id && a.severity == b.severity }); // Downgrade severity for non-production paths unless opted out if !cfg.scanner.include_nonprod && is_nonprod_path(path) { for d in &mut out { d.severity = downgrade_severity(d.severity); } } Ok(FusedResult { summaries, diags: out, }) } #[test] fn unknown_extension_returns_empty() { let dir = tempfile::tempdir().unwrap(); let txt = dir.path().join("notes.txt"); std::fs::write(&txt, "just some text").unwrap(); let diags = run_rules_on_file(&txt, &Config::default(), None, None) .expect("function should never error on plain text"); assert!(diags.is_empty()); } #[test] fn binary_file_guard_triggers() { let dir = tempfile::tempdir().unwrap(); let bin = dir.path().join("junk.bin"); let mut data = vec![0_u8; 2048]; for i in (0..data.len()).step_by(3) { data[i] = 0; } std::fs::write(&bin, &data).unwrap(); let diags = run_rules_on_file(&bin, &Config::default(), None, None).unwrap(); assert!(diags.is_empty(), "binary files are skipped"); } #[test] fn nonprod_path_detection() { // Test that is_nonprod_path recognises common non-production paths assert!(is_nonprod_path(Path::new("project/tests/test_main.py"))); assert!(is_nonprod_path(Path::new("src/__tests__/foo.js"))); assert!(is_nonprod_path(Path::new("benches/bench.rs"))); assert!(is_nonprod_path(Path::new("vendor/lib/foo.py"))); assert!(is_nonprod_path(Path::new("src/build.rs"))); assert!(is_nonprod_path(Path::new("dist/app.min.js"))); assert!(is_nonprod_path(Path::new("examples/demo.py"))); assert!(is_nonprod_path(Path::new("fixtures/data.json"))); // Should NOT match production paths assert!(!is_nonprod_path(Path::new("src/main.rs"))); assert!(!is_nonprod_path(Path::new("lib/handler.py"))); assert!(!is_nonprod_path(Path::new("app/views.py"))); } #[test] fn severity_downgrade_works() { assert_eq!(downgrade_severity(Severity::High), Severity::Medium); assert_eq!(downgrade_severity(Severity::Medium), Severity::Low); assert_eq!(downgrade_severity(Severity::Low), Severity::Low); } #[test] fn nonprod_path_downgrades_findings() { let dir = tempfile::tempdir().unwrap(); // Create a file under a "tests" directory let test_dir = dir.path().join("tests"); std::fs::create_dir_all(&test_dir).unwrap(); let test_file = test_dir.join("test_cmd.py"); std::fs::write( &test_file, b"import os\ndef test():\n cmd = os.environ['X']\n os.system(cmd)\n", ) .unwrap(); let default_cfg = Config::default(); let diags = run_rules_on_file(&test_file, &default_cfg, None, None).unwrap(); // All findings in tests/ should be downgraded (no HIGH) let high: Vec<_> = diags .iter() .filter(|d| d.severity == Severity::High) .collect(); assert!( high.is_empty(), "Findings in tests/ should be downgraded from HIGH; got {:?}", high ); // With include_nonprod=true, original severity preserved let mut prod_cfg = Config::default(); prod_cfg.scanner.include_nonprod = true; let diags_prod = run_rules_on_file(&test_file, &prod_cfg, None, None).unwrap(); // Not all diagnostics are necessarily high, but include_nonprod should not downgrade // Just verify that if there are findings, they weren't downgraded by the nonprod logic let _ = diags_prod; }