mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-09 19:45:13 +02:00
* chore: Exclude CLAUDE.md from Cargo.toml * feat: Add configurable analysis rules and CLI commands for custom sanitizers and terminators * feat: Enhance resource management and analysis efficiency - Implemented parallel summary merging in `scan_filesystem` using rayon for improved performance. - Introduced `GlobalSummaries::merge()` for efficient merging of summaries. - Optimized file reading and hashing to eliminate redundant I/O operations. - Added `should_scan_with_hash()` and `upsert_file_with_hash()` methods to streamline file processing. - Enhanced taint analysis with in-place mutations to reduce memory allocations. - Updated resource acquisition patterns to exclude false positives for `freopen` and wrapper functions. * feat: Implement severity downgrade for findings in non-production paths and add source kind inference * feat: Update versioning information in SECURITY.md for new stable line * feat: Update categories in Cargo.toml to include parser-implementations and text-processing * feat: Update dependencies in Cargo.lock for improved compatibility and performance * feat: Update dependencies in Cargo.lock and Cargo.toml for improved compatibility
494 lines
18 KiB
Rust
494 lines
18 KiB
Rust
pub(crate) use crate::ast::{
|
||
analyse_file_fused, extract_summaries_from_bytes, run_rules_on_bytes, run_rules_on_file,
|
||
};
|
||
use crate::database::index::{Indexer, IssueRow};
|
||
use crate::errors::NyxResult;
|
||
use crate::patterns::Severity;
|
||
use crate::summary::{self, GlobalSummaries};
|
||
use crate::utils::config::Config;
|
||
use crate::utils::project::get_project_info;
|
||
use crate::walk::spawn_file_walker;
|
||
use console::style;
|
||
use dashmap::DashMap;
|
||
use indicatif::{ProgressBar, ProgressStyle};
|
||
use r2d2::Pool;
|
||
use r2d2_sqlite::SqliteConnectionManager;
|
||
use rayon::prelude::*;
|
||
use std::collections::BTreeMap;
|
||
use std::path::{Path, PathBuf};
|
||
use std::sync::Arc;
|
||
|
||
fn make_progress_bar(len: u64, msg: &str, show: bool) -> ProgressBar {
|
||
if !show {
|
||
return ProgressBar::hidden();
|
||
}
|
||
let pb = ProgressBar::new(len);
|
||
pb.set_style(
|
||
ProgressStyle::with_template(
|
||
"{spinner:.green} {msg} [{bar:30.cyan/blue}] {pos}/{len} ({eta})",
|
||
)
|
||
.unwrap()
|
||
.progress_chars("##-"),
|
||
);
|
||
pb.set_message(msg.to_string());
|
||
pb
|
||
}
|
||
|
||
#[derive(Debug, Clone, serde::Serialize)]
|
||
pub struct Diag {
|
||
pub path: String,
|
||
pub line: usize,
|
||
pub col: usize,
|
||
pub severity: Severity,
|
||
pub id: String,
|
||
}
|
||
|
||
/// Entry point called by the CLI.
|
||
pub fn handle(
|
||
path: &str,
|
||
no_index: bool,
|
||
rebuild_index: bool,
|
||
format: String,
|
||
database_dir: &Path,
|
||
config: &Config,
|
||
) -> NyxResult<()> {
|
||
let scan_path = Path::new(path).canonicalize()?;
|
||
let (project_name, db_path) = get_project_info(&scan_path, database_dir)?;
|
||
|
||
let suppress_status = config.output.quiet || format == "json" || format == "sarif";
|
||
if !suppress_status {
|
||
println!(
|
||
"{} {}...\n",
|
||
style("Checking").green().bold(),
|
||
&project_name
|
||
);
|
||
}
|
||
|
||
let show_progress = format != "json" && format != "sarif" && !config.output.quiet;
|
||
|
||
let diags: Vec<Diag> = if no_index {
|
||
scan_filesystem(&scan_path, config, show_progress)?
|
||
} else {
|
||
if rebuild_index || !db_path.exists() {
|
||
tracing::debug!("Scanning filesystem index filesystem");
|
||
crate::commands::index::build_index(
|
||
&project_name,
|
||
&scan_path,
|
||
&db_path,
|
||
config,
|
||
show_progress,
|
||
)?;
|
||
}
|
||
|
||
let pool = Indexer::init(&db_path)?;
|
||
if config.database.vacuum_on_startup {
|
||
let idx = Indexer::from_pool(&project_name, &pool)?;
|
||
idx.vacuum()?;
|
||
}
|
||
scan_with_index_parallel(&project_name, pool, config, show_progress)?
|
||
};
|
||
|
||
tracing::debug!("Found {:?} issues.", diags.len());
|
||
|
||
if format == "json" {
|
||
let json = serde_json::to_string(&diags)
|
||
.map_err(|e| crate::errors::NyxError::Msg(e.to_string()))?;
|
||
println!("{json}");
|
||
return Ok(());
|
||
}
|
||
|
||
if format == "sarif" {
|
||
let sarif = crate::output::build_sarif(&diags, &scan_path);
|
||
let json = serde_json::to_string_pretty(&sarif)
|
||
.map_err(|e| crate::errors::NyxError::Msg(e.to_string()))?;
|
||
println!("{json}");
|
||
return Ok(());
|
||
}
|
||
|
||
if format == "console" || (format.is_empty() && config.output.default_format == "console") {
|
||
tracing::debug!("Printing to console");
|
||
let mut grouped: BTreeMap<&str, Vec<&Diag>> = BTreeMap::new();
|
||
for d in &diags {
|
||
grouped.entry(&d.path).or_default().push(d);
|
||
}
|
||
|
||
for (path, issues) in &grouped {
|
||
println!("{}", style(path).blue().underlined());
|
||
for d in issues {
|
||
println!(
|
||
" {:>4}:{:<4} {} {}",
|
||
d.line,
|
||
d.col,
|
||
d.severity.colored_tag(),
|
||
style(&d.id).bold()
|
||
);
|
||
}
|
||
println!();
|
||
}
|
||
|
||
println!(
|
||
"{} '{}' generated {} issues.",
|
||
style("warning").yellow().bold(),
|
||
style(project_name).white().bold(),
|
||
style(diags.len()).bold()
|
||
);
|
||
println!("\t");
|
||
}
|
||
Ok(())
|
||
}
|
||
|
||
// --------------------------------------------------------------------------------------------
|
||
// Two‑pass scanning (no index)
|
||
// --------------------------------------------------------------------------------------------
|
||
|
||
/// Walk the filesystem and perform a two‑pass scan:
|
||
///
|
||
/// **Pass 1** – Parse every file and extract function summaries.
|
||
/// **Pass 2** – Re‑parse every file and run taint analysis with the
|
||
/// merged cross‑file summaries.
|
||
///
|
||
/// AST pattern queries are run during pass 2 (they don't depend on summaries).
|
||
pub(crate) fn scan_filesystem(
|
||
root: &Path,
|
||
cfg: &Config,
|
||
show_progress: bool,
|
||
) -> NyxResult<Vec<Diag>> {
|
||
// ── Collect file list ────────────────────────────────────────────────
|
||
let all_paths: Vec<PathBuf> = {
|
||
let _span = tracing::info_span!("walk_files").entered();
|
||
let (rx, handle) = spawn_file_walker(root, cfg);
|
||
// Drain the channel BEFORE joining the walker thread.
|
||
// The channel is bounded, so joining first would deadlock once
|
||
// the walker fills it and blocks on send.
|
||
let paths: Vec<PathBuf> = rx.into_iter().flatten().collect();
|
||
if let Err(err) = handle.join() {
|
||
tracing::error!("walker thread panicked: {:#?}", err);
|
||
}
|
||
paths
|
||
};
|
||
tracing::info!(file_count = all_paths.len(), "file walk complete");
|
||
|
||
let needs_taint = cfg.scanner.mode == crate::utils::config::AnalysisMode::Full
|
||
|| cfg.scanner.mode == crate::utils::config::AnalysisMode::Taint;
|
||
|
||
if !needs_taint {
|
||
// ── AST-only: single fused pass (no cross-file context needed) ──
|
||
let _span = tracing::info_span!("ast_only_analysis", files = all_paths.len()).entered();
|
||
let pb = make_progress_bar(all_paths.len() as u64, "Running analysis", show_progress);
|
||
|
||
let mut diags: Vec<Diag> = all_paths
|
||
.par_iter()
|
||
.flat_map_iter(|path| {
|
||
let result = match analyse_file_fused(
|
||
&std::fs::read(path).unwrap_or_default(),
|
||
path,
|
||
cfg,
|
||
None,
|
||
Some(root),
|
||
) {
|
||
Ok(r) => r.diags,
|
||
Err(e) => {
|
||
tracing::warn!("analysis: {}: {e}", path.display());
|
||
vec![]
|
||
}
|
||
};
|
||
pb.inc(1);
|
||
result
|
||
})
|
||
.collect();
|
||
pb.finish_and_clear();
|
||
|
||
if let Some(max) = cfg.output.max_results {
|
||
diags.truncate(max as usize);
|
||
}
|
||
return Ok(diags);
|
||
}
|
||
|
||
// ── Taint mode: two-pass with fused pass 1 ──────────────────────────
|
||
//
|
||
// Pass 1 (fused): parse + CFG (once!) → extract summaries + run
|
||
// AST queries + local taint + CFG structural analyses.
|
||
// Summaries are collected for the cross-file merge.
|
||
//
|
||
// Pass 2: re-run full analysis with global summaries injected.
|
||
// This requires a second parse+CFG, but ONLY for taint-mode files
|
||
// that need cross-file context. For repos where most functions
|
||
// don't have unresolved callees, pass 1 results are already correct.
|
||
|
||
// ── Pass 1: fused summary extraction + parallel merge ──────────────
|
||
//
|
||
// Each rayon thread builds a local `GlobalSummaries` from its chunk,
|
||
// then the per-thread maps are merged in a binary reduce tree.
|
||
// This eliminates the serial merge_summaries bottleneck.
|
||
let global_summaries: GlobalSummaries = {
|
||
let _span = tracing::info_span!("pass1_fused", files = all_paths.len()).entered();
|
||
let pb = make_progress_bar(
|
||
all_paths.len() as u64,
|
||
"Pass 1: Extracting summaries",
|
||
show_progress,
|
||
);
|
||
let root_str = root.to_string_lossy();
|
||
|
||
let gs = all_paths
|
||
.par_iter()
|
||
.fold(GlobalSummaries::new, |mut local_gs, path| {
|
||
if let Ok(bytes) = std::fs::read(path) {
|
||
match analyse_file_fused(&bytes, path, cfg, None, Some(root)) {
|
||
Ok(r) => {
|
||
for s in r.summaries {
|
||
let key = s.func_key(Some(&root_str));
|
||
local_gs.insert(key, s);
|
||
}
|
||
}
|
||
Err(e) => {
|
||
tracing::warn!("pass 1: {}: {e}", path.display());
|
||
}
|
||
}
|
||
} else {
|
||
tracing::warn!("pass 1: cannot read {}", path.display());
|
||
}
|
||
pb.inc(1);
|
||
local_gs
|
||
})
|
||
.reduce(GlobalSummaries::new, |mut a, b| {
|
||
a.merge(b);
|
||
a
|
||
});
|
||
|
||
pb.finish_and_clear();
|
||
tracing::info!("pass 1 complete");
|
||
gs
|
||
};
|
||
|
||
// ── Pass 2: re-run with cross-file global summaries ──────────────────
|
||
let mut diags: Vec<Diag> = {
|
||
let _span = tracing::info_span!("pass2_analysis", files = all_paths.len()).entered();
|
||
let pb = make_progress_bar(
|
||
all_paths.len() as u64,
|
||
"Pass 2: Running analysis",
|
||
show_progress,
|
||
);
|
||
|
||
let result: Vec<Diag> = all_paths
|
||
.par_iter()
|
||
.flat_map_iter(|path| {
|
||
let result = match run_rules_on_file(path, cfg, Some(&global_summaries), Some(root))
|
||
{
|
||
Ok(d) => d,
|
||
Err(e) => {
|
||
tracing::warn!("pass 2: {}: {e}", path.display());
|
||
vec![]
|
||
}
|
||
};
|
||
pb.inc(1);
|
||
result
|
||
})
|
||
.collect();
|
||
pb.finish_and_clear();
|
||
result
|
||
};
|
||
tracing::info!(diags = diags.len(), "pass 2 complete");
|
||
|
||
if let Some(max) = cfg.output.max_results {
|
||
diags.truncate(max as usize);
|
||
}
|
||
|
||
Ok(diags)
|
||
}
|
||
|
||
// --------------------------------------------------------------------------------------------
|
||
// Two‑pass scanning (with index)
|
||
// --------------------------------------------------------------------------------------------
|
||
|
||
/// Indexed two‑pass scan:
|
||
///
|
||
/// **Pass 1** – For every file that needs scanning, extract summaries and
|
||
/// persist them to the database. Unchanged files keep their
|
||
/// existing summaries.
|
||
/// **Pass 2** – Load *all* summaries from the DB, merge them, and re‑run
|
||
/// taint analysis on every file with the full cross‑file view.
|
||
/// Files whose *own* code has not changed AND whose
|
||
/// dependencies have not changed can serve cached issues
|
||
/// instead. (Today we conservatively re‑analyse every file in
|
||
/// pass 2; caching will be refined in approach 2 / 3.)
|
||
pub fn scan_with_index_parallel(
|
||
project: &str,
|
||
pool: Arc<Pool<SqliteConnectionManager>>,
|
||
cfg: &Config,
|
||
show_progress: bool,
|
||
) -> NyxResult<Vec<Diag>> {
|
||
let files = {
|
||
let idx = Indexer::from_pool(project, &pool)?;
|
||
idx.get_files(project)?
|
||
};
|
||
|
||
let needs_taint = cfg.scanner.mode == crate::utils::config::AnalysisMode::Full
|
||
|| cfg.scanner.mode == crate::utils::config::AnalysisMode::Taint;
|
||
|
||
// ── Pass 1: ensure summaries are up‑to‑date ──────────────────────────
|
||
if needs_taint {
|
||
let _span = tracing::info_span!("pass1_indexed", files = files.len()).entered();
|
||
let pb = make_progress_bar(
|
||
files.len() as u64,
|
||
"Pass 1: Extracting summaries",
|
||
show_progress,
|
||
);
|
||
|
||
files.par_iter().for_each_init(
|
||
|| Indexer::from_pool(project, &pool).expect("db pool"),
|
||
|idx, path| {
|
||
// Read once, hash once — use the hash for the change check
|
||
// to avoid a second file read inside should_scan.
|
||
if let Ok(bytes) = std::fs::read(path) {
|
||
let hash = Indexer::digest_bytes(&bytes);
|
||
let needs_scan = idx.should_scan_with_hash(path, &hash).unwrap_or(true);
|
||
if needs_scan {
|
||
match extract_summaries_from_bytes(&bytes, path, cfg) {
|
||
Ok(sums) => {
|
||
idx.replace_summaries_for_file(path, &hash, &sums).ok();
|
||
}
|
||
Err(e) => {
|
||
tracing::warn!("pass 1: {}: {e}", path.display());
|
||
}
|
||
}
|
||
}
|
||
} else {
|
||
tracing::warn!("pass 1: cannot read {}", path.display());
|
||
}
|
||
pb.inc(1);
|
||
},
|
||
);
|
||
pb.finish_and_clear();
|
||
}
|
||
|
||
// ── Load global summaries ────────────────────────────────────────────
|
||
let global_summaries: Option<GlobalSummaries> = if needs_taint {
|
||
let _span = tracing::info_span!("load_summaries_db").entered();
|
||
let idx = Indexer::from_pool(project, &pool)?;
|
||
let all = idx.load_all_summaries()?;
|
||
tracing::info!(summaries = all.len(), "loaded cross-file summaries from DB");
|
||
Some(summary::merge_summaries(all, None))
|
||
} else {
|
||
None
|
||
};
|
||
|
||
// ── Pass 2: full analysis ────────────────────────────────────────────
|
||
let _span = tracing::info_span!("pass2_indexed").entered();
|
||
let pb2 = make_progress_bar(
|
||
files.len() as u64,
|
||
"Pass 2: Running analysis",
|
||
show_progress,
|
||
);
|
||
let diag_map: DashMap<String, Vec<Diag>> = DashMap::new();
|
||
|
||
files.into_par_iter().for_each_init(
|
||
|| Indexer::from_pool(project, &pool).expect("db pool"),
|
||
|idx, path| {
|
||
// Read file once for both change-detection and analysis.
|
||
let bytes_opt = std::fs::read(&path).ok();
|
||
let hash = bytes_opt.as_ref().map(|b| Indexer::digest_bytes(b));
|
||
|
||
// In pass 2 we always re-analyse when taint is enabled because
|
||
// global summaries may have changed even if this file didn't.
|
||
// For AST-only mode, we can still use the cached issues.
|
||
let needs_scan = if needs_taint {
|
||
true // conservative: always re-analyse in taint mode
|
||
} else {
|
||
match (&hash, &bytes_opt) {
|
||
(Some(h), _) => idx.should_scan_with_hash(&path, h).unwrap_or(true),
|
||
_ => true,
|
||
}
|
||
};
|
||
|
||
let mut diags = if needs_scan {
|
||
let d = match &bytes_opt {
|
||
Some(bytes) => {
|
||
run_rules_on_bytes(bytes, &path, cfg, global_summaries.as_ref(), None)
|
||
.unwrap_or_default()
|
||
}
|
||
None => run_rules_on_file(&path, cfg, global_summaries.as_ref(), None)
|
||
.unwrap_or_default(),
|
||
};
|
||
|
||
// Persist issues + update file record (use pre-computed hash)
|
||
let file_id = match &hash {
|
||
Some(h) => idx.upsert_file_with_hash(&path, h).unwrap_or_default(),
|
||
None => idx.upsert_file(&path).unwrap_or_default(),
|
||
};
|
||
idx.replace_issues(
|
||
file_id,
|
||
d.iter().map(|d| IssueRow {
|
||
rule_id: &d.id,
|
||
severity: d.severity.as_db_str(),
|
||
line: d.line as i64,
|
||
col: d.col as i64,
|
||
}),
|
||
)
|
||
.ok();
|
||
d
|
||
} else {
|
||
idx.get_issues_from_file(&path).unwrap_or_default()
|
||
};
|
||
|
||
match cfg.scanner.mode {
|
||
crate::utils::config::AnalysisMode::Ast => {
|
||
diags.retain(|d| !d.id.starts_with("taint") && !d.id.starts_with("cfg-"));
|
||
}
|
||
crate::utils::config::AnalysisMode::Taint => {
|
||
diags.retain(|d| d.id.starts_with("taint") || d.id.starts_with("cfg-"));
|
||
}
|
||
crate::utils::config::AnalysisMode::Full => {}
|
||
}
|
||
|
||
if !diags.is_empty() {
|
||
diag_map
|
||
.entry(path.to_string_lossy().to_string())
|
||
.or_default()
|
||
.append(&mut diags);
|
||
}
|
||
pb2.inc(1);
|
||
},
|
||
);
|
||
pb2.finish_and_clear();
|
||
|
||
let mut diags: Vec<Diag> = diag_map.into_iter().flat_map(|(_, v)| v).collect();
|
||
|
||
if let Some(max) = cfg.output.max_results {
|
||
diags.truncate(max as usize);
|
||
}
|
||
|
||
Ok(diags)
|
||
}
|
||
|
||
/// Builds an index for a project containing one plain-text file, then checks
/// that an index-backed scan over it succeeds and reports no findings.
#[test]
fn scan_with_index_parallel_uses_existing_index_without_rescanning() {
    // Small, single-threaded configuration.
    let cfg = {
        let mut c = Config::default();
        c.performance.worker_threads = Some(1);
        c.performance.channel_multiplier = 1;
        c.performance.batch_size = 2;
        c
    };

    // Throw-away project directory holding exactly one file.
    let tmp = tempfile::tempdir().unwrap();
    let project_root = tmp.path().join("proj");
    std::fs::create_dir(&project_root).unwrap();
    std::fs::write(project_root.join("foo.txt"), "abc").unwrap();

    let (project_name, db_path) = get_project_info(&project_root, tmp.path()).unwrap();
    crate::commands::index::build_index(&project_name, &project_root, &db_path, &cfg, false)
        .unwrap();

    let pool = Indexer::init(&db_path).unwrap();

    // The index must know about the single file written above.
    let indexed_files = Indexer::from_pool(&project_name, &pool)
        .unwrap()
        .get_files(&project_name)
        .unwrap();
    assert_eq!(indexed_files.len(), 1);

    let findings = scan_with_index_parallel(&project_name, Arc::clone(&pool), &cfg, false)
        .expect("scan should succeed");

    assert!(findings.is_empty());
}
|