// nyx/src/commands/scan.rs

pub(crate) use crate::ast::run_rules_on_file;
use crate::database::index::{Indexer, IssueRow};
use crate::errors::NyxResult;
use crate::patterns::Severity;
use crate::utils::config::Config;
use crate::utils::project::get_project_info;
use crate::walk::spawn_senders;
use console::style;
use dashmap::DashMap;
use r2d2::Pool;
use r2d2_sqlite::SqliteConnectionManager;
use rayon::prelude::*;
use std::collections::BTreeMap;
use std::path::Path;
use std::sync::{Arc, Mutex};
/// Boxed, thread-safe error type; used as the error side of the parallel
/// closure in `scan_filesystem`.
type DynError = Box<dyn std::error::Error + Send + Sync>;
/// A single finding produced by a scan.
#[derive(Debug)]
pub struct Diag {
/// Path of the file the finding belongs to; `handle` groups its console output by this value.
pub(crate) path: String,
/// Line of the finding. NOTE(review): whether this is 0- or 1-based is not visible here — confirm.
pub(crate) line: usize,
/// Column of the finding (same caveat as `line`).
pub(crate) col: usize,
/// Severity attached to the finding; printed with `Display` and stored via `as_db_str()`.
pub(crate) severity: Severity,
/// Rule identifier; persisted as `rule_id` in the index. Identifiers starting with
/// `taint` are what the analysis-mode filter treats as taint findings.
pub(crate) id: String,
}
/// Entry point called by the CLI.
pub fn handle(
path: &str,
no_index: bool,
rebuild_index: bool,
format: String,
database_dir: &Path,
config: &Config,
) -> NyxResult<()> {
let scan_path = Path::new(path).canonicalize()?;
let (project_name, db_path) = get_project_info(&scan_path, database_dir)?;
println!(
"{} {}...\n",
style("Checking").green().bold(),
&project_name
);
let diags: Vec<Diag> = if no_index {
scan_filesystem(&scan_path, config)?
} else {
if rebuild_index || !db_path.exists() {
tracing::debug!("Scanning filesystem index filesystem");
crate::commands::index::build_index(&project_name, &scan_path, &db_path, config)?;
}
let pool = Indexer::init(&db_path)?;
scan_with_index_parallel(&project_name, pool, config)?
};
tracing::debug!("Found {:?} issues.", diags.len());
if format == "console" || (format.is_empty() && config.output.default_format == "console") {
tracing::debug!("Printing to console");
let mut grouped: BTreeMap<&str, Vec<&Diag>> = BTreeMap::new();
for d in &diags {
grouped.entry(&d.path).or_default().push(d);
}
for (path, issues) in &grouped {
println!("{}", style(path).blue().underlined());
for d in issues {
println!(
Added experimental control flow analysis and syntax classification for rust lang (#22) * Introduce control flow graph (CFG) support: - Added `cfg.rs` with CFG generation and analysis utilities. - Integrated `petgraph` library for graph-based computations. - Updated `ast.rs` to utilize CFG for function analysis. - Modified `Cargo.toml` and `Cargo.lock` to include new dependencies. - Improved static analysis with taint tracking through CFG paths. * feat: enhance control flow analysis with taint tracking and node labeling * feat: improve control flow graph with enhanced node handling and new tests * Remove unnecessary reference marker in `byte_offset_to_point` comment. * Remove unnecessary reference marker in `byte_offset_to_point` comment. * Refactor `ast.rs` for performance and clarity; enhance `cfg.rs` with recursive CFG generation and improved classification logic for AST analysis. * Refactor CFG and taint tracking logic: - Enhanced `cfg.rs` with inline helper function `text_of` for cleaner UTF-8 handling in AST nodes. - Expanded `labels.rs` rules with detailed `Sources`, `Sanitizers`, and `Sinks` for improved classification. - Refined `push_node` to handle method call expressions with object-function pairing. - Simplified code handling in trivia skipping and debug-only logic. * Enhance `cfg.rs` with `first_call_ident` helper and improve identifier extraction logic in `push_node`. * Add targeted CFG taint-tracking tests to enhance analysis coverage. * Enhance CFG generation with loop expression handling and improve taint tracking logic. Add new sanitization example in `examples/sanitize/example.rs`. * Update README with installation instructions for Cargo and GitHub releases. * Expand taint-tracking with precise `def-use` computation and enhance `labels.rs` for detailed classification. Extend `examples/sanitize` with realistic scenarios demonstrating new rules. * Refactor `labels.rs`: - Removed redundant `LabelRule` entries for cleaner rule definitions. 
- Adjusted matching logic to prioritize suffix and prefix matches effectively. * Refactor `labels.rs`: - Removed redundant `LabelRule` entries for cleaner rule definitions. - Adjusted matching logic to prioritize suffix and prefix matches effectively. * Add test for taint tracking with multiple sources in `cfg.rs`. * Add `function_summaries` table and implement summary upsert/load methods. Refactor to handle summary storage and retrieval efficiently, with placeholder clean/drop logic. * refactor: split `labels.rs` into modular structure with language-specific files * refactor: split `labels.rs` into modular structure with language-specific files * refactor: clean up SQL table definitions in `database.rs` for better readability * refactor: simplify CFG structure by removing lifetime parameters and enhancing taint metadata handling * refactor: update TODO comments in `cfg.rs` to clarify future enhancements for cap labels and function details * refactor: remove redundant header from README.md for improved clarity * feat: add PHF-based syntax classifiers and Kind enum for efficient syntax mapping across languages * feat: introduce analysis modes for enhanced scanner configuration and diagnostics * feat: define Kind enum for syntax classification in control flow analysis * feat: bump version to 0.2.0-alpha and update CHANGELOG for new features and fixes * refactor: clean up imports and formatting in AST and CFG modules for improved readability * refactor: simplify function signatures and improve code readability in CFG and module files * fix: correct rayon_thread_stack_size comment to reflect actual value of 8 MiB * refactor: update string formatting in clean and project modules for consistency * refactor: fix indentation in clean.rs for improved readability --------- Co-authored-by: elipeter <eli.peter@es.fcm.travel>
2025-06-28 17:36:14 +02:00
" {:>4}:{:<4} [{:}] {:}",
d.line,
d.col,
d.severity,
style(&d.id).bold()
);
}
println!();
}
println!(
"{} '{}' generated {} issues.",
style("warning").yellow().bold(),
style(project_name).white().bold(),
style(diags.len()).bold()
);
println!("\t"); // TODO: Add individual counts for different warning levels
}
Ok(())
}
// --------------------------------------------------------------------------------------------
// Scanning helpers
// --------------------------------------------------------------------------------------------
/// Analyses the files under `root` directly, without consulting the index.
///
/// Paths are received from the channel returned by `spawn_senders` and
/// analysed in parallel. An analysis error stops the scan and is returned.
///
/// The collected diagnostics are ordered by file path before
/// `cfg.output.max_results` is applied, so the subset that survives the cap
/// does not depend on thread scheduling. The sort is stable, so diagnostics
/// of one file keep the order in which the analyser produced them.
///
/// # Errors
/// Returns the error of a failed `run_rules_on_file` call, or an error if the
/// accumulator mutex was poisoned.
fn scan_filesystem(root: &Path, cfg: &Config) -> NyxResult<Vec<Diag>> {
    let rx = spawn_senders(root, cfg);
    let acc: Mutex<Vec<Diag>> = Mutex::new(Vec::new());

    // Each item received from the channel is itself iterated (`flatten`)
    // before the individual paths are spread across the rayon pool.
    rx.into_iter().flatten().par_bridge().try_for_each(|path| {
        let mut local = run_rules_on_file(&path, cfg)?;
        // Appending a whole file's results under one lock keeps them contiguous.
        acc.lock().unwrap().append(&mut local);
        Ok::<(), DynError>(())
    })?;

    let mut diags = acc.into_inner()?;

    // `par_bridge` gives no ordering guarantee; sort before truncating so the
    // result is reproducible between runs.
    diags.sort_by(|a, b| a.path.cmp(&b.path));

    if let Some(max) = cfg.output.max_results {
        diags.truncate(max as usize);
    }
    Ok(diags)
}
pub fn scan_with_index_parallel(
project: &str,
pool: Arc<Pool<SqliteConnectionManager>>,
cfg: &Config,
) -> NyxResult<Vec<Diag>> {
let files = {
let idx = Indexer::from_pool(project, &pool)?;
idx.get_files(project)?
};
let diag_map: DashMap<String, Vec<Diag>> = DashMap::new();
files.into_par_iter().for_each_init(
|| Indexer::from_pool(project, &pool).expect("db pool"),
|idx, path| {
let needs_scan = idx.should_scan(&path).unwrap_or(true);
let mut diags = if needs_scan {
let d = run_rules_on_file(&path, cfg).unwrap_or_default();
let file_id = idx.upsert_file(&path).unwrap_or_default();
idx.replace_issues(
file_id,
d.iter().map(|d| IssueRow {
rule_id: &d.id,
severity: d.severity.as_db_str(),
line: d.line as i64,
col: d.col as i64,
}),
)
.ok();
d
} else {
idx.get_issues_from_file(&path).unwrap_or_default()
};
Added experimental control flow analysis and syntax classification for rust lang (#22) * Introduce control flow graph (CFG) support: - Added `cfg.rs` with CFG generation and analysis utilities. - Integrated `petgraph` library for graph-based computations. - Updated `ast.rs` to utilize CFG for function analysis. - Modified `Cargo.toml` and `Cargo.lock` to include new dependencies. - Improved static analysis with taint tracking through CFG paths. * feat: enhance control flow analysis with taint tracking and node labeling * feat: improve control flow graph with enhanced node handling and new tests * Remove unnecessary reference marker in `byte_offset_to_point` comment. * Remove unnecessary reference marker in `byte_offset_to_point` comment. * Refactor `ast.rs` for performance and clarity; enhance `cfg.rs` with recursive CFG generation and improved classification logic for AST analysis. * Refactor CFG and taint tracking logic: - Enhanced `cfg.rs` with inline helper function `text_of` for cleaner UTF-8 handling in AST nodes. - Expanded `labels.rs` rules with detailed `Sources`, `Sanitizers`, and `Sinks` for improved classification. - Refined `push_node` to handle method call expressions with object-function pairing. - Simplified code handling in trivia skipping and debug-only logic. * Enhance `cfg.rs` with `first_call_ident` helper and improve identifier extraction logic in `push_node`. * Add targeted CFG taint-tracking tests to enhance analysis coverage. * Enhance CFG generation with loop expression handling and improve taint tracking logic. Add new sanitization example in `examples/sanitize/example.rs`. * Update README with installation instructions for Cargo and GitHub releases. * Expand taint-tracking with precise `def-use` computation and enhance `labels.rs` for detailed classification. Extend `examples/sanitize` with realistic scenarios demonstrating new rules. * Refactor `labels.rs`: - Removed redundant `LabelRule` entries for cleaner rule definitions. 
- Adjusted matching logic to prioritize suffix and prefix matches effectively. * Refactor `labels.rs`: - Removed redundant `LabelRule` entries for cleaner rule definitions. - Adjusted matching logic to prioritize suffix and prefix matches effectively. * Add test for taint tracking with multiple sources in `cfg.rs`. * Add `function_summaries` table and implement summary upsert/load methods. Refactor to handle summary storage and retrieval efficiently, with placeholder clean/drop logic. * refactor: split `labels.rs` into modular structure with language-specific files * refactor: split `labels.rs` into modular structure with language-specific files * refactor: clean up SQL table definitions in `database.rs` for better readability * refactor: simplify CFG structure by removing lifetime parameters and enhancing taint metadata handling * refactor: update TODO comments in `cfg.rs` to clarify future enhancements for cap labels and function details * refactor: remove redundant header from README.md for improved clarity * feat: add PHF-based syntax classifiers and Kind enum for efficient syntax mapping across languages * feat: introduce analysis modes for enhanced scanner configuration and diagnostics * feat: define Kind enum for syntax classification in control flow analysis * feat: bump version to 0.2.0-alpha and update CHANGELOG for new features and fixes * refactor: clean up imports and formatting in AST and CFG modules for improved readability * refactor: simplify function signatures and improve code readability in CFG and module files * fix: correct rayon_thread_stack_size comment to reflect actual value of 8 MiB * refactor: update string formatting in clean and project modules for consistency * refactor: fix indentation in clean.rs for improved readability --------- Co-authored-by: elipeter <eli.peter@es.fcm.travel>
2025-06-28 17:36:14 +02:00
match cfg.scanner.mode {
crate::utils::config::AnalysisMode::Ast => {
diags.retain(|d| !d.id.starts_with("taint"));
}
crate::utils::config::AnalysisMode::Taint => {
diags.retain(|d| d.id.starts_with("taint"));
}
crate::utils::config::AnalysisMode::Full => {}
}
if !diags.is_empty() {
diag_map
.entry(path.to_string_lossy().to_string())
.or_default()
.append(&mut diags);
}
},
);
// Optional, heavy: only vacuum on --rebuild-index
// if rebuild { idx.vacuum()?; }
let mut diags: Vec<Diag> = diag_map.into_iter().flat_map(|(_, v)| v).collect();
if let Some(max) = cfg.output.max_results {
diags.truncate(max as usize);
}
Ok(diags)
}
/// Builds an index for a temporary project containing a single text file and
/// checks that an indexed scan over it succeeds and reports no diagnostics.
#[test]
fn scan_with_index_parallel_uses_existing_index_without_rescanning() {
    // Keep the run small: one worker and minimal batching.
    let mut config = Config::default();
    config.performance.worker_threads = Some(1);
    config.performance.channel_multiplier = 1;
    config.performance.batch_size = 2;

    // Project layout: <tmp>/proj/foo.txt, with the database placed under <tmp>.
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path().join("proj");
    std::fs::create_dir(&root).unwrap();
    std::fs::write(root.join("foo.txt"), "abc").unwrap();

    let (name, db_file) = get_project_info(&root, tmp.path()).unwrap();
    crate::commands::index::build_index(&name, &root, &db_file, &config).unwrap();

    let pool = Indexer::init(&db_file).unwrap();

    // The index must list exactly the one file that was created.
    let indexed = Indexer::from_pool(&name, &pool)
        .unwrap()
        .get_files(&name)
        .unwrap();
    assert_eq!(indexed.len(), 1);

    let found = scan_with_index_parallel(&name, Arc::clone(&pool), &config)
        .expect("scan should succeed");
    assert!(found.is_empty());
}