From 80c0bc9845fa713b73236156f0382bca004e8679 Mon Sep 17 00:00:00 2001 From: elipeter Date: Mon, 23 Jun 2025 16:51:39 +0200 Subject: [PATCH] Add `max_file_size_mb` and `high_only` logic to scanning process: - Implement `max_file_size_mb` to restrict files for scanning based on size. - Refactor `high_only` handling to modify `min_severity` in `Config`. - Update `ScannerConfig` to use `Option` for optional size limits. - Remove redundant `high_only` parameter from `scan::handle` function. - Improve batch processing in `walk` for efficient file scanning. --- logs/nano-scanner.log.2025-06-16-23 | 0 src/commands/mod.rs | 7 +++++-- src/commands/scan.rs | 3 +-- src/main.rs | 4 ++-- src/utils/config.rs | 18 ++++++++--------- src/walk.rs | 30 +++++++++++++++++++++++++++++ 6 files changed, 47 insertions(+), 15 deletions(-) delete mode 100644 logs/nano-scanner.log.2025-06-16-23 diff --git a/logs/nano-scanner.log.2025-06-16-23 b/logs/nano-scanner.log.2025-06-16-23 deleted file mode 100644 index e69de29b..00000000 diff --git a/src/commands/mod.rs b/src/commands/mod.rs index c0d80f38..b18f60f3 100644 --- a/src/commands/mod.rs +++ b/src/commands/mod.rs @@ -5,16 +5,19 @@ pub mod clean; use crate::cli::Commands; use std::path::Path; +use crate::patterns::Severity; use crate::utils::config::Config; pub fn handle_command( command: Commands, database_dir: &Path, - config: &Config + config: &mut Config ) -> Result<(), Box> { match command { Commands::Scan { path, no_index, rebuild_index, format, high_only } => { - scan::handle(&path, no_index, rebuild_index, format, high_only, database_dir, config) + if high_only { config.scanner.min_severity = Severity::High }; + + scan::handle(&path, no_index, rebuild_index, format, database_dir, config) } Commands::Index { action } => { index::handle(action, database_dir, config) diff --git a/src/commands/scan.rs b/src/commands/scan.rs index bbe87480..3450e1a4 100644 --- a/src/commands/scan.rs +++ b/src/commands/scan.rs @@ -30,7 +30,6 @@ pub fn handle( no_index: bool, rebuild_index: bool, format: String, - high_only: bool, database_dir: &Path, config: &Config, ) -> Result<(), Box> { @@ -52,7 +51,7 @@ pub fn handle( if format == "console" || format == "" && config.output.default_format == "console" { for d in &diags { - if high_only && d.severity != Severity::High { + if d.severity != Severity::High { continue; } let sev_str = match d.severity { diff --git a/src/main.rs b/src/main.rs index 472a943c..562f56fa 100644 --- a/src/main.rs +++ b/src/main.rs @@ -53,9 +53,9 @@ fn main() -> Result<(), Box> { let database_dir = proj_dirs.data_local_dir(); fs::create_dir_all(database_dir)?; - let config = Config::load(config_dir)?; + let mut config = Config::load(config_dir)?; - commands::handle_command(cli.command, database_dir, &config)?; + commands::handle_command(cli.command, database_dir, &mut config)?; Ok(()) } diff --git a/src/utils/config.rs b/src/utils/config.rs index d90971d0..2418d3c2 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -10,25 +10,25 @@ pub struct ScannerConfig { /// The minimum severity level to output pub min_severity: Severity, - /// The maximum file size to scan, in megabytes. TODO: IMPLEMENT - pub max_file_size_mb: u64, + /// The maximum file size to scan, in megabytes. + pub max_file_size_mb: Option, - /// File extensions to exclude from scanning. TODO: IMPLEMENT + /// File extensions to exclude from scanning. pub excluded_extensions: Vec, - /// Directories to exclude from scanning. TODO: IMPLEMENT + /// Directories to exclude from scanning. pub excluded_directories: Vec, /// Excluded files pub excluded_files: Vec, - /// Whether to respect the global ignore file or not. TODO: IMPLEMENT + /// Whether to respect the global ignore file or not. pub read_global_ignore: bool, - /// Whether to respect VCS ignore files (`.gitignore`, ..) or not. TODO: IMPLEMENT + /// Whether to respect VCS ignore files (`.gitignore`, ..) or not. pub read_vcsignore: bool, - /// Whether to require a `.git` directory to respect gitignore files. TODO: IMPLEMENT + /// Whether to require a `.git` directory to respect gitignore files. pub require_git_to_read_vcsignore: bool, /// Whether to limit the search to starting file system or not. @@ -37,14 +37,14 @@ pub struct ScannerConfig { /// Whether to follow symlinks or not. pub follow_symlinks: bool, - /// Whether to scan hidden files or not. TODO: IMPLEMENT + /// Whether to scan hidden files or not. pub scan_hidden_files: bool, } impl Default for ScannerConfig { fn default() -> Self { Self { min_severity: Severity::Low, - max_file_size_mb: 100, + max_file_size_mb: None, excluded_extensions: vec![ "jpg", "png", "gif", "mp4", "avi", "mkv", "zip", "tar", "gz", "exe", "dll", "so", diff --git a/src/walk.rs b/src/walk.rs index fd35e66b..b0b9b737 100644 --- a/src/walk.rs +++ b/src/walk.rs @@ -63,6 +63,7 @@ pub fn spawn_senders( let root = root.to_path_buf(); let scan_hidden = cfg.scanner.scan_hidden_files; let follow_links = cfg.scanner.follow_symlinks; + let max_bytes: u64 = (cfg.scanner.max_file_size_mb.unwrap_or(0)) * 1_048_576; thread::spawn(move || { let walker = WalkBuilder::new(root) @@ -72,6 +73,8 @@ pub fn spawn_senders( .overrides(overrides) .build_parallel(); + + /* walker.run(move || { let tx = tx.clone(); let mut batch = Vec::::with_capacity(256); @@ -94,6 +97,33 @@ pub fn spawn_senders( WalkState::Continue }) }); + */ + walker.run(move || { + let mut batcher = Batcher { + tx: tx.clone(), + batch: Vec::with_capacity(BATCH_SIZE), + }; + + Box::new(move |entry| { + tracing::debug!("walking: {:?}", entry); + let e = match entry { + Ok(e) if e.file_type().map(|ft| ft.is_file()).unwrap_or(false) => e, + _ => return WalkState::Continue, + }; + if max_bytes != 0 { + match e.metadata() { + Ok(m) if m.len() <= max_bytes => {}, + _ => return WalkState::Continue, + } + } + tracing::debug!("scanning file: {:?}", e); + batcher.push(e.into_path()); + if batcher.batch.len() == BATCH_SIZE { + let _ = batcher.tx.send(std::mem::take(&mut batcher.batch)); + } + WalkState::Continue + }) + }); });