From ada7835efa57bd0354c9cd788f27a48fdc6ec7dd Mon Sep 17 00:00:00 2001 From: elipeter Date: Mon, 16 Jun 2025 23:47:50 +0200 Subject: [PATCH] - Remove unused `filetypes.rs` and `walk.rs` modules - Introduce `index.rs` for file indexing using SQLite - Expand configuration options in `config.rs`, including `excluded_files` - Update dependencies in `Cargo.toml` to include SQLite, hashing, and regex libraries --- Cargo.lock | 323 ++++++++++++++++-- Cargo.toml | 14 +- README.md | 4 + src/commands/scan.rs | 70 +++- src/exit_codes.rs | 0 src/filetypes.rs | 43 --- src/index.rs | 77 +++++ src/main.rs | 6 +- src/utils/config.rs | 13 +- src/utils/mod.rs | 2 +- src/walk.rs | 765 ++++++------------------------------------- 11 files changed, 551 insertions(+), 766 deletions(-) create mode 100644 README.md delete mode 100644 src/exit_codes.rs delete mode 100644 src/filetypes.rs create mode 100644 src/index.rs diff --git a/Cargo.lock b/Cargo.lock index 667d9ebb..47b5bb14 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6,14 +6,22 @@ version = 4 name = "Nano" version = "0.1.0" dependencies = [ + "anyhow", + "blake3", "clap", + "crossbeam-channel", "directories", + "ignore", + "nix", + "num_cpus", + "regex", + "rusqlite", "serde", - "time", "toml", "tracing", - "tracing-appender", "tracing-subscriber", + "tree-sitter", + "tree-sitter-rust", ] [[package]] @@ -75,18 +83,74 @@ dependencies = [ "windows-sys", ] +[[package]] +name = "anyhow" +version = "1.0.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "bitflags" version = "2.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" +[[package]] +name = "blake3" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3888aaa89e4b2a40fca9848e400f6a658a5a3978de7be858e209cafa8be9a4a0" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + +[[package]] +name = "bstr" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234113d19d0d7d613b40e86fb654acf958910802bcceab913a4f9e7cda03b1a4" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "cc" +version = "1.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d487aa071b5f64da6f19a3e848e3578944b726ee5a4854b82172f02aa876bfdc" +dependencies = [ + "shlex", +] + [[package]] name = "cfg-if" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + [[package]] name = "clap" version = "4.5.40" @@ -133,6 +197,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "crossbeam-channel" version = "0.5.15" @@ -142,6 +212,25 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "crossbeam-utils" version = "0.8.21" @@ -184,6 +273,24 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + [[package]] name = "getrandom" version = "0.2.16" @@ -195,11 +302,36 @@ dependencies = [ "wasi", ] +[[package]] +name = "globset" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54a1028dfc5f5df5da8a56a73e6c153c9a9708ec57232470703592a3f18e49f5" +dependencies = [ + "aho-corasick", + "bstr", + "log", + "regex-automata 0.4.9", + "regex-syntax 0.8.5", +] + [[package]] name = "hashbrown" version = "0.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" +dependencies = [ + "foldhash", +] + +[[package]] +name = "hashlink" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" +dependencies = [ + "hashbrown", +] [[package]] name = "heck" @@ -207,6 +339,28 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "ignore" +version = "0.4.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89fd380afde86567dfba715db065673989d6253f42b88179abd3eae47bda4b" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata 0.4.9", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "indexmap" version = "2.9.0" @@ -251,6 +405,16 @@ dependencies = [ "libc", ] +[[package]] +name = "libsqlite3-sys" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91632f3b4fb6bd1d72aa3d78f41ffecfcf2b1a6648d8c241dbe7dbfaf4875e15" +dependencies = [ + "pkg-config", + "vcpkg", +] + [[package]] name = "log" version = "0.4.27" @@ -272,6 +436,18 @@ version = "2.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" +[[package]] +name = "nix" +version = "0.30.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74523f3a35e05aba87a1d978330aef40f67b0304ac79c1c00b294c9830543db6" +dependencies = [ + "bitflags", + "cfg-if", + "cfg_aliases", + "libc", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -288,6 +464,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + "hermit-abi", + "libc", +] + [[package]] name = "once_cell" version = "1.21.3" @@ -318,6 +504,12 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + [[package]] name = "powerfmt" version = "0.2.0" @@ -350,7 +542,7 @@ checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b" dependencies = [ "getrandom", "libredox", - "thiserror 2.0.12", + "thiserror", ] [[package]] @@ -397,12 +589,35 @@ version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "rusqlite" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3de23c3319433716cf134eed225fe9986bc24f63bed9be9f20c329029e672dc7" +dependencies = [ + "bitflags", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "smallvec", +] + [[package]] name = "ryu" version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "serde" version = "1.0.219" @@ -429,6 +644,7 @@ version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ + "indexmap", "itoa", "memchr", "ryu", @@ -453,12 +669,24 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "smallvec" version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + [[package]] name = "strsim" version = "0.11.1" @@ -476,33 +704,13 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl 1.0.69", -] - [[package]] name = "thiserror" version = "2.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" dependencies = [ - "thiserror-impl 2.0.12", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn", + "thiserror-impl", ] [[package]] @@ -608,18 +816,6 @@ dependencies = [ "tracing-core", ] -[[package]] -name = "tracing-appender" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3566e8ce28cc0a3fe42519fc80e6b4c943cc4c8cef275620eb8dac2d3d4e06cf" -dependencies = [ - "crossbeam-channel", - "thiserror 1.0.69", - "time", - "tracing-subscriber", -] - [[package]] name = "tracing-attributes" version = "0.1.29" @@ -684,6 +880,36 @@ dependencies = [ "tracing-serde", ] +[[package]] +name = "tree-sitter" +version = "0.25.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7cf18d43cbf0bfca51f657132cc616a5097edc4424d538bae6fa60142eaf9f0" +dependencies = [ + "cc", + "regex", + "regex-syntax 0.8.5", + "serde_json", + "streaming-iterator", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-language" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4013970217383f67b18aef68f6fb2e8d409bc5755227092d32efb0422ba24b8" + +[[package]] +name = "tree-sitter-rust" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b9b18034c684a2420722be8b2a91c9c44f2546b631c039edf575ccba8c61be1" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "unicode-ident" version = "1.0.18" @@ -702,6 +928,22 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -724,6 +966,15 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys", +] + [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index 6b6ad1bd..3b53f804 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,6 +9,16 @@ clap = { version = "4.5.40", features = ["derive"] } serde = { version = "1.0.219", features = ["derive"] } toml = "0.8.23" tracing-subscriber = { version = "0.3.19", features = ["env-filter", "json", "ansi","time"] } -tracing-appender = "0.2.3" tracing = "0.1.41" -time = "0.3.41" +num_cpus = "1.17.0" + +rusqlite = "0.36.0" +anyhow = "1.0.98" + +ignore = "0.4.23" +tree-sitter = "0.25.6" +tree-sitter-rust = "0.24.0" +crossbeam-channel = "0.5.15" +regex = "1.11.1" +blake3 = "1.8.2" +nix = { version = "0.30.1", features = ["signal"] } diff --git a/README.md b/README.md new file mode 100644 index 00000000..f57ac5a2 --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +## Credit + +Portions of `walk.rs` are adapted from [fd](https://github.com/sharkdp/fd) by the fd developers (MIT License). +See [`walk.rs`](src/walk.rs) for modification details. diff --git a/src/commands/scan.rs b/src/commands/scan.rs index 44e27122..a5fa4e02 100644 --- a/src/commands/scan.rs +++ b/src/commands/scan.rs @@ -2,6 +2,9 @@ use crate::cli::OutputFormat; use crate::utils::project::get_project_info; use std::path::Path; use crate::utils::config::Config; +use tree_sitter::{Parser}; +use crate::index::index::Indexer; +use crate::walk::spawn_senders; pub fn handle( path: &str, @@ -15,13 +18,13 @@ pub fn handle( let scan_path = Path::new(path).canonicalize()?; let (project_name, db_path) = get_project_info(&scan_path, database_dir)?; - tracing::info!("Config: {:?}", config); + tracing::debug!("Config: {:?}", config); tracing::info!("Scanning project: {}", project_name); tracing::info!("Scan path: {}", scan_path.display()); if no_index { tracing::info!("Scanning without index..."); - scan_filesystem(&scan_path)?; + scan_filesystem(&scan_path, config)?; } else { if rebuild_index || !db_path.exists() { tracing::info!("Building/updating index..."); @@ -29,7 +32,7 @@ pub fn handle( } tracing::info!("Using index: {}", db_path.display()); - scan_with_index(&db_path)?; + scan_with_index(&scan_path, &db_path, config)?; } tracing::info!("Output format: {:?}", format); @@ -40,14 +43,63 @@ pub fn handle( Ok(()) } -fn scan_filesystem(path: &Path) -> Result<(), Box> { - // TODO: Implement direct filesystem scanning - tracing::info!("Direct filesystem scan of: {}", path.display()); +fn scan_filesystem(root: &Path, cfg: &Config) -> Result<(), Box> { + let rx = spawn_senders(root, cfg); + + for batch in rx.iter().flatten() { + tracing::debug!("Scanning file: {}", batch.display()); + scan_single_file(&batch, cfg)?; // <-- your actual scanner + } + Ok(()) +} +fn scan_with_index(root: &Path, db_path: &Path, cfg: &Config) -> Result<(), Box> { + let indexer = Indexer::new(db_path) + .map_err(|e| format!("opening index {}: {e}", db_path.display()))?; + + let rx = spawn_senders(root, cfg); + + for batch in rx.iter().flatten() { + let scan = indexer.should_scan(&batch)?; + tracing::debug!("Should scan: {}, file: {}", scan, batch.display()); + if scan { + tracing::debug!("Scanning file: {}", batch.display()); + scan_single_file(&batch, cfg)?; // your scanner + indexer.record_scan(&batch)?; + } + } Ok(()) } -fn scan_with_index(db_path: &Path) -> Result<(), Box> { - // TODO: Implement index-based scanning - tracing::info!("Index-based scan using: {}", db_path.display()); +fn scan_single_file( + path: &Path, + _cfg: &Config, +) -> Result<(), Box> { + if path.extension().and_then(|s| s.to_str()) != Some("rs") { + return Ok(()); + } + + let source = std::fs::read_to_string(path)?; + + let mut parser = Parser::new(); + parser.set_language(&tree_sitter_rust::LANGUAGE.into())?; + + let tree = parser.parse(&source, None).ok_or("tree-sitter failed")?; + let root = tree.root_node(); + + let mut fn_count = 0; + let mut cursor = root.walk(); + for child in root.children(&mut cursor) { + if child.kind() == "function_item" { + fn_count += 1; + } + } + + tracing::info!( + "scanned {} – found {} Rust function(s)", + path.display(), + fn_count + ); + + // TODO: real vulnerability/pattern checks go here Ok(()) } \ No newline at end of file diff --git a/src/exit_codes.rs b/src/exit_codes.rs deleted file mode 100644 index e69de29b..00000000 diff --git a/src/filetypes.rs b/src/filetypes.rs deleted file mode 100644 index a4b129db..00000000 --- a/src/filetypes.rs +++ /dev/null @@ -1,43 +0,0 @@ -// use crate::dir_entry; -// use crate::filesystem; -// -// use faccess::PathExt; -// -// /// Whether or not to show -// #[derive(Default)] -// pub struct FileTypes { -// pub files: bool, -// pub directories: bool, -// pub symlinks: bool, -// pub block_devices: bool, -// pub char_devices: bool, -// pub sockets: bool, -// pub pipes: bool, -// pub executables_only: bool, -// pub empty_only: bool, -// } -// -// impl FileTypes { -// pub fn should_ignore(&self, entry: &dir_entry::DirEntry) -> bool { -// if let Some(ref entry_type) = entry.file_type() { -// (!self.files && entry_type.is_file()) -// || (!self.directories && entry_type.is_dir()) -// || (!self.symlinks && entry_type.is_symlink()) -// || (!self.block_devices && filesystem::is_block_device(*entry_type)) -// || (!self.char_devices && filesystem::is_char_device(*entry_type)) -// || (!self.sockets && filesystem::is_socket(*entry_type)) -// || (!self.pipes && filesystem::is_pipe(*entry_type)) -// || (self.executables_only && !entry.path().executable()) -// || (self.empty_only && !filesystem::is_empty(entry)) -// || !(entry_type.is_file() -// || entry_type.is_dir() -// || entry_type.is_symlink() -// || filesystem::is_block_device(*entry_type) -// || filesystem::is_char_device(*entry_type) -// || filesystem::is_socket(*entry_type) -// || filesystem::is_pipe(*entry_type)) -// } else { -// true -// } -// } -// } \ No newline at end of file diff --git a/src/index.rs b/src/index.rs new file mode 100644 index 00000000..f925d936 --- /dev/null +++ b/src/index.rs @@ -0,0 +1,77 @@ +pub mod index { + use blake3::Hasher; + use rusqlite::{params, Connection, OptionalExtension}; + use std::fs; + use std::path::Path; + use std::time::{SystemTime, UNIX_EPOCH}; + + /// Schema: stores digest, file modification time (secs since epoch) and + /// last time we *fully* scanned the file. + const SCHEMA: &str = r#" + CREATE TABLE IF NOT EXISTS files ( + path TEXT PRIMARY KEY, + hash BLOB NOT NULL, + mtime INTEGER NOT NULL, + scanned_at INTEGER NOT NULL + );"#; + + pub(crate) struct Indexer { + conn: Connection, + } + + impl Indexer { + pub fn new(database_path: &Path) -> Result> { + let conn = Connection::open(database_path)?; + conn.execute_batch(SCHEMA)?; + Ok(Self { conn }) + } + + /// Returns `true` if the caller should analyze the file, i.e., we have + /// never seen it or something changed (mtime or content hash). + pub fn should_scan(&self, path: &Path) -> Result> { + let meta = fs::metadata(path)?; + let mtime = meta.modified()?.duration_since(UNIX_EPOCH)?.as_secs() as i64; + + let digest = Self::digest_file(path)?; + + let row: Option<(Vec, i64)> = self + .conn + .query_row( + "SELECT hash, mtime FROM files WHERE path = ?1", + params![path.to_string_lossy()], + |r| Ok((r.get(0)?, r.get(1)?)), + ) + .optional()?; + + match row { + Some((stored_hash, stored_mtime)) => { + Ok(stored_hash != digest || stored_mtime != mtime) + } + None => Ok(true), + } + } + + /// Persist a fresh scan result. + pub fn record_scan(&self, path: &Path) -> Result<(), Box> { + let meta = fs::metadata(path)?; + let mtime = meta.modified()?.duration_since(UNIX_EPOCH)?.as_secs() as i64; + let scanned_at = SystemTime::now() + .duration_since(UNIX_EPOCH)? + .as_secs() as i64; + let digest = Self::digest_file(path)?; + + self.conn.execute( + "REPLACE INTO files (path, hash, mtime, scanned_at) VALUES (?1, ?2, ?3, ?4)", + params![path.to_string_lossy(), digest, mtime, scanned_at], + )?; + Ok(()) + } + + fn digest_file(path: &Path) -> Result, Box> { + let mut hasher = Hasher::new(); + let mut file = fs::File::open(path)?; + std::io::copy(&mut file, &mut hasher)?; + Ok(hasher.finalize().as_bytes().to_vec()) + } + } +} \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index 7473334e..dbe0276e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,8 @@ mod cli; mod commands; mod utils; +mod walk; +mod index; use crate::utils::Config; use cli::Cli; @@ -30,9 +32,9 @@ fn init_tracing() { // .json(); Registry::default() - .with(EnvFilter::from_default_env()) // obey RUST_LOG + .with(EnvFilter::from_default_env()) .with(fmt_layer) - .init(); // install as the global subscriber + .init(); } fn main() -> Result<(), Box> { diff --git a/src/utils/config.rs b/src/utils/config.rs index 32a97b2f..f8c92e87 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -14,6 +14,9 @@ pub struct ScannerConfig { /// Directories to exclude from scanning. TODO: IMPLEMENT pub excluded_directories: Vec, + + /// Excluded files + pub excluded_files: Vec, /// Whether to respect the global ignore file or not. TODO: IMPLEMENT pub read_global_ignore: bool, @@ -24,10 +27,10 @@ pub struct ScannerConfig { /// Whether to require a `.git` directory to respect gitignore files. TODO: IMPLEMENT pub require_git_to_read_vcsignore: bool, - /// Whether to limit the search to starting file system or not. TODO: IMPLEMENT + /// Whether to limit the search to starting file system or not. pub one_file_system: bool, - /// Whether to follow symlinks or not. TODO: IMPLEMENT + /// Whether to follow symlinks or not. pub follow_symlinks: bool, /// Whether to scan hidden files or not. TODO: IMPLEMENT @@ -50,6 +53,10 @@ impl Default for ScannerConfig { .into_iter() .map(str::to_owned) .collect(), + excluded_files: vec![] + .into_iter() + .map(str::to_owned) + .collect(), read_global_ignore: false, read_vcsignore: true, require_git_to_read_vcsignore: true, @@ -125,7 +132,7 @@ pub struct PerformanceConfig { pub prune: bool, // TODO: IMPLEMENT /// The maximum number of worker threads to use., or `None` to auto-detect. - pub worker_threads: Option, // TODO: IMPLEMENT + pub worker_threads: Option, // TODO: IMPLEMENT /// The maximum number of entries to index in a single chunk. pub index_chunk_size: u32, // TODO: IMPLEMENT diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 0dfa31d0..48b89867 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -2,5 +2,5 @@ pub mod project; pub mod config; // Re-export commonly used functions for convenience -pub use project::{get_project_info, sanitize_project_name}; +pub use project::{get_project_info}; pub use config::Config; \ No newline at end of file diff --git a/src/walk.rs b/src/walk.rs index 190777f2..95707b56 100644 --- a/src/walk.rs +++ b/src/walk.rs @@ -1,670 +1,95 @@ -// use std::borrow::Cow; -// use std::ffi::OsStr; -// use std::io::{self, Write}; -// use std::mem; -// use std::path::PathBuf; -// use std::sync::atomic::{AtomicBool, Ordering}; -// use std::sync::{Arc, Mutex, MutexGuard}; -// use std::thread; -// use std::time::{Duration, Instant}; -// -// use anyhow::{anyhow, Result}; -// use crossbeam_channel::{bounded, Receiver, RecvTimeoutError, SendError, Sender}; -// use etcetera::BaseStrategy; -// use ignore::overrides::{Override, OverrideBuilder}; -// use ignore::{WalkBuilder, WalkParallel, WalkState}; -// use regex::bytes::Regex; -// -// use crate::config::Config; -// use crate::dir_entry::DirEntry; -// use crate::error::print_error; -// use crate::exec; -// use crate::exit_codes::{merge_exitcodes, ExitCode}; -// use crate::filesystem; -// use crate::output; -// -// /// The receiver thread can either be buffering results or directly streaming to the console. -// #[derive(PartialEq)] -// enum ReceiverMode { -// /// Receiver is still buffering in order to sort the results, if the search finishes fast -// /// enough. -// Buffering, -// -// /// Receiver is directly printing results to the output. -// Streaming, -// } -// -// /// The Worker threads can result in a valid entry having PathBuf or an error. -// #[allow(clippy::large_enum_variant)] -// #[derive(Debug)] -// pub enum WorkerResult { -// // Errors should be rare, so it's probably better to allow large_enum_variant than -// // to box the Entry variant -// Entry(ignore::DirEntry), // TODO: CHECK IF ERRORS -// Error(ignore::Error), -// } -// -// /// A batch of WorkerResults to send over a channel. -// #[derive(Clone)] -// struct Batch { -// items: Arc>>>, -// } -// -// impl Batch { -// fn new() -> Self { -// Self { -// items: Arc::new(Mutex::new(Some(vec![]))), -// } -// } -// -// fn lock(&self) -> MutexGuard<'_, Option>> { -// self.items.lock().unwrap() -// } -// } -// -// impl IntoIterator for Batch { -// type Item = WorkerResult; -// type IntoIter = std::vec::IntoIter; -// -// fn into_iter(self) -> Self::IntoIter { -// self.lock().take().unwrap().into_iter() -// } -// } -// -// /// Wrapper that sends batches of items at once over a channel. -// struct BatchSender { -// batch: Batch, -// tx: Sender, -// limit: usize, -// } -// -// impl BatchSender { -// fn new(tx: Sender, limit: usize) -> Self { -// Self { -// batch: Batch::new(), -// tx, -// limit, -// } -// } -// -// /// Check if we need to flush a batch. -// fn needs_flush(&self, batch: Option<&Vec>) -> bool { -// match batch { -// // Limit the batch size to provide some backpressure -// Some(vec) => vec.len() >= self.limit, -// // Batch was already taken by the receiver, so make a new one -// None => true, -// } -// } -// -// /// Add an item to a batch. -// fn send(&mut self, item: WorkerResult) -> Result<(), SendError<()>> { -// let mut batch = self.batch.lock(); -// -// if self.needs_flush(batch.as_ref()) { -// drop(batch); -// self.batch = Batch::new(); -// batch = self.batch.lock(); -// } -// -// let items = batch.as_mut().unwrap(); -// items.push(item); -// -// if items.len() == 1 { -// // New batch, send it over the channel -// self.tx -// .send(self.batch.clone()) -// .map_err(|_| SendError(()))?; -// } -// -// Ok(()) -// } -// } -// -// /// Maximum size of the output buffer before flushing results to the console -// const MAX_BUFFER_LENGTH: usize = 1000; -// /// Default duration until output buffering switches to streaming. -// const DEFAULT_MAX_BUFFER_TIME: Duration = Duration::from_millis(100); -// -// /// Wrapper for the receiver thread's buffering behavior. -// struct ReceiverBuffer<'a, W> { -// /// The configuration. -// config: &'a Config, -// /// For shutting down the senders. -// quit_flag: &'a AtomicBool, -// /// The ^C notifier. -// interrupt_flag: &'a AtomicBool, -// /// Receiver for worker results. -// rx: Receiver, -// /// Standard output. -// stdout: W, -// /// The current buffer mode. -// mode: ReceiverMode, -// /// The deadline to switch to streaming mode. -// deadline: Instant, -// /// The buffer of quickly received paths. -// buffer: Vec, -// /// Result count. -// num_results: usize, -// } -// -// impl<'a, W: Write> ReceiverBuffer<'a, W> { -// /// Create a new receiver buffer. -// fn new(state: &'a WorkerState, rx: Receiver, stdout: W) -> Self { -// let config = &state.config; -// let quit_flag = state.quit_flag.as_ref(); -// let interrupt_flag = state.interrupt_flag.as_ref(); -// let max_buffer_time = config.max_buffer_time.unwrap_or(DEFAULT_MAX_BUFFER_TIME); -// let deadline = Instant::now() + max_buffer_time; -// -// Self { -// config, -// quit_flag, -// interrupt_flag, -// rx, -// stdout, -// mode: ReceiverMode::Buffering, -// deadline, -// buffer: Vec::with_capacity(MAX_BUFFER_LENGTH), -// num_results: 0, -// } -// } -// -// /// Process results until finished. -// fn process(&mut self) -> ExitCode { -// loop { -// if let Err(ec) = self.poll() { -// self.quit_flag.store(true, Ordering::Relaxed); -// return ec; -// } -// } -// } -// -// /// Receive the next worker result. -// fn recv(&self) -> Result { -// match self.mode { -// ReceiverMode::Buffering => { -// // Wait at most until we should switch to streaming -// self.rx.recv_deadline(self.deadline) -// } -// ReceiverMode::Streaming => { -// // Wait however long it takes for a result -// Ok(self.rx.recv()?) -// } -// } -// } -// -// /// Wait for a result or state change. -// fn poll(&mut self) -> Result<(), ExitCode> { -// match self.recv() { -// Ok(batch) => { -// for result in batch { -// match result { -// WorkerResult::Entry(dir_entry) => { -// if self.config.quiet { -// return Err(ExitCode::HasResults(true)); -// } -// -// match self.mode { -// ReceiverMode::Buffering => { -// self.buffer.push(dir_entry); -// if self.buffer.len() > MAX_BUFFER_LENGTH { -// self.stream()?; -// } -// } -// ReceiverMode::Streaming => { -// self.print(&dir_entry)?; -// } -// } -// -// self.num_results += 1; -// if let Some(max_results) = self.config.max_results { -// if self.num_results >= max_results { -// return self.stop(); -// } -// } -// } -// WorkerResult::Error(err) => { -// if self.config.show_filesystem_errors { -// print_error(err.to_string()); -// } -// } -// } -// } -// -// // If we don't have another batch ready, flush before waiting -// if self.mode == ReceiverMode::Streaming && self.rx.is_empty() { -// self.flush()?; -// } -// } -// Err(RecvTimeoutError::Timeout) => { -// self.stream()?; -// } -// Err(RecvTimeoutError::Disconnected) => { -// return self.stop(); -// } -// } -// -// Ok(()) -// } -// -// /// Output a path. -// fn print(&mut self, entry: &DirEntry) -> Result<(), ExitCode> { -// if let Err(e) = output::print_entry(&mut self.stdout, entry, self.config) { -// if e.kind() != ::std::io::ErrorKind::BrokenPipe { -// print_error(format!("Could not write to output: {e}")); -// return Err(ExitCode::GeneralError); -// } -// } -// -// if self.interrupt_flag.load(Ordering::Relaxed) { -// // Ignore any errors on flush, because we're about to exit anyway -// let _ = self.flush(); -// return Err(ExitCode::KilledBySigint); -// } -// -// Ok(()) -// } -// -// /// Switch ourselves into streaming mode. -// fn stream(&mut self) -> Result<(), ExitCode> { -// self.mode = ReceiverMode::Streaming; -// -// let buffer = mem::take(&mut self.buffer); -// for path in buffer { -// self.print(&path)?; -// } -// -// self.flush() -// } -// -// /// Stop looping. -// fn stop(&mut self) -> Result<(), ExitCode> { -// if self.mode == ReceiverMode::Buffering { -// self.buffer.sort(); -// self.stream()?; -// } -// -// if self.config.quiet { -// Err(ExitCode::HasResults(self.num_results > 0)) -// } else { -// Err(ExitCode::Success) -// } -// } -// -// /// Flush stdout if necessary. -// fn flush(&mut self) -> Result<(), ExitCode> { -// if self.stdout.flush().is_err() { -// // Probably a broken pipe. Exit gracefully. -// return Err(ExitCode::GeneralError); -// } -// Ok(()) -// } -// } -// -// /// State shared by the sender and receiver threads. -// struct WorkerState { -// /// The search patterns. -// patterns: Vec, -// /// The command line configuration. -// config: Config, -// /// Flag for cleanly shutting down the parallel walk -// quit_flag: Arc, -// /// Flag specifically for quitting due to ^C -// interrupt_flag: Arc, -// } -// -// impl WorkerState { -// fn new(patterns: Vec, config: Config) -> Self { -// let quit_flag = Arc::new(AtomicBool::new(false)); -// let interrupt_flag = Arc::new(AtomicBool::new(false)); -// -// Self { -// patterns, -// config, -// quit_flag, -// interrupt_flag, -// } -// } -// -// fn build_overrides(&self, paths: &[PathBuf]) -> Result { -// let first_path = &paths[0]; -// let config = &self.config; -// -// let mut builder = OverrideBuilder::new(first_path); -// -// for pattern in &config.exclude_patterns { -// builder -// .add(pattern) -// .map_err(|e| anyhow!("Malformed exclude pattern: {}", e))?; -// } -// -// builder -// .build() -// .map_err(|_| anyhow!("Mismatch in exclude patterns")) -// } -// -// fn build_walker(&self, paths: &[PathBuf]) -> Result { -// let first_path = &paths[0]; -// let config = &self.config; -// let overrides = self.build_overrides(paths)?; -// -// let mut builder = WalkBuilder::new(first_path); -// builder -// .hidden(config.ignore_hidden) -// .ignore(config.read_fdignore) -// .parents(config.read_parent_ignore && (config.read_fdignore || config.read_vcsignore)) -// .git_ignore(config.read_vcsignore) -// .git_global(config.read_vcsignore) -// .git_exclude(config.read_vcsignore) -// .require_git(config.require_git_to_read_vcsignore) -// .overrides(overrides) -// .follow_links(config.follow_links) -// // No need to check for supported platforms, option is unavailable on unsupported ones -// .same_file_system(config.one_file_system) -// .max_depth(config.max_depth); -// -// if config.read_fdignore { -// builder.add_custom_ignore_filename(".fdignore"); -// } -// -// if config.read_global_ignore { -// if let Ok(basedirs) = etcetera::choose_base_strategy() { -// let global_ignore_file = basedirs.config_dir().join("fd").join("ignore"); -// if global_ignore_file.is_file() { -// let result = builder.add_ignore(global_ignore_file); -// match result { -// Some(ignore::Error::Partial(_)) => (), -// Some(err) => { -// print_error(format!("Malformed pattern in global ignore file. {err}.")); -// } -// None => (), -// } -// } -// } -// } -// -// for ignore_file in &config.ignore_files { -// let result = builder.add_ignore(ignore_file); -// match result { -// Some(ignore::Error::Partial(_)) => (), -// Some(err) => { -// print_error(format!("Malformed pattern in custom ignore file. {err}.")); -// } -// None => (), -// } -// } -// -// for path in &paths[1..] { -// builder.add(path); -// } -// -// let walker = builder.threads(config.threads).build_parallel(); -// Ok(walker) -// } -// -// /// Run the receiver work, either on this thread or a pool of background -// /// threads (for --exec). -// fn receive(&self, rx: Receiver) -> ExitCode { -// let config = &self.config; -// -// // This will be set to `Some` if the `--exec` argument was supplied. -// if let Some(ref cmd) = config.command { -// if cmd.in_batch_mode() { -// exec::batch(rx.into_iter().flatten(), cmd, config) -// } else { -// let out_perm = Mutex::new(()); -// -// thread::scope(|scope| { -// // Each spawned job will store its thread handle in here. -// let threads = config.threads; -// let mut handles = Vec::with_capacity(threads); -// for _ in 0..threads { -// let rx = rx.clone(); -// -// // Spawn a job thread that will listen for and execute inputs. -// let handle = scope -// .spawn(|| exec::job(rx.into_iter().flatten(), cmd, &out_perm, config)); -// -// // Push the handle of the spawned thread into the vector for later joining. -// handles.push(handle); -// } -// let exit_codes = handles.into_iter().map(|handle| handle.join().unwrap()); -// merge_exitcodes(exit_codes) -// }) -// } -// } else { -// let stdout = io::stdout().lock(); -// let stdout = io::BufWriter::new(stdout); -// -// ReceiverBuffer::new(self, rx, stdout).process() -// } -// } -// -// /// Spawn the sender threads. -// fn spawn_senders(&self, walker: WalkParallel, tx: Sender) { -// walker.run(|| { -// let patterns = &self.patterns; -// let config = &self.config; -// let quit_flag = self.quit_flag.as_ref(); -// -// let mut limit = 0x100; -// if let Some(cmd) = &config.command { -// if !cmd.in_batch_mode() && config.threads > 1 { -// // Evenly distribute work between multiple receivers -// limit = 1; -// } -// } -// let mut tx = BatchSender::new(tx.clone(), limit); -// -// Box::new(move |entry| { -// if quit_flag.load(Ordering::Relaxed) { -// return WalkState::Quit; -// } -// -// let entry = match entry { -// Ok(ref e) if e.depth() == 0 => { -// // Skip the root directory entry. -// return WalkState::Continue; -// } -// Ok(e) => DirEntry::normal(e), -// Err(ignore::Error::WithPath { -// path, -// err: inner_err, -// }) => match inner_err.as_ref() { -// ignore::Error::Io(io_error) -// if io_error.kind() == io::ErrorKind::NotFound -// && path -// .symlink_metadata() -// .ok() -// .is_some_and(|m| m.file_type().is_symlink()) => -// { -// DirEntry::broken_symlink(path) -// } -// _ => { -// return match tx.send(WorkerResult::Error(ignore::Error::WithPath { -// path, -// err: inner_err, -// })) { -// Ok(_) => WalkState::Continue, -// Err(_) => WalkState::Quit, -// } -// } -// }, -// Err(err) => { -// return match tx.send(WorkerResult::Error(err)) { -// Ok(_) => WalkState::Continue, -// Err(_) => WalkState::Quit, -// } -// } -// }; -// -// if let Some(min_depth) = config.min_depth { -// if entry.depth().map_or(true, |d| d < min_depth) { -// return WalkState::Continue; -// } -// } -// -// // Check the name first, since it doesn't require metadata -// let entry_path = entry.path(); -// -// let search_str: Cow = if config.search_full_path { -// let path_abs_buf = filesystem::path_absolute_form(entry_path) -// .expect("Retrieving absolute path succeeds"); -// Cow::Owned(path_abs_buf.as_os_str().to_os_string()) -// } else { -// match entry_path.file_name() { -// Some(filename) => Cow::Borrowed(filename), -// None => unreachable!( -// "Encountered file system entry without a file name. This should only \ -// happen for paths like 'foo/bar/..' or '/' which are not supposed to \ -// appear in a file system traversal." -// ), -// } -// }; -// -// if !patterns -// .iter() -// .all(|pat| pat.is_match(&filesystem::osstr_to_bytes(search_str.as_ref()))) -// { -// return WalkState::Continue; -// } -// -// // Filter out unwanted extensions. -// if let Some(ref exts_regex) = config.extensions { -// if let Some(path_str) = entry_path.file_name() { -// if !exts_regex.is_match(&filesystem::osstr_to_bytes(path_str)) { -// return WalkState::Continue; -// } -// } else { -// return WalkState::Continue; -// } -// } -// -// // Filter out unwanted file types. -// if let Some(ref file_types) = config.file_types { -// if file_types.should_ignore(&entry) { -// return WalkState::Continue; -// } -// } -// -// #[cfg(unix)] -// { -// if let Some(ref owner_constraint) = config.owner_constraint { -// if let Some(metadata) = entry.metadata() { -// if !owner_constraint.matches(metadata) { -// return WalkState::Continue; -// } -// } else { -// return WalkState::Continue; -// } -// } -// } -// -// // Filter out unwanted sizes if it is a file and we have been given size constraints. -// if !config.size_constraints.is_empty() { -// if entry_path.is_file() { -// if let Some(metadata) = entry.metadata() { -// let file_size = metadata.len(); -// if config -// .size_constraints -// .iter() -// .any(|sc| !sc.is_within(file_size)) -// { -// return WalkState::Continue; -// } -// } else { -// return WalkState::Continue; -// } -// } else { -// return WalkState::Continue; -// } -// } -// -// // Filter out unwanted modification times -// if !config.time_constraints.is_empty() { -// let mut matched = false; -// if let Some(metadata) = entry.metadata() { -// if let Ok(modified) = metadata.modified() { -// matched = config -// .time_constraints -// .iter() -// .all(|tf| tf.applies_to(&modified)); -// } -// } -// if !matched { -// return WalkState::Continue; -// } -// } -// -// if config.is_printing() { -// if let Some(ls_colors) = &config.ls_colors { -// // Compute colors in parallel -// entry.style(ls_colors); -// } -// } -// -// let send_result = tx.send(WorkerResult::Entry(entry)); -// -// if send_result.is_err() { -// return WalkState::Quit; -// } -// -// // Apply pruning. -// if config.prune { -// return WalkState::Skip; -// } -// -// WalkState::Continue -// }) -// }); -// } -// -// /// Perform the recursive scan. -// fn scan(&self, paths: &[PathBuf]) -> Result { -// let config = &self.config; -// let walker = self.build_walker(paths)?; -// -// if config.ls_colors.is_some() && config.is_printing() { -// let quit_flag = Arc::clone(&self.quit_flag); -// let interrupt_flag = Arc::clone(&self.interrupt_flag); -// -// ctrlc::set_handler(move || { -// quit_flag.store(true, Ordering::Relaxed); -// -// if interrupt_flag.fetch_or(true, Ordering::Relaxed) { -// // Ctrl-C has been pressed twice, exit NOW -// ExitCode::KilledBySigint.exit(); -// } -// }) -// .unwrap(); -// } -// -// let (tx, rx) = bounded(2 * config.threads); -// -// let exit_code = thread::scope(|scope| { -// // Spawn the receiver thread(s) -// let receiver = scope.spawn(|| self.receive(rx)); -// -// // Spawn the sender threads. -// self.spawn_senders(walker, tx); -// -// receiver.join().unwrap() -// }); -// -// if self.interrupt_flag.load(Ordering::Relaxed) { -// Ok(ExitCode::KilledBySigint) -// } else { -// Ok(exit_code) -// } -// } -// } -// -// /// Recursively scan the given search path for files / pathnames matching the patterns. -// /// -// /// If the `--exec` argument was supplied, this will create a thread pool for executing -// /// jobs in parallel from a given command line and the discovered paths. Otherwise, each -// /// path will simply be written to standard output. -// pub fn scan(paths: &[PathBuf], patterns: Vec, config: Config) -> Result { -// WorkerState::new(patterns, config).scan(paths) -// } \ No newline at end of file +use crossbeam_channel::{bounded, Receiver}; +use ignore::{WalkBuilder, WalkState}; +use std::{path::{Path, PathBuf}, thread}; +use ignore::overrides::OverrideBuilder; +use crate::utils::Config; + +const BATCH_SIZE: usize = 5; + +type Batch = Vec; + +struct Batcher { + tx: crossbeam_channel::Sender, + batch: Batch, +} + +impl Batcher { + fn push(&mut self, p: PathBuf) { + self.batch.push(p); + if self.batch.len() == BATCH_SIZE { + self.flush(); + } + } + fn flush(&mut self) { + if !self.batch.is_empty() { + let _ = self.tx.send(std::mem::take(&mut self.batch)); + } + } +} + +impl Drop for Batcher { + fn drop(&mut self) { + // guarantees the remainder is sent when the worker is dropped + self.flush(); + } +} + + +/// Walk `root`, send file paths to the returned receiver. +pub fn spawn_senders( + root: &Path, + cfg: &Config +) -> Receiver { + let mut ob = OverrideBuilder::new(root); + + for ext in &cfg.scanner.excluded_extensions { + ob.add(&format!("!*.{ext}")).unwrap(); + } + + for dir in &cfg.scanner.excluded_directories { + ob.add(&format!("!**/{dir}/**")).unwrap(); + } + + let overrides = ob.build().unwrap(); + let worker_thrs = cfg.performance.worker_threads.unwrap_or(num_cpus::get()); + + let (tx, rx) = bounded::(worker_thrs * 2usize); + + let root = root.to_path_buf(); + let scan_hidden = cfg.scanner.scan_hidden_files; + let follow_links = cfg.scanner.follow_symlinks; + + thread::spawn(move || { + let walker = WalkBuilder::new(root) + .hidden(!scan_hidden) + .follow_links(follow_links) + .threads(worker_thrs) + .overrides(overrides) + .build_parallel(); + + walker.run(move || { + let tx = tx.clone(); + let mut batch = Vec::::with_capacity(256); + + Box::new(move |entry| { + tracing::info!("walking: {:?}", entry); + + let mut b = Batcher { tx: tx.clone(), batch: Vec::with_capacity(BATCH_SIZE) }; + match entry { + Ok(e) if e.file_type().map_or(false, |ft| ft.is_file()) => { + b.push(e.into_path()); + if batch.len() == BATCH_SIZE { + let _ = tx.send(std::mem::take(&mut batch)); + } + } + Err(err) => eprintln!("walk error: {err}"), + _ => {} + } + WalkState::Continue + }) + }); + + }); + + rx +}