2025-06-24 20:27:06 +02:00
|
|
|
|
use crossbeam_channel::{Receiver, Sender, bounded};
|
|
|
|
|
|
use ignore::{WalkBuilder, WalkState, overrides::OverrideBuilder};
|
2025-06-23 20:27:16 +02:00
|
|
|
|
use std::{
|
|
|
|
|
|
mem,
|
|
|
|
|
|
path::{Path, PathBuf},
|
|
|
|
|
|
thread,
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2025-06-16 23:47:50 +02:00
|
|
|
|
use crate::utils::Config;
|
|
|
|
|
|
|
2025-06-23 20:27:16 +02:00
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
// Internal constants / helpers
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
2025-06-16 23:47:50 +02:00
|
|
|
|
|
|
|
|
|
|
/// A group of file paths that travels through the channel as one message.
type Batch = Vec<PathBuf>;
|
|
|
|
|
|
|
|
|
|
|
|
struct Batcher {
|
2025-06-24 20:27:06 +02:00
|
|
|
|
tx: Sender<Batch>,
|
2025-06-16 23:47:50 +02:00
|
|
|
|
batch: Batch,
|
|
|
|
|
|
}
|
|
|
|
|
|
impl Batcher {
|
2025-06-24 20:27:06 +02:00
|
|
|
|
fn push(&mut self, p: PathBuf, batch_size: usize) {
|
2025-06-16 23:47:50 +02:00
|
|
|
|
self.batch.push(p);
|
2025-06-24 20:27:06 +02:00
|
|
|
|
if self.batch.len() == batch_size {
|
2025-06-16 23:47:50 +02:00
|
|
|
|
self.flush();
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
fn flush(&mut self) {
|
|
|
|
|
|
if !self.batch.is_empty() {
|
2025-06-23 20:27:16 +02:00
|
|
|
|
let _ = self.tx.send(mem::take(&mut self.batch));
|
2025-06-16 23:47:50 +02:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
impl Drop for Batcher {
|
2025-06-24 20:27:06 +02:00
|
|
|
|
fn drop(&mut self) {
|
|
|
|
|
|
self.flush();
|
|
|
|
|
|
}
|
2025-06-16 23:47:50 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
2025-06-23 20:27:16 +02:00
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
/// Walk `root` and send *batches* of paths through the returned channel.
|
|
|
|
|
|
pub fn spawn_senders(root: &Path, cfg: &Config) -> Receiver<Batch> {
|
|
|
|
|
|
// ----- 1 build ignore/override rules ----------------------------------
|
2025-06-16 23:47:50 +02:00
|
|
|
|
let mut ob = OverrideBuilder::new(root);
|
|
|
|
|
|
for ext in &cfg.scanner.excluded_extensions {
|
2025-06-17 02:22:14 +02:00
|
|
|
|
if let Err(e) = ob.add(&format!("!*.{ext}")) {
|
2025-06-23 20:27:16 +02:00
|
|
|
|
tracing::warn!("cannot add ignore pattern ‘{ext}’: {e}");
|
2025-06-17 02:22:14 +02:00
|
|
|
|
}
|
2025-06-16 23:47:50 +02:00
|
|
|
|
}
|
|
|
|
|
|
for dir in &cfg.scanner.excluded_directories {
|
2025-06-17 02:22:14 +02:00
|
|
|
|
if let Err(e) = ob.add(&format!("!**/{dir}/**")) {
|
2025-06-23 20:27:16 +02:00
|
|
|
|
tracing::warn!("cannot add ignore pattern ‘{dir}’: {e}");
|
2025-06-17 02:22:14 +02:00
|
|
|
|
}
|
2025-06-16 23:47:50 +02:00
|
|
|
|
}
|
2025-06-24 20:27:06 +02:00
|
|
|
|
let overrides = ob.build().unwrap();
|
2025-06-23 20:27:16 +02:00
|
|
|
|
|
|
|
|
|
|
// ----- 2 channel & thread pool parameters -----------------------------
|
2025-06-24 20:27:06 +02:00
|
|
|
|
let workers = cfg.performance.worker_threads.unwrap_or(num_cpus::get());
|
|
|
|
|
|
let (tx, rx) = bounded::<Batch>(workers * cfg.performance.channel_multiplier);
|
2025-06-16 23:47:50 +02:00
|
|
|
|
|
2025-06-24 20:27:06 +02:00
|
|
|
|
let root = root.to_path_buf();
|
2025-06-23 20:27:16 +02:00
|
|
|
|
let scan_hidden = cfg.scanner.scan_hidden_files;
|
2025-06-24 20:27:06 +02:00
|
|
|
|
let follow = cfg.scanner.follow_symlinks;
|
|
|
|
|
|
let max_bytes = cfg.scanner.max_file_size_mb.unwrap_or(0) * 1_048_576;
|
|
|
|
|
|
let batch_size = cfg.performance.batch_size;
|
2025-06-23 20:27:16 +02:00
|
|
|
|
|
|
|
|
|
|
// ----- 3 the background walker thread ---------------------------------
|
2025-06-16 23:47:50 +02:00
|
|
|
|
thread::spawn(move || {
|
2025-06-23 20:27:16 +02:00
|
|
|
|
WalkBuilder::new(root)
|
2025-06-24 20:27:06 +02:00
|
|
|
|
.hidden(!scan_hidden)
|
|
|
|
|
|
.follow_links(follow)
|
|
|
|
|
|
.threads(workers)
|
|
|
|
|
|
.overrides(overrides)
|
|
|
|
|
|
.build_parallel()
|
|
|
|
|
|
.run(move || {
|
|
|
|
|
|
let mut b = Batcher {
|
|
|
|
|
|
tx: tx.clone(),
|
|
|
|
|
|
batch: Vec::with_capacity(batch_size),
|
|
|
|
|
|
};
|
2025-06-23 18:25:10 +02:00
|
|
|
|
|
2025-06-24 20:27:06 +02:00
|
|
|
|
Box::new(move |entry| {
|
|
|
|
|
|
tracing::debug!("walking {:?}", entry);
|
|
|
|
|
|
let entry = match entry {
|
|
|
|
|
|
Ok(e) if e.file_type().map(|ft| ft.is_file()).unwrap_or(false) => e,
|
|
|
|
|
|
_ => return WalkState::Continue,
|
|
|
|
|
|
};
|
2025-06-23 18:25:10 +02:00
|
|
|
|
|
2025-06-24 20:27:06 +02:00
|
|
|
|
if max_bytes != 0 {
|
|
|
|
|
|
match entry.metadata() {
|
|
|
|
|
|
Ok(m) if m.len() > max_bytes => return WalkState::Continue,
|
|
|
|
|
|
Err(e) => {
|
|
|
|
|
|
tracing::debug!("metadata failed for {:?}: {e}", entry.path());
|
|
|
|
|
|
return WalkState::Continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
_ => {}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2025-06-24 18:06:02 +02:00
|
|
|
|
|
2025-06-24 20:27:06 +02:00
|
|
|
|
tracing::debug!("sending {:?}", entry);
|
|
|
|
|
|
b.push(entry.into_path(), batch_size);
|
|
|
|
|
|
WalkState::Continue
|
|
|
|
|
|
})
|
|
|
|
|
|
});
|
2025-06-16 23:47:50 +02:00
|
|
|
|
});
|
2025-06-23 20:27:16 +02:00
|
|
|
|
|
2025-06-16 23:47:50 +02:00
|
|
|
|
rx
|
|
|
|
|
|
}
|
2025-06-24 23:38:32 +02:00
|
|
|
|
|
|
|
|
|
|
/// Files whose extension is listed in `excluded_extensions` must not be
/// reported by the walker, while other files still are.
#[test]
fn walker_respects_excluded_extensions() {
    // Fixture: one file that should be reported and one that should not.
    let dir = tempfile::tempdir().unwrap();
    let root = dir.path();
    std::fs::write(root.join("keep.rs"), "fn main(){}").unwrap();
    std::fs::write(root.join("skip.txt"), "ignored").unwrap();

    let mut config = Config::default();
    config.scanner.excluded_extensions = vec!["txt".into()];
    config.performance.worker_threads = Some(1);
    config.performance.channel_multiplier = 1;
    config.performance.batch_size = 2;

    // Drain the channel; the loop ends once all senders have been dropped.
    let mut found = Vec::new();
    for batch in spawn_senders(root, &config) {
        found.extend(batch);
    }

    assert!(found.iter().any(|p| p.ends_with("keep.rs")));
    assert!(!found.iter().any(|p| p.ends_with("skip.txt")));
}
|