From 0a66a0ae2dfea7a8ea4b2b42c476b83659356041 Mon Sep 17 00:00:00 2001 From: elipeter Date: Mon, 23 Jun 2025 20:27:16 +0200 Subject: [PATCH] Add error handling with `NyxError` and refactor console output formatting - Introduced `NyxError` and `NyxResult` for unified error handling across modules. - Refactored `scan.rs`, `index.rs`, and `walk.rs` with improved error management and consistent formatting. - Replaced existing error handling in `database.rs` with `NyxResult`. - Improved database maintenance by integrating `vacuum` and `clear` methods into workflows. - Added `dashmap` for efficient parallel diagnostics result aggregation in `scan_with_index_parallel`. - Enhanced readability and formatting of console outputs in multiple modules. --- Cargo.lock | 16 ++++ Cargo.toml | 2 + src/commands/index.rs | 3 +- src/commands/scan.rs | 163 ++++++++++++--------------------------- src/database.rs | 28 ++++--- src/errors.rs | 24 ++++++ src/file.rs | 74 ++++++++++++++++++ src/main.rs | 11 ++- src/patterns/mod.rs | 68 ++++++++++++---- src/utils/ext.rs | 15 ++++ src/utils/mod.rs | 1 + src/utils/project.rs | 29 +++---- src/utils/query_cache.rs | 38 +++++---- src/walk.rs | 128 +++++++++++++++--------------- 14 files changed, 360 insertions(+), 240 deletions(-) create mode 100644 src/errors.rs create mode 100644 src/file.rs create mode 100644 src/utils/ext.rs diff --git a/Cargo.lock b/Cargo.lock index 7f93f480..1f6b6aef 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -267,6 +267,20 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "dashmap" +version = "7.0.0-rc2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4a1e35a65fe0538a60167f0ada6e195ad5d477f6ddae273943596d4a1a5730b" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "equivalent", + "hashbrown", + "lock_api", + "parking_lot_core", +] + [[package]] name = "deranged" version = "0.4.0" @@ -579,6 +593,7 @@ dependencies = [ "clap", "console", "crossbeam-channel", + "dashmap", "directories", "ignore", "num_cpus", @@ -588,6 +603,7 @@ dependencies = [ "rayon", "rusqlite", "serde", + "thiserror", "toml", "tracing", "tracing-subscriber", diff --git a/Cargo.toml b/Cargo.toml index 79222ea2..efb7b5a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,3 +32,5 @@ r2d2_sqlite = "0.30.0" r2d2 = "0.8.10" bytesize = "2.0.1" chrono = { version = "0.4.41", default-features = false, features = ["std", "clock"] } +thiserror = "2.0.12" +dashmap = "7.0.0-rc2" diff --git a/src/commands/index.rs b/src/commands/index.rs index cf47bab4..f6a39521 100644 --- a/src/commands/index.rs +++ b/src/commands/index.rs @@ -24,8 +24,10 @@ pub fn handle( if force || !db_path.exists() { build_index(&project_name, &build_path, &db_path, config)?; println!("✔ {} {}", style("Index built:" ).green(), style(db_path.display()).white().bold()); + exit(0); } else { println!("{} {}", style("↩ Index already exists").yellow(), style("(use --force to rebuild)").dim()); + exit(0); } } IndexAction::Status { path } => { @@ -48,7 +50,6 @@ pub fn handle( exit(0); } } - Ok(()) } pub fn build_index( diff --git a/src/commands/scan.rs b/src/commands/scan.rs index d315ecd2..63219a91 100644 --- a/src/commands/scan.rs +++ b/src/commands/scan.rs @@ -7,11 +7,12 @@ use r2d2_sqlite::SqliteConnectionManager; use crate::database::index::{IssueRow, Indexer}; use crate::patterns::Severity; use crate::utils::config::Config; -use crate::utils::query_cache; use crate::walk::spawn_senders; use rayon::prelude::*; use std::collections::BTreeMap; -use tree_sitter::{Language, Parser, QueryCursor, StreamingIterator}; +use dashmap::DashMap; +use crate::errors::NyxResult; +pub(crate) use crate::file::run_rules_on_file; type DynError = Box; @@ -35,6 +36,8 @@ pub fn handle( ) -> Result<(), Box> { let scan_path = Path::new(path).canonicalize()?; let (project_name, db_path) = get_project_info(&scan_path, database_dir)?; + + println!("{} {}...\n", style("Checking").green().bold(), &project_name); let diags: Vec = if no_index { scan_filesystem(&scan_path, config)? @@ -58,25 +61,18 @@ pub fn handle( for d in &diags { grouped.entry(&d.path).or_default().push(d); } - - for (path, issues) in grouped { + + for (path, issues) in &grouped { println!("{}", style(path).blue().underlined()); for d in issues { - let sev_str = match d.severity { - Severity::High => style("HIGH").red().bold(), - Severity::Medium => style("MEDIUM").yellow().bold(), - Severity::Low => style("LOW").cyan().bold(), - }; - println!( - " {:>4}:{:<4} [{}] {}", - d.line, d.col, sev_str, style(&d.id).bold() - ); + println!(" {:>4}:{:<4} [{}] {}", + d.line, d.col, d.severity, style(&d.id).bold()); } - println!(); + println!(); } - println!("{} '{}' generated {} issues.", - style("warning").yellow().bold(), + println!("{} '{}' generated {} issues.", + style("warning").yellow().bold(), style(project_name).white().bold(), style(diags.len()).bold()); println!("\t"); // TODO: Add individual counts for different warning levels @@ -94,11 +90,11 @@ fn scan_filesystem( ) ->Result, Box> { let rx = spawn_senders(root, cfg); let acc = Mutex::new(Vec::new()); - + rx.into_iter() .flatten() .par_bridge() - .try_for_each(|path| { + .try_for_each(|path| { let mut local = run_rules_on_file(&path, cfg).unwrap(); acc.lock().unwrap().append(&mut local); Ok::<(), DynError>(()) @@ -107,113 +103,54 @@ fn scan_filesystem( Ok(acc.into_inner()?) } -fn scan_with_index_parallel( +pub fn scan_with_index_parallel( project: &str, pool: Arc>, cfg: &Config, -) -> Result, Box> { +) -> NyxResult> { + let files = { let idx = Indexer::from_pool(project, &pool)?; idx.get_files(project)? }; - let acc = Mutex::new(Vec::new()); + // ① Collect per-path Vec without a global mutex + let diag_map: DashMap> = DashMap::new(); files.into_par_iter() - .try_for_each(|path| -> Result<(), DynError> { - let mut idx = Indexer::from_pool(project, &pool).unwrap(); + .for_each_init( + // ② A single Indexer per Rayon worker thread + || Indexer::from_pool(project, &pool).expect("db pool"), + |idx, path| { + let needs_scan = idx.should_scan(&path).unwrap_or(true); - if idx.should_scan(&path).unwrap() { - let mut diags = run_rules_on_file(&path, cfg).unwrap(); - let file_id = idx.upsert_file(&path).unwrap(); - - let rows: Vec = diags.iter().map(|d| IssueRow { - rule_id: d.id.as_ref(), - severity: match d.severity { - Severity::High => "HIGH", - Severity::Medium => "MEDIUM", - Severity::Low => "LOW", - }, - line: d.line as i64, - col: d.col as i64, - }).collect(); - - idx.replace_issues(file_id, rows).unwrap(); - acc.lock().unwrap().append(&mut diags); - } else { - let mut cached = idx.get_issues_from_file(&path).unwrap(); - acc.lock().unwrap().append(&mut cached); + let mut diags = if needs_scan { + let d = run_rules_on_file(&path, cfg).unwrap_or_default(); + let file_id = idx.upsert_file(&path).unwrap(); + idx.replace_issues( + file_id, + d.iter().map(|d| IssueRow { + rule_id: &d.id, + severity: d.severity.as_db_str(), + line: d.line as i64, + col: d.col as i64, + }), + ).ok(); + d + } else { + idx.get_issues_from_file(&path).unwrap_or_default() + }; + if !diags.is_empty() { + diag_map.entry(path.to_string_lossy().to_string()) + .or_default() + .append(&mut diags); + } } - Ok(()) - }).unwrap(); + ); - { - let idx = Indexer::from_pool(project, &pool)?; - idx.vacuum()?; - } + // Optional, heavy: only vacuum on --rebuild-index + // if rebuild { idx.vacuum()?; } - Ok(acc.into_inner().unwrap()) + // Flatten + Ok(diag_map.into_iter().flat_map(|(_, v)| v).collect()) } - -// -------------------------------------------------------------------------------------------- -// Tree‑sitter‑based rule runner – returns a Vec -// -------------------------------------------------------------------------------------------- -pub(crate) fn run_rules_on_file( - path: &Path, - cfg: &Config, -) -> Result, Box> { - tracing::debug!("Running rules on {}", path.to_string_lossy()); - let bytes = std::fs::read(path)?; - - let mut parser = Parser::new(); - - let lang_key = match path - .extension() - .and_then(|s| s.to_str()) - .unwrap_or_default() - .to_ascii_lowercase() - .as_str() - { - "rs" => (Language::from(tree_sitter_rust::LANGUAGE), "rust"), - "c" => (Language::from(tree_sitter_c::LANGUAGE), "c"), - "cpp" | "c++" => (Language::from(tree_sitter_cpp::LANGUAGE), "cpp"), - "java" => (Language::from(tree_sitter_java::LANGUAGE), "java"), - "go" => (Language::from(tree_sitter_go::LANGUAGE), "go"), - "php" => (Language::from(tree_sitter_php::LANGUAGE_PHP), "php"), - "py" => (Language::from(tree_sitter_python::LANGUAGE), "python"), - "ts" | "tsx" => (Language::from(tree_sitter_typescript::LANGUAGE_TYPESCRIPT), "typescript"), - "js" => (Language::from(tree_sitter_javascript::LANGUAGE), "javascript"), - _ => return Ok(Vec::new()), - }; - let (ts_lang, lang_name) = lang_key; - - parser.set_language(&ts_lang)?; - let tree = parser.parse(&*bytes, None).ok_or("tree‑sitter failed")?; - let root = tree.root_node(); - - let compiled = query_cache::for_lang(lang_name, ts_lang); - let mut cursor = QueryCursor::new(); - let mut out = Vec::new(); - - for cq in &compiled { - if cfg.scanner.min_severity > cq.meta.severity { - tracing::debug!("Skipping rule {} because it's below the minimum severity", cq.meta.id); - continue; - } - let mut matches = cursor.matches(&cq.query, root, &*bytes); - while let Some(m) = matches.next() { - for cap in m.captures.iter().filter(|c| c.index == 0) { - let point = cap.node.start_position(); - tracing::debug!("Found match for rule {}", cq.meta.id); - out.push(Diag { - path: path.to_string_lossy().to_string(), - line: point.row + 1, - col: point.column + 1, - severity: cq.meta.severity, - id: String::from(cq.meta.id), - }); - } - } - } - Ok(out) -} \ No newline at end of file diff --git a/src/database.rs b/src/database.rs index 57b22c61..dfca9c78 100644 --- a/src/database.rs +++ b/src/database.rs @@ -10,6 +10,7 @@ pub mod index { use std::ops::Deref; use std::sync::Arc; use r2d2::{Pool, PooledConnection}; + use crate::errors::NyxResult; /// DB schema (foreign‑keys enabled). const SCHEMA: &str = r#" @@ -55,7 +56,7 @@ pub mod index { pub fn init( database_path: &Path, - ) -> Result>, Box> { + ) -> NyxResult>> { let flags = OpenFlags::SQLITE_OPEN_READ_WRITE | OpenFlags::SQLITE_OPEN_CREATE | OpenFlags::SQLITE_OPEN_FULL_MUTEX; @@ -73,7 +74,7 @@ pub mod index { pub fn from_pool( project: &str, pool: &Pool, - ) -> Result> { + ) -> NyxResult { let conn = pool.get()?; Ok(Self { conn, project: project.to_owned() }) } @@ -82,7 +83,7 @@ pub mod index { fn c(&self) -> &Connection { self.conn.deref() } /// Return true when the file *content* or *mtime* changed since the last scan. - pub fn should_scan(&self, path: &Path) -> Result> { + pub fn should_scan(&self, path: &Path) -> NyxResult { let meta = fs::metadata(path)?; let mtime = meta.modified()?.duration_since(UNIX_EPOCH)?.as_secs() as i64; let digest = Self::digest_file(path)?; @@ -103,7 +104,7 @@ pub mod index { } /// Insert or update the `files` row and return its id. - pub fn upsert_file(&self, path: &Path) -> Result> { + pub fn upsert_file(&self, path: &Path) -> NyxResult { let meta = fs::metadata(path)?; let mtime = meta.modified()?.duration_since(UNIX_EPOCH)?.as_secs() as i64; let scanned_at = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs() as i64; @@ -129,7 +130,7 @@ pub mod index { /// Replace all issues for `file_id` with the supplied set. pub fn replace_issues<'a>(&mut self, file_id: i64, issues: impl IntoIterator>) - -> Result<(), Box> { + -> NyxResult<()> { let tx = self.conn.transaction()?; tx.execute("DELETE FROM issues WHERE file_id = ?", params![file_id])?; @@ -150,7 +151,7 @@ pub mod index { pub fn get_issues_from_file( &self, path: &Path, - ) -> Result, Box> { + ) -> NyxResult> { let file_id: i64 = self.c().query_row( "SELECT id FROM files WHERE project = ?1 AND path = ?2", params![self.project, path.to_string_lossy()], @@ -178,7 +179,7 @@ pub mod index { } /// gets files from the database - pub fn get_files(&self, project: &str) -> Result, Box> { + pub fn get_files(&self, project: &str) -> NyxResult> { let mut stmt = self.c().prepare( "SELECT path FROM files @@ -190,8 +191,10 @@ pub mod index { Ok(file_iter.map(|p| p.map(PathBuf::from)).collect::>()?) } - /// Clears the tables to prep for a reindex - pub fn clear(&self) -> rusqlite::Result<()> { + // ------------------------------------------------------------------------- + // Maintenance utilities + // ------------------------------------------------------------------------- + pub fn clear(&self) -> NyxResult<()> { self.c().execute_batch( r#" PRAGMA foreign_keys = OFF; @@ -208,12 +211,15 @@ pub mod index { Ok(()) } - pub fn vacuum(&self) -> rusqlite::Result<()> { + pub fn vacuum(&self) -> NyxResult<()> { self.c().execute("VACUUM;", [])?; Ok(()) } - fn digest_file(path: &Path) -> Result, Box> { + // ------------------------------------------------------------------------- + // Helpers + // ------------------------------------------------------------------------- + fn digest_file(path: &Path) -> NyxResult> { let mut hasher = blake3::Hasher::new(); let mut file = fs::File::open(path)?; std::io::copy(&mut file, &mut hasher)?; diff --git a/src/errors.rs b/src/errors.rs new file mode 100644 index 00000000..89deda1f --- /dev/null +++ b/src/errors.rs @@ -0,0 +1,24 @@ +use thiserror::Error; + +pub type NyxResult = core::result::Result; + +#[derive(Debug, Error)] +pub enum NyxError { + #[error("I/O error: {0}")] + Io(#[from] std::io::Error), + + #[error("SQLite error: {0}")] + Sql(#[from] rusqlite::Error), + + #[error("tree-sitter error: {0}")] + TreeSitter(#[from] tree_sitter::LanguageError), + + #[error("connection-pool error: {0}")] + Pool(#[from] r2d2::Error), + + #[error("time error: {0}")] + Time(#[from] std::time::SystemTimeError), + + #[error("other: {0}")] + Other(String), +} \ No newline at end of file diff --git a/src/file.rs b/src/file.rs new file mode 100644 index 00000000..4855b5db --- /dev/null +++ b/src/file.rs @@ -0,0 +1,74 @@ +use std::cell::RefCell; +use std::path::Path; +use tree_sitter::{Language, QueryCursor, StreamingIterator}; +use crate::commands::scan::Diag; +use crate::errors::{NyxResult, NyxError}; +use crate::utils::{query_cache, Config}; +use crate::utils::ext::lowercase_ext; + +thread_local! { + static PARSER: RefCell = RefCell::new(tree_sitter::Parser::new()); +} + +pub(crate) fn run_rules_on_file( + path: &Path, + cfg: &Config, +) -> NyxResult> { + let bytes = std::fs::read(path)?; + + // Fast binary-file guard (skip if >1% NULs) + if bytes.iter().filter(|b| **b == 0).count() * 100 / bytes.len().max(1) > 1 { + return Ok(vec![]); + } + + let lang_name = match lowercase_ext(path) { + Some(l) => l, + None => return Ok(vec![]), + }; + + let ts_lang = match lang_name { + "rs" => Language::from(tree_sitter_rust::LANGUAGE), + "c" => Language::from(tree_sitter_c::LANGUAGE), + "cpp" => Language::from(tree_sitter_cpp::LANGUAGE), + "java"=> Language::from(tree_sitter_java::LANGUAGE), + "go" => Language::from(tree_sitter_go::LANGUAGE), + "php" => Language::from(tree_sitter_php::LANGUAGE_PHP), + "py" => Language::from(tree_sitter_python::LANGUAGE), + "ts" => Language::from(tree_sitter_typescript::LANGUAGE_TYPESCRIPT), + "js" => Language::from(tree_sitter_javascript::LANGUAGE), + _ => return Ok(vec![]), + }; + + let _tree = PARSER.with(|cell| { + let mut parser = cell.borrow_mut(); + parser.set_language(&ts_lang)?; + parser.parse(&*bytes, None) + .ok_or_else(|| NyxError::Other("tree-sitter failed".into())) + })?; + + let root = _tree.root_node(); + + let compiled = query_cache::for_lang(lang_name, ts_lang); + let mut cursor = QueryCursor::new(); + let mut out = Vec::new(); + + for cq in compiled.iter() { + if cfg.scanner.min_severity > cq.meta.severity { + continue; + } + let mut matches = cursor.matches(&cq.query, root, &*bytes); + while let Some(m) = matches.next() { + if let Some(cap) = m.captures.iter().find(|c| c.index == 0) { + let point = cap.node.start_position(); + out.push(Diag { + path: path.to_string_lossy().into_owned(), + line: point.row + 1, + col: point.column + 1, + severity: cq.meta.severity, + id: cq.meta.id.to_owned(), + }); + } + } + } + Ok(out) +} diff --git a/src/main.rs b/src/main.rs index 0dc7c82c..5488992f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,6 +4,8 @@ mod utils; mod walk; mod database; mod patterns; +mod errors; +mod file; use crate::utils::Config; use cli::Cli; @@ -59,10 +61,11 @@ fn main() -> Result<(), Box> { commands::handle_command(cli.command, database_dir, &mut config)?; - let elapsed: f32 = now.elapsed().as_millis() as f32 / 1000f32; - println!("{} in {} s.", - style("Finished").green().bold(), - style(elapsed).white().bold()); + println!( + "{} in {:.3}s.", + style("Finished").green().bold(), + now.elapsed().as_secs_f32() + ); Ok(()) } diff --git a/src/patterns/mod.rs b/src/patterns/mod.rs index 45370cb5..f9c25609 100644 --- a/src/patterns/mod.rs +++ b/src/patterns/mod.rs @@ -9,18 +9,63 @@ mod php; mod python; use std::collections::HashMap; +use std::fmt; use std::str::FromStr; +use console::style; use serde::{Deserialize, Serialize}; use once_cell::sync::Lazy; -/// How bad / noisy a pattern is considered. -#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd)] -pub enum Severity { - Low, - Medium, - High, +#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Serialize, Deserialize)] +pub enum Severity { High, Medium, Low } + +impl fmt::Display for Severity { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let s = match *self { + Severity::High => style("HIGH").red().bold().to_string(), + Severity::Medium => style("MEDIUM").yellow().bold().to_string(), + Severity::Low => style("LOW").cyan().bold().to_string(), + }; + f.write_str(&s) + } } +impl Severity { + /// Textual value stored in SQLite. + pub fn as_db_str(self) -> &'static str { + match self { + Severity::High => "HIGH", + Severity::Medium => "MEDIUM", + Severity::Low => "LOW", + } + } +} + +impl FromStr for Severity { // TODO: FIX + type Err = (); + + fn from_str(input: &str) -> Result { + match input.to_lowercase().as_str() { + "medium" => Ok(Severity::Medium), + "high" => Ok(Severity::High), + _ => Ok(Severity::Low), + } + } +} + +// /// How bad / noisy a pattern is considered. +// #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, PartialOrd)] +// pub enum Severity { +// Low, +// Medium, +// High, +// } +// +// impl Severity { +// pub(crate) fn as_db_str(&self) -> &str { +// todo!() +// } +// } + /// One AST pattern with a tree-sitter query and meta-data. #[derive(Debug, Clone, Serialize)] pub struct Pattern { @@ -34,17 +79,6 @@ pub struct Pattern { pub severity: Severity, } -impl FromStr for Severity { // TODO: FIX - type Err = (); - - fn from_str(input: &str) -> Result { - match input.to_lowercase().as_str() { - "medium" => Ok(Severity::Medium), - "high" => Ok(Severity::High), - _ => Ok(Severity::Low), - } - } -} /// Global, lazily-initialised registry: lang-name → pattern slice diff --git a/src/utils/ext.rs b/src/utils/ext.rs new file mode 100644 index 00000000..f46e4d22 --- /dev/null +++ b/src/utils/ext.rs @@ -0,0 +1,15 @@ +pub fn lowercase_ext(path: &std::path::Path) -> Option<&'static str> { + path.extension() + .and_then(|s| match s.to_str()? { + "rs" | "RS" => Some("rs"), + "c" => Some("c"), + "cpp" | "c++" => Some("cpp"), + "java" => Some("java"), + "go" => Some("go"), + "php" => Some("php"), + "py" | "PY" => Some("py"), + "ts" | "TSX" | "tsx" => Some("ts"), + "js" => Some("js"), + _ => None, + }) +} \ No newline at end of file diff --git a/src/utils/mod.rs b/src/utils/mod.rs index 080265c8..5149d181 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,6 +1,7 @@ pub mod project; pub mod config; pub(crate) mod query_cache; +pub(crate) mod ext; // Re-export commonly used functions for convenience pub use project::{get_project_info}; diff --git a/src/utils/project.rs b/src/utils/project.rs index 145ef4d0..e24c164d 100644 --- a/src/utils/project.rs +++ b/src/utils/project.rs @@ -1,27 +1,30 @@ use std::path::{Path, PathBuf}; +use crate::errors::{NyxError, NyxResult}; +/// Determine `.sqlite>`. pub fn get_project_info( - project_path: &Path, - config_dir: &Path, -) -> Result<(String, PathBuf), Box> { + project_path: &Path, + config_dir: &Path, +) -> NyxResult<(String, PathBuf)> { + let project_name = project_path .file_name() - .and_then(|name| name.to_str()) - .ok_or("Unable to determine project name")?; - + .and_then(|n| n.to_str()) + .ok_or_else(|| NyxError::Other("Unable to determine project name".into()))?; + let db_name = sanitize_project_name(project_name); let db_path = config_dir.join(format!("{}.sqlite", db_name)); - - Ok((project_name.to_string(), db_path)) + + Ok((project_name.to_owned(), db_path)) } -pub fn sanitize_project_name(name: &str) -> String { +pub fn sanitize_project_name(name: &str) -> String { name.to_lowercase() .chars() - .map(|c| match c { - ' ' | '\t' | '\n' | '\r' => '_', - c if c.is_alphanumeric() || c == '_' || c == '-' => c, - _ => '_' + .map(|c| match c { + ' ' | '\t' | '\n' | '\r' => '_', + c if c.is_alphanumeric() || c == '_' || c == '-' => c, + _ => '_', }) .collect::() .split('_') diff --git a/src/utils/query_cache.rs b/src/utils/query_cache.rs index e88bcd62..5e827914 100644 --- a/src/utils/query_cache.rs +++ b/src/utils/query_cache.rs @@ -1,5 +1,5 @@ use std::collections::HashMap; -use std::sync::{Arc, RwLock}; +use std::sync::RwLock; use once_cell::sync::Lazy; use tree_sitter::{Language, Query}; @@ -8,30 +8,36 @@ use crate::patterns::{self, Pattern}; #[derive(Clone)] pub struct CompiledQuery { - pub meta: Pattern, - pub query: Arc, + pub meta: Pattern, + pub query: std::sync::Arc, } -static CACHE: Lazy>>> = +static CACHE: Lazy>>>> = Lazy::new(|| RwLock::new(HashMap::new())); -pub fn for_lang(lang: &'static str, ts_lang: Language) -> Vec { - // fast-path read +/// Return **one shared Arc** to the per-language query set. +/// Cloning the `Arc` is O(1) and the underlying Vec lives for the +/// lifetime of the process. +pub fn for_lang(lang: &'static str, ts_lang: Language) -> std::sync::Arc> { + // fast path if let Some(v) = CACHE.read().unwrap().get(lang) { return v.clone(); } - // compile under write-lock exactly once - let patterns = patterns::load(lang); - let mut vec = Vec::with_capacity(patterns.len()); - - for p in patterns { + // slow path — compile + let patterns = patterns::load(lang); + let compiled: Vec<_> = patterns.into_iter().filter_map(|p| { match Query::new(&ts_lang, p.query) { - Ok(q) => vec.push(CompiledQuery { meta: p, query: Arc::new(q) }), - Err(e) => tracing::warn!(lang, id = p.id, "query compile error: {e}"), + Ok(q) => Some(CompiledQuery { meta: p, query: std::sync::Arc::new(q) }), + Err(e)=> { + tracing::warn!(lang, id = p.id, "query compile error: {e}"); + None + } } - } + }).collect(); - CACHE.write().unwrap().insert(lang, vec.clone()); - vec + let compiled = std::sync::Arc::new(compiled); + + let mut w = CACHE.write().unwrap(); + w.entry(lang).or_insert_with(|| compiled.clone()).clone() } \ No newline at end of file diff --git a/src/walk.rs b/src/walk.rs index e4763b99..9807400a 100644 --- a/src/walk.rs +++ b/src/walk.rs @@ -1,106 +1,104 @@ -use crossbeam_channel::{bounded, Receiver}; -use ignore::{WalkBuilder, WalkState}; -use std::{path::{Path, PathBuf}, thread}; -use ignore::overrides::OverrideBuilder; +use crossbeam_channel::{bounded, Receiver, Sender}; +use ignore::{overrides::OverrideBuilder, WalkBuilder, WalkState}; +use std::{ + mem, + path::{Path, PathBuf}, + thread, +}; + use crate::utils::Config; -const BATCH_SIZE: usize = 5; +// --------------------------------------------------------------------------- +// Internal constants / helpers +// --------------------------------------------------------------------------- +const DEFAULT_BATCH: usize = 8; // a tad larger for fewer sends +const CHANNEL_MULTIPLIER:usize = 4; // capacity = threads × this type Batch = Vec; -#[derive(Debug)] struct Batcher { - tx: crossbeam_channel::Sender, + tx: Sender, batch: Batch, } - impl Batcher { fn push(&mut self, p: PathBuf) { self.batch.push(p); - if self.batch.len() == BATCH_SIZE { + if self.batch.len() == DEFAULT_BATCH { self.flush(); } } fn flush(&mut self) { if !self.batch.is_empty() { - let _ = self.tx.send(std::mem::take(&mut self.batch)); + let _ = self.tx.send(mem::take(&mut self.batch)); } } } - impl Drop for Batcher { - fn drop(&mut self) { - // guarantees the remainder is sent when the worker is dropped - self.flush(); - } + fn drop(&mut self) { self.flush(); } } - -/// Walk `root`, send file paths to the returned receiver. -pub fn spawn_senders( - root: &Path, - cfg: &Config -) -> Receiver { +// --------------------------------------------------------------------------- +/// Walk `root` and send *batches* of paths through the returned channel. +pub fn spawn_senders(root: &Path, cfg: &Config) -> Receiver { + // ----- 1 build ignore/override rules ---------------------------------- let mut ob = OverrideBuilder::new(root); - for ext in &cfg.scanner.excluded_extensions { if let Err(e) = ob.add(&format!("!*.{ext}")) { - tracing::warn!("could not add ignore pattern: {e}"); + tracing::warn!("cannot add ignore pattern ‘{ext}’: {e}"); } } - for dir in &cfg.scanner.excluded_directories { if let Err(e) = ob.add(&format!("!**/{dir}/**")) { - tracing::warn!("could not add ignore pattern: {e}"); + tracing::warn!("cannot add ignore pattern ‘{dir}’: {e}"); } } - - let overrides = ob.build().unwrap(); - let worker_thrs = cfg.performance.worker_threads.unwrap_or(num_cpus::get()); - - let (tx, rx) = bounded::(worker_thrs * 2usize); - - let root = root.to_path_buf(); - let scan_hidden = cfg.scanner.scan_hidden_files; - let follow_links = cfg.scanner.follow_symlinks; - let max_bytes: u64 = (cfg.scanner.max_file_size_mb.unwrap_or(0)) * 1_048_576; + let overrides = ob.build().unwrap(); + // ----- 2 channel & thread pool parameters ----------------------------- + let workers = cfg.performance.worker_threads.unwrap_or(num_cpus::get()); + let (tx, rx) = bounded::(workers * CHANNEL_MULTIPLIER); + + let root = root.to_path_buf(); + let scan_hidden = cfg.scanner.scan_hidden_files; + let follow = cfg.scanner.follow_symlinks; + let max_bytes = cfg.scanner.max_file_size_mb.unwrap_or(0) as u64 * 1_048_576; + + // ----- 3 the background walker thread --------------------------------- thread::spawn(move || { - let walker = WalkBuilder::new(root) + WalkBuilder::new(root) .hidden(!scan_hidden) - .follow_links(follow_links) - .threads(worker_thrs) + .follow_links(follow) + .threads(workers) .overrides(overrides) - .build_parallel(); + .build_parallel() + .run(move || { + let mut b = Batcher { + tx: tx.clone(), + batch: Vec::with_capacity(DEFAULT_BATCH), + }; - walker.run(move || { - let mut batcher = Batcher { - tx: tx.clone(), - batch: Vec::with_capacity(BATCH_SIZE), - }; + Box::new(move |entry| { + let entry = match entry { + Ok(e) if e.file_type().map(|ft| ft.is_file()).unwrap_or(false) => e, + _ => return WalkState::Continue, + }; - Box::new(move |entry| { - tracing::debug!("walking: {:?}", entry); - let e = match entry { - Ok(e) if e.file_type().map(|ft| ft.is_file()).unwrap_or(false) => e, - _ => return WalkState::Continue, - }; - if max_bytes != 0 { - match e.metadata() { - Ok(m) if m.len() <= max_bytes => {}, - _ => return WalkState::Continue, - } - } - tracing::debug!("scanning file: {:?}", e); - batcher.push(e.into_path()); - if batcher.batch.len() == BATCH_SIZE { - let _ = batcher.tx.send(std::mem::take(&mut batcher.batch)); - } - WalkState::Continue - }) - }); + if max_bytes != 0 { + match entry.metadata() { + Ok(m) if m.len() > max_bytes => return WalkState::Continue, + Err(e) => { + tracing::debug!("metadata failed for {:?}: {e}", entry.path()); + return WalkState::Continue; + } + _ => {} + } + } + b.push(entry.into_path()); + WalkState::Continue + }) + }); }); - + rx }