Feat/configurable sanitizers and js precision (#32)

* chore: Exclude CLAUDE.md from Cargo.toml
* feat: Add configurable analysis rules and CLI commands for custom sanitizers and terminators
* feat: Enhance resource management and analysis efficiency
  - Implemented parallel summary merging in `scan_filesystem` using rayon for improved performance.
  - Introduced `GlobalSummaries::merge()` for efficient merging of summaries.
  - Optimized file reading and hashing to eliminate redundant I/O operations.
  - Added `should_scan_with_hash()` and `upsert_file_with_hash()` methods to streamline file processing.
  - Enhanced taint analysis with in-place mutations to reduce memory allocations.
  - Updated resource acquisition patterns to exclude false positives for `freopen` and wrapper functions.
* feat: Implement severity downgrade for findings in non-production paths and add source kind inference
* feat: Update versioning information in SECURITY.md for new stable line
* feat: Update categories in Cargo.toml to include parser-implementations and text-processing
* feat: Update dependencies in Cargo.lock for improved compatibility and performance
* feat: Update dependencies in Cargo.lock and Cargo.toml for improved compatibility
2026-06-12 19:55:14 +02:00 · 2026-02-25 04:02:11 -05:00 · 2026-02-25 04:02:11 -05:00 · 19b578c5c4
commit 19b578c5c4
parent f96a89e7c1
37 changed files with 3775 additions and 432 deletions
--- a/src/database.rs
+++ b/src/database.rs
@ -68,9 +68,13 @@ pub mod index {
    impl Indexer {
        pub fn init(database_path: &Path) -> NyxResult<Arc<Pool<SqliteConnectionManager>>> {
            let _span = tracing::info_span!("db_init", path = %database_path.display()).entered();
+            // NO_MUTEX is safe because r2d2 ensures each pooled connection
+            // is only ever used by one thread at a time.  Combined with WAL
+            // mode this allows concurrent readers + a single writer without
+            // the global serialization that FULL_MUTEX causes.
            let flags = OpenFlags::SQLITE_OPEN_READ_WRITE
                | OpenFlags::SQLITE_OPEN_CREATE
-                | OpenFlags::SQLITE_OPEN_FULL_MUTEX;
+                | OpenFlags::SQLITE_OPEN_NO_MUTEX;
            let manager = SqliteConnectionManager::file(database_path).with_flags(flags);
            let pool = Arc::new(Pool::new(manager)?);

@ -132,10 +136,13 @@ pub mod index {
        }

        /// Return true when the file *content* or *mtime* changed since the last scan.
+        ///
+        /// Short-circuits on mtime: if the stored mtime differs from the
+        /// filesystem mtime, the file is re-scanned without hashing it.
+        #[allow(dead_code)] // used in tests; should_scan_with_hash callers may fall back to it
        pub fn should_scan(&self, path: &Path) -> NyxResult<bool> {
            let meta = fs::metadata(path)?;
            let mtime = meta.modified()?.duration_since(UNIX_EPOCH)?.as_secs() as i64;
-            let digest = Self::digest_file(path)?;

            let row: Option<(Vec<u8>, i64)> = self
                .conn
@ -147,18 +154,56 @@ pub mod index {
                .optional()?;

            Ok(match row {
-                Some((stored_hash, stored_mtime)) => stored_hash != digest || stored_mtime != mtime,
+                Some((stored_hash, stored_mtime)) => {
+                    if stored_mtime != mtime {
+                        // mtime changed — must re-scan
+                        true
+                    } else {
+                        // mtime matches — fall back to comparing the
+                        // content hash (a caller that already read the file
+                        // can use should_scan_with_hash to avoid this re-read)
+                        let digest = Self::digest_file(path)?;
+                        stored_hash != digest
+                    }
+                }
+                None => true,
+            })
+        }
+
+        /// Like [`should_scan`] but compares only a pre-computed hash
+        /// (mtime is not consulted), avoiding redundant file reads.
+        pub fn should_scan_with_hash(&self, path: &Path, hash: &[u8]) -> NyxResult<bool> {
+            let row: Option<Vec<u8>> = self
+                .conn
+                .query_row(
+                    "SELECT hash FROM files WHERE project = ?1 AND path = ?2",
+                    params![self.project, path.to_string_lossy()],
+                    |r| r.get(0),
+                )
+                .optional()?;
+
+            Ok(match row {
+                Some(stored_hash) => stored_hash != hash,
                None => true,
            })
        }

        /// Insert or update the `files` row and return its id.
        pub fn upsert_file(&self, path: &Path) -> NyxResult<i64> {
+            let bytes = fs::read(path)?;
+            let hash = Self::digest_bytes(&bytes);
+            self.upsert_file_with_hash(path, &hash)
+        }
+
+        /// Insert or update the `files` row using a pre-computed hash.
+        /// Avoids redundant file reads when the caller already has the hash.
+        pub fn upsert_file_with_hash(&self, path: &Path, hash: &[u8]) -> NyxResult<i64> {
            let meta = fs::metadata(path)?;
            let mtime = meta.modified()?.duration_since(UNIX_EPOCH)?.as_secs() as i64;
            let scanned_at = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs() as i64;
-            let digest = Self::digest_file(path)?;
+            let path_str = path.to_string_lossy();

+            // Two statements: upsert the row, then query its id.
            self.c().execute(
                "INSERT INTO files (project, path, hash, mtime, scanned_at)
                 VALUES (?1, ?2, ?3, ?4, ?5)
@ -166,18 +211,12 @@ pub mod index {
                 SET hash = excluded.hash,
                     mtime = excluded.mtime,
                     scanned_at = excluded.scanned_at",
-                params![
-                    self.project,
-                    path.to_string_lossy(),
-                    digest,
-                    mtime,
-                    scanned_at
-                ],
+                params![self.project, path_str, hash, mtime, scanned_at],
            )?;

            let id: i64 = self.c().query_row(
                "SELECT id FROM files WHERE project = ?1 AND path = ?2",
-                params![self.project, path.to_string_lossy()],
+                params![self.project, path_str],
                |r| r.get(0),
            )?;
            Ok(id)
@ -287,24 +326,38 @@ pub mod index {
        }

        /// Load every function summary for this project.
+        ///
+        /// Reads all JSON strings in one pass, then deserializes them (in
+        /// parallel via rayon above 256 rows); bad rows are silently skipped.
        pub fn load_all_summaries(&self) -> NyxResult<Vec<crate::summary::FuncSummary>> {
            let mut stmt = self
                .c()
                .prepare("SELECT summary FROM function_summaries WHERE project = ?1")?;

-            let iter = stmt.query_map([&self.project], |row| {
-                let json: String = row.get(0)?;
-                Ok(json)
-            })?;
+            let jsons: Vec<String> = stmt
+                .query_map([&self.project], |row| row.get::<_, String>(0))?
+                .filter_map(Result::ok)
+                .collect();

-            let mut out = Vec::new();
-            for row in iter {
-                let json = row?;
-                let s: crate::summary::FuncSummary = serde_json::from_str(&json)
-                    .map_err(|e| rusqlite::Error::ToSqlConversionFailure(Box::new(e)))?;
-                out.push(s);
+            // Parallel JSON deserialization for large sets
+            if jsons.len() > 256 {
+                use rayon::prelude::*;
+                let results: Vec<_> = jsons
+                    .par_iter()
+                    .filter_map(|json| {
+                        serde_json::from_str::<crate::summary::FuncSummary>(json).ok()
+                    })
+                    .collect();
+                Ok(results)
+            } else {
+                let mut out = Vec::with_capacity(jsons.len());
+                for json in &jsons {
+                    if let Ok(s) = serde_json::from_str::<crate::summary::FuncSummary>(json) {
+                        out.push(s);
+                    }
+                }
+                Ok(out)
            }
-            Ok(out)
        }

        /// gets files from the database
@ -351,12 +404,20 @@ pub mod index {
        // -------------------------------------------------------------------------
        // Helpers
        // -------------------------------------------------------------------------
+        #[allow(dead_code)] // used by should_scan() and tests
        fn digest_file(path: &Path) -> NyxResult<Vec<u8>> {
            let mut hasher = blake3::Hasher::new();
            let mut file = fs::File::open(path)?;
            std::io::copy(&mut file, &mut hasher)?;
            Ok(hasher.finalize().as_bytes().to_vec())
        }
+
+        /// Hash already-read bytes without re-reading from disk.
+        pub fn digest_bytes(bytes: &[u8]) -> Vec<u8> {
+            let mut hasher = blake3::Hasher::new();
+            hasher.update(bytes);
+            hasher.finalize().as_bytes().to_vec()
+        }
    }
 }