Refactor database schema and scanning process:

- Introduced `issues` table for detailed vulnerability storage.
- Enhanced `files` table with project scoping and unique constraints.
- Replaced `OutputFormat` enum with `String` for flexibility.
- Added support for formatted console output of scan results.
- Integrated file and issue updating logic for incremental scans.
- Optimized scanning by leveraging database-stored issues.
This commit is contained in:
elipeter 2025-06-17 16:46:45 +02:00
parent 9ef591c7b1
commit 0eecf886f2
7 changed files with 302 additions and 357 deletions

View file

@ -26,8 +26,8 @@ pub enum Commands {
rebuild_index: bool,
/// Output format
#[arg(short, long, value_enum, default_value = "table")]
format: OutputFormat,
#[arg(short, long, value_enum, default_value = "")]
format: String,
/// Show only high severity issues
#[arg(long)]
@ -78,11 +78,3 @@ pub enum IndexAction {
path: String,
},
}
/// Closed set of output formats selectable through the CLI `format` argument.
///
/// Deriving `clap::ValueEnum` lets clap parse the argument directly into one
/// of these variants and reject any other value.
#[derive(clap::ValueEnum, Clone, Debug)]
pub enum OutputFormat {
// NOTE(review): `"table"` is the default value named on the `format` argument.
Table,
Json,
Csv,
Sarif,
}

View file

@ -1,18 +1,30 @@
use crate::cli::OutputFormat;
use crate::utils::project::get_project_info;
use console::style;
use std::path::Path;
use crate::utils::config::Config;
use tree_sitter::{Language, Parser, QueryCursor, StreamingIterator};
use crate::database::index::Indexer;
use crate::database::index::{IssueRow, Indexer};
use crate::patterns::Severity;
use crate::utils::config::Config;
use crate::utils::query_cache;
use crate::walk::spawn_senders;
use tree_sitter::{Language, Parser, QueryCursor, StreamingIterator};
/// A single diagnostic: one rule match at one source location.
///
/// Produced either by running the rules on a file or by reading previously
/// stored issues back from the index database.
#[derive(Debug)]
pub struct Diag {
/// Path of the file the match belongs to, as a lossily converted string.
pub(crate) path: String,
/// Line of the match; the rule runner stores the node's start row plus one.
pub(crate) line: usize,
/// Column of the match; the rule runner stores the node's start column plus one.
pub(crate) col: usize,
/// Severity of the rule that matched.
pub(crate) severity: Severity,
/// Identifier of the rule that matched (persisted as `rule_id` in the index).
pub(crate) id: String,
}
/// Entry point called by the CLI.
pub fn handle(
path: &str,
no_index: bool,
rebuild_index: bool,
format: OutputFormat,
format: String,
high_only: bool,
database_dir: &Path,
config: &Config,
@ -20,73 +32,111 @@ pub fn handle(
let scan_path = Path::new(path).canonicalize()?;
let (project_name, db_path) = get_project_info(&scan_path, database_dir)?;
tracing::debug!("Config: {:?}", config);
tracing::debug!("Scanning project: {}", project_name);
tracing::debug!("Scan path: {}", scan_path.display());
let mut indexer = Indexer::new(&project_name, &db_path)?;
let diags: Vec<Diag>;
if no_index {
tracing::debug!("Scanning without index...");
scan_filesystem(&scan_path, config)?;
diags = scan_filesystem(&scan_path, config)?;
} else {
if rebuild_index || !db_path.exists() {
tracing::debug!("Building/updating index...");
crate::commands::index::build_index(&scan_path, &db_path)?;
}
tracing::debug!("Using index: {}", db_path.display());
scan_with_index(&scan_path, &db_path, config)?;
diags = scan_with_index(&project_name, &db_path, config, &mut indexer)?;
}
tracing::debug!("Output format: {:?}", format);
if high_only {
tracing::debug!("Filtering: High severity only");
}
Ok(())
}
/// Scans every file delivered by the walker under `root`, without consulting
/// the index.
///
/// Stops at, and returns, the first error reported by `scan_single_file`.
fn scan_filesystem(root: &Path, cfg: &Config) -> Result<(), Box<dyn std::error::Error>> {
    let receiver = spawn_senders(root, cfg);
    // The walker sends batches of paths; flatten them into a single stream.
    receiver.iter().flatten().try_for_each(|file| {
        tracing::debug!("Scanning file: {}", file.display());
        scan_single_file(&file, cfg)
    })
}
fn scan_with_index(root: &Path, db_path: &Path, cfg: &Config) -> Result<(), Box<dyn std::error::Error>> {
let indexer = Indexer::new(db_path)
.map_err(|e| format!("opening index {}: {e}", db_path.display()))?;
let rx = spawn_senders(root, cfg);
for batch in rx.iter().flatten() {
let scan = indexer.should_scan(&batch)?;
tracing::debug!("Should scan: {}, file: {}", scan, batch.display());
if scan {
tracing::debug!("Scanning file: {}", batch.display());
scan_single_file(&batch, cfg)?; // your scanner
indexer.record_scan(&batch)?;
if format == "console" || format == "" && config.output.default_format == "console" {
for d in &diags {
if high_only && d.severity != Severity::High {
continue;
}
let sev_str = match d.severity {
Severity::High => style("HIGH").red().bold(),
Severity::Medium => style("MEDIUM").yellow().bold(),
Severity::Low => style("LOW").cyan().bold(),
};
println!(
"{}:{}:{} [{}] {}",
style(d.path.clone()).blue().underlined(),
d.line,
d.col,
sev_str,
style(&d.id).bold(),
);
}
}
Ok(())
}
fn scan_single_file(
// --------------------------------------------------------------------------------------------
// Scanning helpers
// --------------------------------------------------------------------------------------------
/// Runs the rules on every file delivered by the walker under `root`, without
/// consulting the index, and returns all diagnostics found.
///
/// Stops at, and returns, the first error reported by `run_rules_on_file`.
fn scan_filesystem(
    root: &Path,
    cfg: &Config,
) -> Result<Vec<Diag>, Box<dyn std::error::Error>> {
    let receiver = spawn_senders(root, cfg);
    let mut found: Vec<Diag> = Vec::new();
    // The walker sends batches of paths; flatten them into a single stream.
    for file in receiver.iter().flatten() {
        let per_file = run_rules_on_file(&file, cfg)?;
        found.extend(per_file);
    }
    Ok(found)
}
/// Scans the files recorded in the index for `project`, re-running the rules
/// only on files the indexer reports as changed.
///
/// For a changed file the rules are run, the file row is upserted and its
/// stored issues are replaced with the fresh results. For an unchanged file
/// the previously stored issues are read back from the database instead.
///
/// * `project`  - project name used to look up the indexed files.
/// * `_db_path` - currently unused; kept so the call site stays unchanged.
/// * `cfg`      - scanner configuration forwarded to the rule runner.
/// * `indexer`  - open index database for this project.
///
/// # Errors
/// Returns the first error from the index database or from the rule runner.
/// A failure to list the indexed files is reported rather than being treated
/// as "no files", which would otherwise look like a clean scan.
fn scan_with_index(
    project: &str,
    _db_path: &Path,
    cfg: &Config,
    indexer: &mut Indexer,
) -> Result<Vec<Diag>, Box<dyn std::error::Error>> {
    let files = indexer.get_files(project)?;
    let mut issues: Vec<Diag> = Vec::new();

    for file in files {
        // NOTE(review): `should_scan` reads the file's metadata, so a path that
        // was indexed and has since been removed surfaces as an error here.
        if !indexer.should_scan(&file)? {
            // Unchanged since the last scan: reuse the stored results.
            issues.extend(indexer.get_issues_from_file(&file)?);
            continue;
        }

        let diags = run_rules_on_file(&file, cfg)?;
        let file_id = indexer.upsert_file(&file)?;
        // The rows borrow from `diags`, so they are fed to the database before
        // `diags` is moved into the result list.
        let issue_rows = diags.iter().map(|d| IssueRow {
            rule_id: d.id.as_str(),
            severity: match d.severity {
                Severity::High => "HIGH",
                Severity::Medium => "MEDIUM",
                Severity::Low => "LOW",
            },
            line: d.line as i64,
            col: d.col as i64,
        });
        indexer.replace_issues(file_id, issue_rows)?;
        issues.extend(diags);
    }

    Ok(issues)
}
// --------------------------------------------------------------------------------------------
// Tree-sitter-based rule runner – returns a Vec<Diag>
// --------------------------------------------------------------------------------------------
fn run_rules_on_file(
path: &Path,
cfg: &Config, // assume cfg.high_only: bool
) -> Result<(), Box<dyn std::error::Error>> {
cfg: &Config,
) -> Result<Vec<Diag>, Box<dyn std::error::Error>> {
let source = std::fs::read_to_string(path)?;
let mut parser = Parser::new();
let ext = path
.extension()
.and_then(|s| s.to_str())
.unwrap_or_default()
.to_ascii_lowercase();
// Pick the right tree-sitter language *and* pre-compiled queries
let (ts_lang, lang_key): (Language, &'static str) = match ext.as_str() {
let lang_key = match path
.extension()
.and_then(|s| s.to_str())
.unwrap_or_default()
.to_ascii_lowercase()
.as_str()
{
"rs" => (Language::from(tree_sitter_rust::LANGUAGE), "rust"),
"c" => (Language::from(tree_sitter_c::LANGUAGE), "c"),
"cpp" | "c++" => (Language::from(tree_sitter_cpp::LANGUAGE), "cpp"),
@ -96,69 +146,35 @@ fn scan_single_file(
"py" => (Language::from(tree_sitter_python::LANGUAGE), "python"),
"ts" | "tsx" => (Language::from(tree_sitter_typescript::LANGUAGE_TYPESCRIPT), "typescript"),
"js" => (Language::from(tree_sitter_javascript::LANGUAGE), "javascript"),
_ => return Ok(()),
_ => return Ok(Vec::new()),
};
let (ts_lang, lang_name) = lang_key;
parser.set_language(&ts_lang)?;
let tree = parser.parse(&source, None).ok_or("treesitter failed")?;
let root = tree.root_node();
let tree = parser.parse(&source, None).ok_or("tree-sitter failed")?;
let root = tree.root_node();
// ----- run vulnerability patterns -----
let compiled = query_cache::for_lang(lang_key, ts_lang);
let compiled = query_cache::for_lang(lang_name, ts_lang);
let mut cursor = QueryCursor::new();
let mut out = Vec::new();
for cq in &compiled {
if cfg.scanner.min_severity > cq.meta.severity {
continue;
continue;
}
let mut matches = cursor.matches(&cq.query, root, source.as_bytes());
while let Some(m) = matches.next() {
// capture 0 is the one tagged @vuln
for cap in m.captures.iter().filter(|c| c.index == 0) {
let point = cap.node.start_position();
let line = point.row;
let col = point.column;
match cq.meta.severity {
Severity::High => {
tracing::error!(
file = %path.display(),
line = line + 1,
column = col + 1,
id = cq.meta.id,
sev = ?Severity::High,
"pattern matched"
);
},
Severity::Medium => {
tracing::warn!(
file = %path.display(),
line = line + 1,
column = col + 1,
id = cq.meta.id,
sev = ?Severity::Medium,
"pattern matched"
);
}
Severity::Low => {
tracing::info!(
file = %path.display(),
line = line + 1,
column = col + 1,
id = cq.meta.id,
sev = ?Severity::Low,
"pattern matched"
);
}
}
out.push(Diag {
path: path.to_string_lossy().to_string(),
line: point.row + 1,
col: point.column + 1,
severity: cq.meta.severity,
id: String::from(cq.meta.id),
});
}
}
}
Ok(())
Ok(out)
}

View file

@ -1,164 +1,174 @@
pub mod index {
use blake3::Hasher;
use rusqlite::{params, Connection, OptionalExtension};
use std::fs;
use std::path::Path;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use std::time::{SystemTime, UNIX_EPOCH};
use crate::commands::scan::Diag;
use crate::patterns::Severity;
/// Schema: stores digest, file modification time (secs since epoch) and
/// last time we *fully* scanned the file.
/// DB schema (foreign keys enabled).
const SCHEMA: &str = r#"
CREATE TABLE IF NOT EXISTS files (
path TEXT PRIMARY KEY,
hash BLOB NOT NULL,
mtime INTEGER NOT NULL,
scanned_at INTEGER NOT NULL
);"#;
PRAGMA foreign_keys = ON;
pub(crate) struct Indexer {
CREATE TABLE IF NOT EXISTS files (
id INTEGER PRIMARY KEY AUTOINCREMENT,
project TEXT NOT NULL,
path TEXT NOT NULL,
hash BLOB NOT NULL,
mtime INTEGER NOT NULL,
scanned_at INTEGER NOT NULL,
UNIQUE(project, path)
);
CREATE TABLE IF NOT EXISTS issues (
file_id INTEGER NOT NULL
REFERENCES files(id)
ON DELETE CASCADE,
rule_id TEXT NOT NULL,
severity TEXT NOT NULL,
line INTEGER NOT NULL,
col INTEGER NOT NULL,
PRIMARY KEY (file_id, rule_id, line, col)
);
"#;
/// A single issue row, ready for insertion.
///
/// Borrows its text fields so rows can be built from existing diagnostics
/// without copying strings. The fields map one-to-one onto the columns of the
/// `issues` table; the owning `file_id` is supplied separately at insert time.
#[derive(Debug, Clone)]
pub struct IssueRow<'a> {
/// Identifier of the rule that matched (`issues.rule_id`).
pub rule_id: &'a str,
/// Severity as text (`issues.severity`); the scanner writes "HIGH", "MEDIUM" or "LOW".
pub severity: &'a str,
/// Line of the match (`issues.line`).
pub line: i64,
/// Column of the match (`issues.col`).
pub col: i64,
}
/// Handle on the index database, bound to one project.
pub struct Indexer {
/// Open SQLite connection to the index database.
conn: Connection,
/// Project name used to scope lookups and writes in the `files` table.
project: String,
}
impl Indexer {
pub fn new(database_path: &Path) -> Result<Self, Box<dyn std::error::Error>> {
/// Open (or create) the DB at `database_path` for the given project name.
pub fn new(project: &str, database_path: &Path) -> Result<Self, Box<dyn std::error::Error>> {
let conn = Connection::open(database_path)?;
conn.execute_batch(SCHEMA)?;
Ok(Self { conn })
Ok(Self { conn, project: project.to_owned() })
}
/// Returns `true` if the caller should analyze the file, i.e., we have
/// never seen it or something changed (mtime or content hash).
/// Return true when the file *content* or *mtime* changed since the last scan.
pub fn should_scan(&self, path: &Path) -> Result<bool, Box<dyn std::error::Error>> {
let meta = fs::metadata(path)?;
let mtime = meta.modified()?.duration_since(UNIX_EPOCH)?.as_secs() as i64;
let digest = Self::digest_file(path)?;
let row: Option<(Vec<u8>, i64)> = self
.conn
.query_row(
"SELECT hash, mtime FROM files WHERE path = ?1",
params![path.to_string_lossy()],
|r| Ok((r.get(0)?, r.get(1)?)),
)
.optional()?;
.conn
.query_row(
"SELECT hash, mtime FROM files WHERE project = ?1 AND path = ?2",
params![self.project, path.to_string_lossy()],
|r| Ok((r.get(0)?, r.get(1)?)),
)
.optional()?;
match row {
Some((stored_hash, stored_mtime)) => {
Ok(stored_hash != digest || stored_mtime != mtime)
}
None => Ok(true),
}
Ok(match row {
Some((stored_hash, stored_mtime)) => stored_hash != digest || stored_mtime != mtime,
None => true,
})
}
/// Persist a fresh scan result.
pub fn record_scan(&self, path: &Path) -> Result<(), Box<dyn std::error::Error>> {
/// Insert or update the `files` row and return its id.
pub fn upsert_file(&self, path: &Path) -> Result<i64, Box<dyn std::error::Error>> {
let meta = fs::metadata(path)?;
let mtime = meta.modified()?.duration_since(UNIX_EPOCH)?.as_secs() as i64;
let scanned_at = SystemTime::now()
.duration_since(UNIX_EPOCH)?
.as_secs() as i64;
let scanned_at = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs() as i64;
let digest = Self::digest_file(path)?;
self.conn.execute(
"REPLACE INTO files (path, hash, mtime, scanned_at) VALUES (?1, ?2, ?3, ?4)",
params![path.to_string_lossy(), digest, mtime, scanned_at],
"INSERT INTO files (project, path, hash, mtime, scanned_at)
VALUES (?1, ?2, ?3, ?4, ?5)
ON CONFLICT(project,path) DO UPDATE
SET hash = excluded.hash,
mtime = excluded.mtime,
scanned_at = excluded.scanned_at",
params![self.project, path.to_string_lossy(), digest, mtime, scanned_at],
)?;
let id: i64 = self.conn.query_row(
"SELECT id FROM files WHERE project = ?1 AND path = ?2",
params![self.project, path.to_string_lossy()],
|r| r.get(0),
)?;
Ok(id)
}
/// Replace all issues for `file_id` with the supplied set.
///
/// The delete and every insert run inside one transaction that is committed
/// only after all statements succeed. Rows that collide on the
/// `(file_id, rule_id, line, col)` primary key are skipped instead of failing
/// the whole replacement: several matches of one rule can report the same
/// location, and such duplicates carry no additional information.
///
/// # Errors
/// Returns any error raised while opening the transaction, executing the
/// statements or committing.
pub fn replace_issues<'a>(
    &mut self,
    file_id: i64,
    issues: impl IntoIterator<Item = IssueRow<'a>>,
) -> Result<(), Box<dyn std::error::Error>> {
    let tx = self.conn.transaction()?;
    tx.execute("DELETE FROM issues WHERE file_id = ?", params![file_id])?;
    {
        // Scoped so the prepared statement, which borrows `tx`, is dropped
        // before `commit` consumes the transaction.
        let mut stmt = tx.prepare(
            "INSERT OR IGNORE INTO issues (file_id, rule_id, severity, line, col)
VALUES (?1, ?2, ?3, ?4, ?5)",
        )?;
        for iss in issues {
            stmt.execute(params![file_id, iss.rule_id, iss.severity, iss.line, iss.col])?;
        }
    }
    tx.commit()?;
    Ok(())
}
/// Gets the stored issues for a specific file so it does not have to be rescanned.
///
/// Looks up the file's row for this indexer's project, then loads every issue
/// attached to it and converts each one into a [`Diag`] carrying `path`.
///
/// # Errors
/// Returns an error when the file has no row in the index, when a query or a
/// row fails to decode, when a stored line/column does not fit in `usize`, or
/// when a stored severity cannot be parsed. Row failures are reported rather
/// than skipped, so a damaged index cannot silently hide findings.
pub fn get_issues_from_file(
    &self,
    path: &Path,
) -> Result<Vec<Diag>, Box<dyn std::error::Error>> {
    let file_id: i64 = self.conn.query_row(
        "SELECT id FROM files WHERE project = ?1 AND path = ?2",
        params![self.project, path.to_string_lossy()],
        |r| r.get(0),
    )?;
    let mut stmt = self.conn.prepare(
        "SELECT rule_id, severity, line, col
FROM issues
WHERE file_id = ?1",
    )?;
    // Decode only the raw columns inside the rusqlite callback; conversions
    // that can fail with other error types happen in the loop below.
    let rows = stmt.query_map([file_id], |row| {
        Ok((
            row.get::<_, String>(0)?,
            row.get::<_, String>(1)?,
            row.get::<_, i64>(2)?,
            row.get::<_, i64>(3)?,
        ))
    })?;

    let path_text = path.to_string_lossy().to_string();
    let mut diags = Vec::new();
    for row in rows {
        let (rule_id, sev_str, line, col) = row?;
        let severity = Severity::from_str(&sev_str).map_err(|_| {
            format!("unknown severity {:?} stored for {}", sev_str, path.display())
        })?;
        diags.push(Diag {
            path: path_text.clone(),
            id: rule_id,
            line: usize::try_from(line)?,
            col: usize::try_from(col)?,
            severity,
        });
    }
    Ok(diags)
}
/// Lists the paths of all files recorded in the database for `project`.
///
/// # Errors
/// Returns the first error raised while preparing the query or decoding a row.
pub fn get_files(&self, project: &str) -> Result<Vec<std::path::PathBuf>, Box<dyn std::error::Error>> {
    let mut stmt = self.conn.prepare(
        "SELECT path
FROM files
WHERE project = ?1",
    )?;
    let rows = stmt.query_map([project], |row| row.get::<_, String>(0))?;
    let mut paths = Vec::new();
    for row in rows {
        paths.push(PathBuf::from(row?));
    }
    Ok(paths)
}
fn digest_file(path: &Path) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
let mut hasher = Hasher::new();
let mut hasher = blake3::Hasher::new();
let mut file = fs::File::open(path)?;
std::io::copy(&mut file, &mut hasher)?;
Ok(hasher.finalize().as_bytes().to_vec())
}
}
}
// Unit tests for the change-detection logic of `Indexer`
// (`should_scan` / `record_scan`).
#[cfg(test)]
mod tests {
use crate::database::index::Indexer;
use std::error::Error;
use std::io::Write;
use tempfile::tempdir;
/// Returns a freshly initialised `Indexer` backed by an *in-memory* SQLite
/// database. Using `:memory:` sidesteps filesystem lifetime issues that can
/// occur when the temporary database file is deleted while a connection is
/// still open.
fn new_indexer() -> Indexer {
Indexer::new(std::path::Path::new(":memory:"))
.expect("create inmemory Indexer")
}
/// A file the index has never seen must be reported as needing a scan.
#[test]
fn new_file_is_flagged_for_scan() -> Result<(), Box<dyn Error>> {
let indexer = new_indexer();
let dir = tempdir()?;
let file_path = dir.path().join("hello.txt");
std::fs::write(&file_path, b"hello world")?;
// File has never been seen ⇒ should be scanned.
assert!(indexer.should_scan(&file_path)?);
Ok(())
}
/// Once a scan has been recorded, an untouched file must not be flagged again.
#[test]
fn unchanged_file_is_not_flagged_again() -> Result<(), Box<dyn Error>> {
let indexer = new_indexer();
let dir = tempdir()?;
let file_path = dir.path().join("foo.txt");
std::fs::write(&file_path, b"abc123")?;
// First pass: record the scan result.
indexer.record_scan(&file_path)?;
// Nothing changed, so should_scan must return false.
assert!(!indexer.should_scan(&file_path)?);
Ok(())
}
/// Changing the file contents after a recorded scan must trigger a rescan.
#[test]
fn modified_content_triggers_rescan() -> Result<(), Box<dyn Error>> {
let indexer = new_indexer();
let dir = tempdir()?;
let file_path = dir.path().join("bar.txt");
std::fs::write(&file_path, b"first")?;
indexer.record_scan(&file_path)?;
// Append data to change the hash.
let mut file = std::fs::OpenOptions::new()
.append(true)
.open(&file_path)?;
writeln!(file, "second line")?;
assert!(indexer.should_scan(&file_path)?);
Ok(())
}
/// Changing only the modification time, with identical contents, must trigger a rescan.
#[test]
fn modified_mtime_alone_triggers_rescan() -> Result<(), Box<dyn Error>> {
// NOTE(review): this was meant to compile only when an optional `filetime`
// feature is enabled, but no `#[cfg]` gate is present on this block — confirm.
{
use std::time::{Duration, SystemTime};
use filetime::FileTime;
let indexer = new_indexer();
let dir = tempdir()?;
let file_path = dir.path().join("baz.txt");
std::fs::write(&file_path, b"unchanged content")?;
indexer.record_scan(&file_path)?;
// Bump the modification time without touching the contents.
let now_plus = SystemTime::now() + Duration::from_secs(5);
let new_mtime = FileTime::from_system_time(now_plus);
filetime::set_file_mtime(&file_path, new_mtime)?;
assert!(indexer.should_scan(&file_path)?);
}
Ok(())
}
}

View file

@ -9,6 +9,7 @@ mod php;
mod python;
use std::collections::HashMap;
use std::str::FromStr;
use serde::{Deserialize, Serialize};
use once_cell::sync::Lazy;
@ -33,6 +34,19 @@ pub struct Pattern {
pub severity: Severity,
}
impl FromStr for Severity {
    // TODO: FIX
    type Err = ();

    /// Parses a severity name, ignoring case.
    ///
    /// "high" and "medium" map to their variants; every other input,
    /// including unrecognised text, yields `Severity::Low`. This
    /// implementation therefore never returns `Err`.
    fn from_str(input: &str) -> Result<Self, Self::Err> {
        let lowered = input.to_lowercase();
        let severity = if lowered == "high" {
            Severity::High
        } else if lowered == "medium" {
            Severity::Medium
        } else {
            Severity::Low
        };
        Ok(severity)
    }
}
/// Global, lazily-initialised registry: lang-name → pattern slice
static REGISTRY: Lazy<HashMap<&'static str, &'static [Pattern]>> = Lazy::new(|| {
let mut m = HashMap::new();

View file

@ -97,13 +97,13 @@ impl Default for DatabaseConfig {
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(default)]
pub struct OutputConfig {
/// The default output format. TODO: IMPLEMENT
/// The default output format. TODO: IMPLEMENT others
pub default_format: String,
/// Whether to show progress or not. TODO: IMPLEMENT
pub show_progress: bool,
/// Whether to colorize output or not. TODO: IMPLEMENT
/// Whether to colorize output or not. TODO: IMPLEMENT changing to non colored
pub color_output: bool,
/// The maximum number of results to show. TODO: IMPLEMENT
@ -113,7 +113,7 @@ pub struct OutputConfig {
impl Default for OutputConfig {
fn default() -> Self {
Self {
default_format: "table".into(),
default_format: "console".into(),
show_progress: true,
color_output: true,
max_results: None,