nyx/src/commands/index.rs
Eli Peter a438886217
Python fp and docs updtes (#58)
* refactor: Update comments for clarity and add expectations.json files for performance metrics

* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks

* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks

* refactor: Simplify code formatting for better readability in multiple files

* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration

* feat: Update Java and Python patterns to include new security rules

* refactor: Improve comment clarity and consistency across multiple Rust files

* refactor: Simplify code formatting for improved readability in integration tests and module files

* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
2026-04-29 19:53:34 -04:00

392 lines
13 KiB
Rust

use crate::cli::IndexAction;
use crate::database::index::{Indexer, IssueRow};
use crate::errors::NyxResult;
use crate::patterns::Severity;
use crate::server::progress::{ScanMetrics, ScanProgress, ScanStage};
use crate::server::scan_log::ScanLogCollector;
use crate::utils::Config;
use crate::utils::project::get_project_info;
use crate::walk::spawn_file_walker;
use bytesize::ByteSize;
use chrono::{DateTime, Local};
use console::style;
use indicatif::{ProgressBar, ProgressStyle};
use rayon::prelude::*;
use std::fs;
use std::path::PathBuf;
use std::process::exit;
use std::sync::Arc;
use std::sync::atomic::Ordering::Relaxed;
pub fn handle(
action: IndexAction,
database_dir: &std::path::Path,
config: &Config,
) -> NyxResult<()> {
match action {
IndexAction::Build { path, force } => {
let build_path = std::path::Path::new(&path).canonicalize()?;
let (project_name, db_path) = get_project_info(&build_path, database_dir)?;
if force || !db_path.exists() {
build_index(
&project_name,
&build_path,
&db_path,
config,
!config.output.quiet,
)?;
println!(
"{} {}",
style("Index built:").green(),
style(db_path.display()).white().bold()
);
exit(0);
} else {
println!(
"{} {}",
style("↩ Index already exists").yellow(),
style("(use --force to rebuild)").dim()
);
exit(0);
}
}
IndexAction::Status { path } => {
let status_path = std::path::Path::new(&path).canonicalize()?;
let (project_name, db_path) = get_project_info(&status_path, database_dir)?;
println!("{}", style("Index status").bold());
println!(
" {:10} {}",
style("Project").dim(),
style(&project_name).white().bold()
);
println!(
" {:10} {}",
style("Path").dim(),
style(db_path.display()).underlined()
);
if db_path.exists() {
let meta = fs::metadata(&db_path)?;
let size = ByteSize::b(meta.len());
let mtime: DateTime<Local> = meta.modified()?.into();
println!(
" {:10} {} {}",
style("Indexed").dim(),
style("").green().bold(),
style(mtime.format("%Y-%m-%d %H:%M:%S")).dim()
);
println!(" {:10} {}", style("Size").dim(), size);
} else {
println!(
" {:10} {} {}",
style("Indexed").dim(),
style("").red().bold(),
style("(run `nyx index build` to create)").dim()
);
}
exit(0);
}
}
}
pub fn build_index(
project_name: &str,
project_path: &std::path::Path,
db_path: &std::path::Path,
config: &Config,
show_progress: bool,
) -> NyxResult<()> {
build_index_with_observer(
project_name,
project_path,
db_path,
config,
show_progress,
None,
None,
None,
)
}
#[allow(clippy::too_many_arguments)]
pub fn build_index_with_observer(
project_name: &str,
project_path: &std::path::Path,
db_path: &std::path::Path,
config: &Config,
show_progress: bool,
progress: Option<&Arc<ScanProgress>>,
metrics: Option<&Arc<ScanMetrics>>,
logs: Option<&Arc<ScanLogCollector>>,
) -> NyxResult<()> {
// Pass 1 of the indexed scan reads persisted summaries produced here, so
// framework context must be populated at index-build time, otherwise
// framework-conditional label rules never contribute to the summaries
// and indexed scans diverge from non-indexed ones. Matches the
// auto-fill in scan_filesystem_with_observer /
// scan_with_index_parallel_observer.
let owned_cfg = crate::commands::scan::ensure_framework_ctx(project_path, config);
let config = owned_cfg.as_ref().unwrap_or(config);
tracing::debug!("Building index for: {}", project_name);
let pool = Indexer::init(db_path)?;
{
let idx = Indexer::from_pool(project_name, &pool)?;
idx.clear()?;
}
tracing::debug!("Cleaned index for: {}", project_name);
if let Some(p) = progress {
p.set_stage(ScanStage::Discovering);
}
if let Some(l) = logs {
l.info(
format!("Rebuilding index for {}", project_path.display()),
None,
);
}
let walk_start = std::time::Instant::now();
let (rx, handle) = spawn_file_walker(project_path, config);
// Drain the channel BEFORE joining, the bounded channel will deadlock
// if we join first and the walker blocks on send.
let paths: Vec<PathBuf> = rx.into_iter().flatten().collect();
if let Err(err) = handle.join() {
tracing::error!("walker thread panicked: {:#?}", err);
if let Some(l) = logs {
l.error(
"Walker thread panicked during index rebuild",
None,
Some(format!("{err:#?}")),
);
}
}
if let Some(p) = progress {
p.record_walk_ms(walk_start.elapsed().as_millis() as u64);
p.set_files_discovered(paths.len() as u64);
p.set_stage(ScanStage::Indexing);
}
if let Some(l) = logs {
l.info(
format!(
"Index rebuild discovered {} files in {}ms",
paths.len(),
walk_start.elapsed().as_millis()
),
None,
);
}
let pb = if show_progress {
let pb = ProgressBar::new(paths.len() as u64);
pb.set_style(
ProgressStyle::with_template(
"{spinner:.green} {msg} [{bar:30.cyan/blue}] {pos}/{len} ({eta})",
)
.unwrap()
.progress_chars("##-"),
);
pb.set_message("Indexing files");
pb
} else {
ProgressBar::hidden()
};
let progress = progress.cloned();
let metrics = metrics.cloned();
let logs = logs.cloned();
let pass1_start = std::time::Instant::now();
paths
.into_par_iter()
.try_for_each(|path| -> NyxResult<()> {
let mut idx = Indexer::from_pool(project_name, &pool)?;
// Read once, hash once, pass bytes to both rule execution and
// summary extraction. Use pre-computed hash for upsert to avoid
// a redundant file read inside upsert_file.
let bytes = std::fs::read(&path)?;
let hash = Indexer::digest_bytes(&bytes);
// Parse once and persist every artifact we can reuse later:
// findings, coarse summaries, and precise SSA summaries.
let fused = crate::commands::scan::analyse_file_fused(
&bytes,
&path,
config,
None,
Some(project_path),
)?;
if let Some(ref p) = progress {
p.inc_parsed(1);
p.set_current_file(&path.to_string_lossy());
if let Some(lang) = fused.summaries.first().map(|s| s.lang.as_str()) {
p.record_language(lang);
}
}
if let Some(ref m) = metrics {
m.cfg_nodes.fetch_add(fused.cfg_nodes as u64, Relaxed);
}
let file_id = idx.upsert_file_with_hash(&path, &hash)?;
let rows: Vec<IssueRow> = fused
.diags
.iter()
.map(|d| IssueRow {
rule_id: d.id.as_ref(),
severity: match d.severity {
Severity::High => "HIGH",
Severity::Medium => "MEDIUM",
Severity::Low => "LOW",
},
line: d.line as i64,
col: d.col as i64,
})
.collect();
idx.replace_issues(file_id, rows)?;
if !fused.summaries.is_empty() {
idx.replace_summaries_for_file(&path, &hash, &fused.summaries)?;
}
if !fused.ssa_summaries.is_empty() {
let ssa_rows: Vec<_> = fused
.ssa_summaries
.into_iter()
.map(|(key, sum)| {
(
key.name,
key.arity.unwrap_or(0),
key.lang.as_str().to_string(),
key.namespace,
key.container,
key.disambig,
key.kind,
sum,
)
})
.collect();
idx.replace_ssa_summaries_for_file(&path, &hash, &ssa_rows)?;
}
// Persist SSA callee bodies at index-build time so CLI-initiated
// rebuilds (`--index rebuild`) populate the same
// `ssa_function_bodies` rows that `scan_with_index_parallel`
// would have written via its pass-1 branch. Without this,
// indexed scans load zero cross-file bodies and cross-file
// inline silently falls back to summary resolution.
if !fused.ssa_bodies.is_empty() {
let body_rows: Vec<_> = fused
.ssa_bodies
.into_iter()
.map(|(key, body)| {
(
key.name,
key.arity.unwrap_or(0),
key.lang.as_str().to_string(),
key.namespace,
key.container,
key.disambig,
key.kind,
body,
)
})
.collect();
idx.replace_ssa_bodies_for_file(&path, &hash, &body_rows)?;
}
pb.inc(1);
Ok(())
})?;
pb.finish_and_clear();
if let Some(p) = &progress {
p.record_pass1_ms(pass1_start.elapsed().as_millis() as u64);
}
if let Some(l) = &logs {
l.info(
format!(
"Index rebuild complete in {}ms",
pass1_start.elapsed().as_millis()
),
None,
);
}
{
let idx = Indexer::from_pool(project_name, &pool)?;
idx.vacuum()?;
}
Ok(())
}
#[test]
fn build_index_creates_db_and_registers_files() {
let mut cfg = Config::default();
cfg.performance.worker_threads = Some(1);
cfg.performance.channel_multiplier = 1;
cfg.performance.batch_size = 2;
let td = tempfile::tempdir().unwrap();
let project_dir = td.path().join("proj");
fs::create_dir(&project_dir).unwrap();
let f_txt = project_dir.join("readme.txt");
fs::write(&f_txt, "hello").unwrap();
let db_path = td.path().join("proj.sqlite");
build_index("proj", &project_dir, &db_path, &cfg, false).expect("index build should succeed");
// ── Assert ────────────────────────────────────────────────────────────────
assert!(db_path.is_file(), "SQLite file must exist");
let pool = Indexer::init(&db_path).unwrap();
let idx = Indexer::from_pool("proj", &pool).unwrap();
let files = idx.get_files("proj").unwrap();
assert_eq!(files.len(), 1, "exactly one file indexed");
assert_eq!(files[0], f_txt);
}
#[test]
fn build_index_persists_ssa_summaries() {
let mut cfg = Config::default();
cfg.performance.worker_threads = Some(1);
cfg.performance.channel_multiplier = 1;
cfg.performance.batch_size = 2;
let td = tempfile::tempdir().unwrap();
let project_dir = td.path().join("proj");
fs::create_dir(&project_dir).unwrap();
fs::write(
project_dir.join("app.js"),
r#"var express = require('express');
var app = express();
function cleanHtml(input) {
return DOMPurify.sanitize(input);
}
app.get('/safe', function(req, res) {
var name = req.query.name;
var safe = cleanHtml(name);
res.send(safe);
});
"#,
)
.unwrap();
let db_path = td.path().join("proj.sqlite");
build_index("proj", &project_dir, &db_path, &cfg, false).expect("index build should succeed");
let pool = Indexer::init(&db_path).unwrap();
let idx = Indexer::from_pool("proj", &pool).unwrap();
let ssa = idx.load_all_ssa_summaries().unwrap();
assert!(
!ssa.is_empty(),
"index build should persist SSA summaries for functions with non-trivial SSA effects"
);
}