Feat/configurable sanitizers and js precision (#32)

* chore: Exclude CLAUDE.md from Cargo.toml

* feat: Add configurable analysis rules and CLI commands for custom sanitizers and terminators

* feat: Enhance resource management and analysis efficiency

- Implemented parallel summary merging in `scan_filesystem` using rayon for improved performance.
- Introduced `GlobalSummaries::merge()` for efficient merging of summaries.
- Optimized file reading and hashing to eliminate redundant I/O operations.
- Added `should_scan_with_hash()` and `upsert_file_with_hash()` methods to streamline file processing.
- Enhanced taint analysis with in-place mutations to reduce memory allocations.
- Updated resource acquisition patterns to exclude false positives for `freopen` and wrapper functions.

* feat: Implement severity downgrade for findings in non-production paths and add source kind inference

* feat: Update versioning information in SECURITY.md for new stable line

* feat: Update categories in Cargo.toml to include parser-implementations and text-processing

* feat: Update dependencies in Cargo.lock for improved compatibility and performance

* feat: Update dependencies in Cargo.lock and Cargo.toml for improved compatibility
This commit is contained in:
Eli Peter 2026-02-25 04:02:11 -05:00 committed by GitHub
parent f96a89e7c1
commit 19b578c5c4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
37 changed files with 3775 additions and 432 deletions

View file

@ -2,6 +2,7 @@ use crate::cfg::{build_cfg, export_summaries};
use crate::cfg_analysis;
use crate::commands::scan::Diag;
use crate::errors::{NyxError, NyxResult};
use crate::labels::{build_lang_rules, severity_for_source_kind};
use crate::patterns::Severity;
use crate::summary::{FuncSummary, GlobalSummaries};
use crate::symbol::{Lang, normalize_namespace};
@ -53,6 +54,53 @@ fn is_binary(bytes: &[u8]) -> bool {
bytes.iter().filter(|b| **b == 0).count() * 100 / bytes.len().max(1) > 1
}
/// Reports whether `path` points into a non-production context (tests,
/// benchmarks, examples, vendored code, build scripts, minified bundles, ...).
///
/// Callers use the answer to lower the severity of findings in locations that
/// are unlikely to be real attack surface.
///
/// A path is non-production when either:
/// * its final component is `build.rs` or ends with `.min.js`, or
/// * any normal component (the final one included) is exactly one of the
///   well-known non-production directory names.
///
/// Components that are not valid UTF-8 never match.
fn is_nonprod_path(path: &Path) -> bool {
    const NONPROD_FILES: [&str; 1] = ["build.rs"];
    const NONPROD_DIRS: [&str; 12] = [
        "tests",
        "test",
        "__tests__",
        "benches",
        "benchmarks",
        "examples",
        "build",
        "scripts",
        "docs",
        "js_tests",
        "fixtures",
        "vendor",
    ];

    // File-name based rules are checked first.
    let flagged_by_name = path
        .file_name()
        .and_then(|os_name| os_name.to_str())
        .is_some_and(|name| NONPROD_FILES.contains(&name) || name.ends_with(".min.js"));
    if flagged_by_name {
        return true;
    }

    // Otherwise look for a well-known directory name anywhere along the path.
    path.components().any(|part| match part {
        std::path::Component::Normal(segment) => segment
            .to_str()
            .is_some_and(|text| NONPROD_DIRS.contains(&text)),
        _ => false,
    })
}
/// Lowers a severity by one tier, saturating at the lowest tier.
///
/// `High` becomes `Medium`; `Medium` becomes `Low`; `Low` stays `Low`.
fn downgrade_severity(s: Severity) -> Severity {
    match s {
        Severity::High => Severity::Medium,
        // Both of these end at the floor.
        Severity::Medium | Severity::Low => Severity::Low,
    }
}
// ─────────────────────────────────────────────────────────────────────────────
// Pass 1: Extract function summaries (no taint analysis)
// ─────────────────────────────────────────────────────────────────────────────
@ -84,7 +132,17 @@ pub fn extract_summaries_from_bytes(
})?;
let file_path_str = path.to_string_lossy();
let (_cfg_graph, _entry, local_summaries) = build_cfg(&tree, bytes, lang_slug, &file_path_str);
let lang_rules = build_lang_rules(_cfg, lang_slug);
let rules_ref = if lang_rules.extra_labels.is_empty()
&& lang_rules.terminators.is_empty()
&& lang_rules.event_handlers.is_empty()
{
None
} else {
Some(&lang_rules)
};
let (_cfg_graph, _entry, local_summaries) =
build_cfg(&tree, bytes, lang_slug, &file_path_str, rules_ref);
Ok(export_summaries(
&local_summaries,
@ -95,6 +153,7 @@ pub fn extract_summaries_from_bytes(
/// Convenience wrapper that reads the file then delegates to
/// [`extract_summaries_from_bytes`].
#[allow(dead_code)] // used by benchmarks and lib consumers
pub fn extract_summaries_from_file(path: &Path, cfg: &Config) -> NyxResult<Vec<FuncSummary>> {
let bytes = std::fs::read(path)?;
extract_summaries_from_bytes(&bytes, path, cfg)
@ -142,7 +201,17 @@ pub fn run_rules_on_bytes(
if needs_cfg {
// Build CFG — needed for both taint analysis and CFG structural analyses.
let (cfg_graph, entry, summaries) = build_cfg(&_tree, bytes, lang_slug, &file_path_str);
let lang_rules = build_lang_rules(cfg, lang_slug);
let rules_ref = if lang_rules.extra_labels.is_empty()
&& lang_rules.terminators.is_empty()
&& lang_rules.event_handlers.is_empty()
{
None
} else {
Some(&lang_rules)
};
let (cfg_graph, entry, summaries) =
build_cfg(&_tree, bytes, lang_slug, &file_path_str, rules_ref);
let caller_lang = Lang::from_slug(lang_slug).unwrap_or(Lang::Rust);
// ── Taint analysis ──────────────────────────────────────────────
@ -174,7 +243,7 @@ pub fn run_rules_on_bytes(
path: path.to_string_lossy().into_owned(),
line: sink_point.row + 1,
col: sink_point.column + 1,
severity: Severity::High,
severity: severity_for_source_kind(finding.source_kind),
id: format!(
"taint-unsanitised-flow (source {}:{})",
source_point.row + 1,
@ -184,6 +253,7 @@ pub fn run_rules_on_bytes(
}
// ── CFG structural analyses ─────────────────────────────────────
let taint_active = global_summaries.is_some() || !taint_results.is_empty();
let cfg_ctx = cfg_analysis::AnalysisContext {
cfg: &cfg_graph,
entry,
@ -193,6 +263,8 @@ pub fn run_rules_on_bytes(
func_summaries: &summaries,
global_summaries,
taint_findings: &taint_results,
analysis_rules: rules_ref,
taint_active,
};
for cf in cfg_analysis::run_all(&cfg_ctx) {
let point = byte_offset_to_point(&_tree, cf.span.0);
@ -238,6 +310,13 @@ pub fn run_rules_on_bytes(
a.line == b.line && a.col == b.col && a.id == b.id && a.severity == b.severity
});
// Downgrade severity for non-production paths unless opted out
if !cfg.scanner.include_nonprod && is_nonprod_path(path) {
for d in &mut out {
d.severity = downgrade_severity(d.severity);
}
}
Ok(out)
}
@ -253,6 +332,184 @@ pub fn run_rules_on_file(
run_rules_on_bytes(&bytes, path, cfg, global_summaries, scan_root)
}
// ─────────────────────────────────────────────────────────────────────────────
// Fused single-pass: extract summaries + run full analysis in one parse/CFG
// ─────────────────────────────────────────────────────────────────────────────
/// Result of a fused analysis pass: both function summaries and diagnostics.
///
/// Produced by [`analyse_file_fused`]; both vectors are empty when the input
/// is skipped (binary content or a path with no supported language).
pub struct FusedResult {
/// Function summaries exported from the file's CFG.
pub summaries: Vec<FuncSummary>,
/// Diagnostics collected for the file.
pub diags: Vec<Diag>,
}
/// Parse the file once, build the CFG once, and produce both function
/// summaries (for cross-file resolution) and full diagnostics (AST queries +
/// taint + CFG structural analyses).
///
/// When `global_summaries` is `None`, the taint engine runs with local
/// context only (equivalent to pass 1 + partial pass 2). A second call
/// to [`run_taint_only`] can refine findings with the full cross-file view
/// without re-parsing or re-building the CFG.
pub fn analyse_file_fused(
bytes: &[u8],
path: &Path,
cfg: &Config,
global_summaries: Option<&GlobalSummaries>,
scan_root: Option<&Path>,
) -> NyxResult<FusedResult> {
let _span = tracing::debug_span!("analyse_fused", file = %path.display()).entered();
if is_binary(bytes) {
return Ok(FusedResult {
summaries: vec![],
diags: vec![],
});
}
let Some((ts_lang, lang_slug)) = lang_for_path(path) else {
return Ok(FusedResult {
summaries: vec![],
diags: vec![],
});
};
let tree = PARSER.with(|cell| {
let mut parser = cell.borrow_mut();
parser.set_language(&ts_lang)?;
parser
.parse(bytes, None)
.ok_or_else(|| NyxError::Other("tree-sitter failed".into()))
})?;
let file_path_str = path.to_string_lossy();
// Build language-specific analysis rules once
let lang_rules = build_lang_rules(cfg, lang_slug);
let rules_ref = if lang_rules.extra_labels.is_empty()
&& lang_rules.terminators.is_empty()
&& lang_rules.event_handlers.is_empty()
{
None
} else {
Some(&lang_rules)
};
// Build CFG once — used for both summary extraction AND analysis
let (cfg_graph, entry, local_summaries) =
build_cfg(&tree, bytes, lang_slug, &file_path_str, rules_ref);
// Export summaries (always — needed for cross-file merging)
let summaries = export_summaries(&local_summaries, &file_path_str, lang_slug);
let mut out = Vec::new();
// Taint + CFG structural analyses
let needs_cfg =
cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Taint;
if needs_cfg {
let caller_lang = Lang::from_slug(lang_slug).unwrap_or(Lang::Rust);
let scan_root_str = scan_root.map(|p| p.to_string_lossy());
let namespace = normalize_namespace(&file_path_str, scan_root_str.as_deref());
let taint_results = analyse_file(
&cfg_graph,
entry,
&local_summaries,
global_summaries,
caller_lang,
&namespace,
&[],
);
for finding in &taint_results {
let sink_byte = cfg_graph[finding.sink].span.0;
let sink_point = byte_offset_to_point(&tree, sink_byte);
let source_byte = cfg_graph[finding.source].span.0;
let source_point = byte_offset_to_point(&tree, source_byte);
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: sink_point.row + 1,
col: sink_point.column + 1,
severity: severity_for_source_kind(finding.source_kind),
id: format!(
"taint-unsanitised-flow (source {}:{})",
source_point.row + 1,
source_point.column + 1
),
});
}
let taint_active = global_summaries.is_some() || !taint_results.is_empty();
let cfg_ctx = cfg_analysis::AnalysisContext {
cfg: &cfg_graph,
entry,
lang: caller_lang,
file_path: &file_path_str,
source_bytes: bytes,
func_summaries: &local_summaries,
global_summaries,
taint_findings: &taint_results,
analysis_rules: rules_ref,
taint_active,
};
for cf in cfg_analysis::run_all(&cfg_ctx) {
let point = byte_offset_to_point(&tree, cf.span.0);
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: point.row + 1,
col: point.column + 1,
severity: cf.severity,
id: cf.rule_id,
});
}
}
// AST pattern queries
if cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Ast {
let root = tree.root_node();
let compiled = query_cache::for_lang(lang_slug, ts_lang);
let mut cursor = QueryCursor::new();
for cq in compiled.iter() {
if cfg.scanner.min_severity <= cq.meta.severity {
continue;
}
let mut matches = cursor.matches(&cq.query, root, bytes);
while let Some(m) = matches.next() {
if let Some(cap) = m.captures.iter().find(|c| c.index == 0) {
let point = cap.node.start_position();
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: point.row + 1,
col: point.column + 1,
severity: cq.meta.severity,
id: cq.meta.id.to_owned(),
});
}
}
}
}
// Dedup
out.sort_by(|a, b| (a.line, a.col, &a.id, a.severity).cmp(&(b.line, b.col, &b.id, b.severity)));
out.dedup_by(|a, b| {
a.line == b.line && a.col == b.col && a.id == b.id && a.severity == b.severity
});
// Downgrade severity for non-production paths unless opted out
if !cfg.scanner.include_nonprod && is_nonprod_path(path) {
for d in &mut out {
d.severity = downgrade_severity(d.severity);
}
}
Ok(FusedResult {
summaries,
diags: out,
})
}
#[test]
fn unknown_extension_returns_empty() {
let dir = tempfile::tempdir().unwrap();
@ -279,3 +536,65 @@ fn binary_file_guard_triggers() {
let diags = run_rules_on_file(&bin, &Config::default(), None, None).unwrap();
assert!(diags.is_empty(), "binary files are skipped");
}
#[test]
fn nonprod_path_detection() {
    // Paths that sit under a non-production directory or carry a
    // non-production file name.
    let flagged = [
        "project/tests/test_main.py",
        "src/__tests__/foo.js",
        "benches/bench.rs",
        "vendor/lib/foo.py",
        "src/build.rs",
        "dist/app.min.js",
        "examples/demo.py",
        "fixtures/data.json",
    ];
    for candidate in flagged {
        assert!(
            is_nonprod_path(Path::new(candidate)),
            "expected non-production: {candidate}"
        );
    }

    // Ordinary production paths must not match.
    let production = ["src/main.rs", "lib/handler.py", "app/views.py"];
    for candidate in production {
        assert!(
            !is_nonprod_path(Path::new(candidate)),
            "expected production: {candidate}"
        );
    }
}
#[test]
fn severity_downgrade_works() {
    // (input, expected) pairs: one step down, saturating at Low.
    let cases = [
        (Severity::High, Severity::Medium),
        (Severity::Medium, Severity::Low),
        (Severity::Low, Severity::Low),
    ];
    for (input, expected) in cases {
        assert_eq!(downgrade_severity(input), expected);
    }
}
#[test]
fn nonprod_path_downgrades_findings() {
    let dir = tempfile::tempdir().unwrap();
    // Create a file under a "tests" directory
    let test_dir = dir.path().join("tests");
    std::fs::create_dir_all(&test_dir).unwrap();
    let test_file = test_dir.join("test_cmd.py");
    std::fs::write(
        &test_file,
        b"import os\ndef test():\n cmd = os.environ['X']\n os.system(cmd)\n",
    )
    .unwrap();

    let default_cfg = Config::default();
    let diags = run_rules_on_file(&test_file, &default_cfg, None, None).unwrap();

    // All findings in tests/ should be downgraded (no HIGH)
    let high: Vec<_> = diags
        .iter()
        .filter(|d| d.severity == Severity::High)
        .collect();
    assert!(
        high.is_empty(),
        "Findings in tests/ should be downgraded from HIGH; got {:?}",
        high
    );

    // With include_nonprod=true, original severity preserved
    let mut prod_cfg = Config::default();
    prod_cfg.scanner.include_nonprod = true;
    let diags_prod = run_rules_on_file(&test_file, &prod_cfg, None, None).unwrap();

    // The downgrade rewrites severities in place after sorting and
    // de-duplication, so both runs must contain the same findings in the same
    // order, differing only by exactly one downgrade step.
    assert_eq!(
        diags.len(),
        diags_prod.len(),
        "include_nonprod must not add or remove findings"
    );
    for (downgraded, original) in diags.iter().zip(&diags_prod) {
        assert_eq!(
            downgraded.severity,
            downgrade_severity(original.severity),
            "default run should be exactly one tier below the include_nonprod run; \
             got {:?} vs {:?}",
            downgraded,
            original
        );
    }
}