Feat/configurable sanitizers and js precision (#32)

* chore: Exclude CLAUDE.md from Cargo.toml

* feat: Add configurable analysis rules and CLI commands for custom sanitizers and terminators

* feat: Enhance resource management and analysis efficiency

- Implemented parallel summary merging in `scan_filesystem` using rayon for improved performance.
- Introduced `GlobalSummaries::merge()` for efficient merging of summaries.
- Optimized file reading and hashing to eliminate redundant I/O operations.
- Added `should_scan_with_hash()` and `upsert_file_with_hash()` methods to streamline file processing.
- Enhanced taint analysis with in-place mutations to reduce memory allocations.
- Updated resource acquisition patterns to exclude false positives for `freopen` and wrapper functions.

* feat: Implement severity downgrade for findings in non-production paths and add source kind inference

* feat: Update versioning information in SECURITY.md for new stable line

* feat: Update categories in Cargo.toml to include parser-implementations and text-processing

* feat: Update dependencies in Cargo.lock for improved compatibility and performance

* feat: Update dependencies in Cargo.lock and Cargo.toml for improved compatibility
This commit is contained in:
Eli Peter 2026-02-25 04:02:11 -05:00 committed by GitHub
parent f96a89e7c1
commit 19b578c5c4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
37 changed files with 3775 additions and 432 deletions

View file

@ -2,6 +2,7 @@ use crate::cfg::{build_cfg, export_summaries};
use crate::cfg_analysis;
use crate::commands::scan::Diag;
use crate::errors::{NyxError, NyxResult};
use crate::labels::{build_lang_rules, severity_for_source_kind};
use crate::patterns::Severity;
use crate::summary::{FuncSummary, GlobalSummaries};
use crate::symbol::{Lang, normalize_namespace};
@ -53,6 +54,53 @@ fn is_binary(bytes: &[u8]) -> bool {
bytes.iter().filter(|b| **b == 0).count() * 100 / bytes.len().max(1) > 1
}
/// Report whether `path` lies in a non-production context (tests, vendored
/// code, benchmarks, examples, ...). Callers use the answer to lower the
/// severity of findings that are unlikely to represent attack surface.
///
/// A path qualifies when either
/// * its file name is exactly `build.rs` or ends with `.min.js`, or
/// * any normal path component (the file name included) is exactly one of
///   the known non-production directory names.
///
/// Matching is case-sensitive; components that are not valid UTF-8 never match.
fn is_nonprod_path(path: &Path) -> bool {
    const DIR_NAMES: &[&str] = &[
        "tests",
        "test",
        "__tests__",
        "benches",
        "benchmarks",
        "examples",
        "build",
        "scripts",
        "docs",
        "js_tests",
        "fixtures",
        "vendor",
    ];
    const FILE_NAMES: &[&str] = &["build.rs"];

    // File-name rules first: build scripts and minified bundles.
    let file_rule_hit = path
        .file_name()
        .and_then(|os| os.to_str())
        .map_or(false, |name| {
            FILE_NAMES.contains(&name) || name.ends_with(".min.js")
        });
    if file_rule_hit {
        return true;
    }

    // Otherwise look for a well-known directory name anywhere in the path.
    // Root, prefix, `.` and `..` components are ignored.
    path.components().any(|part| match part {
        std::path::Component::Normal(os) => {
            os.to_str().map_or(false, |s| DIR_NAMES.contains(&s))
        }
        _ => false,
    })
}
/// Lower a severity by one tier, saturating at the bottom of the scale:
/// High→Medium, Medium→Low, Low→Low.
fn downgrade_severity(s: Severity) -> Severity {
    match s {
        Severity::High => Severity::Medium,
        // `Low` is already the floor, so both remaining tiers land on it.
        Severity::Medium | Severity::Low => Severity::Low,
    }
}
// ─────────────────────────────────────────────────────────────────────────────
// Pass 1: Extract function summaries (no taint analysis)
// ─────────────────────────────────────────────────────────────────────────────
@ -84,7 +132,17 @@ pub fn extract_summaries_from_bytes(
})?;
let file_path_str = path.to_string_lossy();
let (_cfg_graph, _entry, local_summaries) = build_cfg(&tree, bytes, lang_slug, &file_path_str);
let lang_rules = build_lang_rules(_cfg, lang_slug);
let rules_ref = if lang_rules.extra_labels.is_empty()
&& lang_rules.terminators.is_empty()
&& lang_rules.event_handlers.is_empty()
{
None
} else {
Some(&lang_rules)
};
let (_cfg_graph, _entry, local_summaries) =
build_cfg(&tree, bytes, lang_slug, &file_path_str, rules_ref);
Ok(export_summaries(
&local_summaries,
@ -95,6 +153,7 @@ pub fn extract_summaries_from_bytes(
/// Convenience wrapper that reads the file then delegates to
/// [`extract_summaries_from_bytes`].
#[allow(dead_code)] // used by benchmarks and lib consumers
pub fn extract_summaries_from_file(path: &Path, cfg: &Config) -> NyxResult<Vec<FuncSummary>> {
let bytes = std::fs::read(path)?;
extract_summaries_from_bytes(&bytes, path, cfg)
@ -142,7 +201,17 @@ pub fn run_rules_on_bytes(
if needs_cfg {
// Build CFG — needed for both taint analysis and CFG structural analyses.
let (cfg_graph, entry, summaries) = build_cfg(&_tree, bytes, lang_slug, &file_path_str);
let lang_rules = build_lang_rules(cfg, lang_slug);
let rules_ref = if lang_rules.extra_labels.is_empty()
&& lang_rules.terminators.is_empty()
&& lang_rules.event_handlers.is_empty()
{
None
} else {
Some(&lang_rules)
};
let (cfg_graph, entry, summaries) =
build_cfg(&_tree, bytes, lang_slug, &file_path_str, rules_ref);
let caller_lang = Lang::from_slug(lang_slug).unwrap_or(Lang::Rust);
// ── Taint analysis ──────────────────────────────────────────────
@ -174,7 +243,7 @@ pub fn run_rules_on_bytes(
path: path.to_string_lossy().into_owned(),
line: sink_point.row + 1,
col: sink_point.column + 1,
severity: Severity::High,
severity: severity_for_source_kind(finding.source_kind),
id: format!(
"taint-unsanitised-flow (source {}:{})",
source_point.row + 1,
@ -184,6 +253,7 @@ pub fn run_rules_on_bytes(
}
// ── CFG structural analyses ─────────────────────────────────────
let taint_active = global_summaries.is_some() || !taint_results.is_empty();
let cfg_ctx = cfg_analysis::AnalysisContext {
cfg: &cfg_graph,
entry,
@ -193,6 +263,8 @@ pub fn run_rules_on_bytes(
func_summaries: &summaries,
global_summaries,
taint_findings: &taint_results,
analysis_rules: rules_ref,
taint_active,
};
for cf in cfg_analysis::run_all(&cfg_ctx) {
let point = byte_offset_to_point(&_tree, cf.span.0);
@ -238,6 +310,13 @@ pub fn run_rules_on_bytes(
a.line == b.line && a.col == b.col && a.id == b.id && a.severity == b.severity
});
// Downgrade severity for non-production paths unless opted out
if !cfg.scanner.include_nonprod && is_nonprod_path(path) {
for d in &mut out {
d.severity = downgrade_severity(d.severity);
}
}
Ok(out)
}
@ -253,6 +332,184 @@ pub fn run_rules_on_file(
run_rules_on_bytes(&bytes, path, cfg, global_summaries, scan_root)
}
// ─────────────────────────────────────────────────────────────────────────────
// Fused single-pass: extract summaries + run full analysis in one parse/CFG
// ─────────────────────────────────────────────────────────────────────────────
/// Result of a fused analysis pass: both function summaries and diagnostics.
pub struct FusedResult {
/// Function summaries exported from the file (empty when the file is
/// skipped as binary or has no recognised language).
pub summaries: Vec<FuncSummary>,
/// Diagnostics produced for the file by whichever analyses the configured
/// mode enables (empty when the file is skipped).
pub diags: Vec<Diag>,
}
/// Parse the file once, build the CFG once, and produce both function
/// summaries (for cross-file resolution) and full diagnostics (AST queries +
/// taint + CFG structural analyses).
///
/// When `global_summaries` is `None`, the taint engine runs with local
/// context only (equivalent to pass 1 + partial pass 2). A second call
/// to [`run_taint_only`] can refine findings with the full cross-file view
/// without re-parsing or re-building the CFG.
///
/// # Parameters
/// * `bytes` – raw file contents; parsed directly, never re-read from disk.
/// * `path` – used for language detection, diagnostic paths and the
///   non-production-path check.
/// * `cfg` – scanner configuration (analysis mode, minimum severity,
///   `include_nonprod`, and the per-language rules built from it).
/// * `global_summaries` – optional cross-file summaries forwarded to the
///   taint engine and the CFG analyses.
/// * `scan_root` – optional root used when normalising the file's namespace.
///
/// # Returns
/// A [`FusedResult`]. Binary input and paths without a recognised language
/// yield an empty result (no summaries, no diagnostics) rather than an error.
///
/// # Errors
/// Propagates the error from setting the parser language, and returns
/// `NyxError::Other("tree-sitter failed")` when the parser produces no tree.
pub fn analyse_file_fused(
bytes: &[u8],
path: &Path,
cfg: &Config,
global_summaries: Option<&GlobalSummaries>,
scan_root: Option<&Path>,
) -> NyxResult<FusedResult> {
let _span = tracing::debug_span!("analyse_fused", file = %path.display()).entered();
// Binary input is skipped: empty result, not an error.
if is_binary(bytes) {
return Ok(FusedResult {
summaries: vec![],
diags: vec![],
});
}
// Paths for which `lang_for_path` has no language are skipped the same way.
let Some((ts_lang, lang_slug)) = lang_for_path(path) else {
return Ok(FusedResult {
summaries: vec![],
diags: vec![],
});
};
// Parse with the shared `PARSER` (accessed via `.with` + `borrow_mut`;
// presumably a thread-local `RefCell` — defined outside this view).
let tree = PARSER.with(|cell| {
let mut parser = cell.borrow_mut();
parser.set_language(&ts_lang)?;
parser
.parse(bytes, None)
.ok_or_else(|| NyxError::Other("tree-sitter failed".into()))
})?;
let file_path_str = path.to_string_lossy();
// Build language-specific analysis rules once
let lang_rules = build_lang_rules(cfg, lang_slug);
// Pass `None` downstream when nothing is configured (no extra labels,
// terminators or event handlers), otherwise borrow the rules.
let rules_ref = if lang_rules.extra_labels.is_empty()
&& lang_rules.terminators.is_empty()
&& lang_rules.event_handlers.is_empty()
{
None
} else {
Some(&lang_rules)
};
// Build CFG once — used for both summary extraction AND analysis
let (cfg_graph, entry, local_summaries) =
build_cfg(&tree, bytes, lang_slug, &file_path_str, rules_ref);
// Export summaries (always — needed for cross-file merging)
let summaries = export_summaries(&local_summaries, &file_path_str, lang_slug);
let mut out = Vec::new();
// Taint + CFG structural analyses
let needs_cfg =
cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Taint;
if needs_cfg {
// Unknown language slugs fall back to `Lang::Rust`.
let caller_lang = Lang::from_slug(lang_slug).unwrap_or(Lang::Rust);
let scan_root_str = scan_root.map(|p| p.to_string_lossy());
let namespace = normalize_namespace(&file_path_str, scan_root_str.as_deref());
// NOTE(review): the final argument is always an empty slice here; its
// meaning is defined by `analyse_file`, which is outside this view.
let taint_results = analyse_file(
&cfg_graph,
entry,
&local_summaries,
global_summaries,
caller_lang,
&namespace,
&[],
);
// One diagnostic per taint finding, located at the sink; the source
// position is embedded in the diagnostic id. Rows/columns are 1-based.
for finding in &taint_results {
let sink_byte = cfg_graph[finding.sink].span.0;
let sink_point = byte_offset_to_point(&tree, sink_byte);
let source_byte = cfg_graph[finding.source].span.0;
let source_point = byte_offset_to_point(&tree, source_byte);
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: sink_point.row + 1,
col: sink_point.column + 1,
severity: severity_for_source_kind(finding.source_kind),
id: format!(
"taint-unsanitised-flow (source {}:{})",
source_point.row + 1,
source_point.column + 1
),
});
}
// Taint counts as active when cross-file summaries were supplied or the
// local run produced at least one finding.
let taint_active = global_summaries.is_some() || !taint_results.is_empty();
let cfg_ctx = cfg_analysis::AnalysisContext {
cfg: &cfg_graph,
entry,
lang: caller_lang,
file_path: &file_path_str,
source_bytes: bytes,
func_summaries: &local_summaries,
global_summaries,
taint_findings: &taint_results,
analysis_rules: rules_ref,
taint_active,
};
// Structural findings are reported at the start of their span.
for cf in cfg_analysis::run_all(&cfg_ctx) {
let point = byte_offset_to_point(&tree, cf.span.0);
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: point.row + 1,
col: point.column + 1,
severity: cf.severity,
id: cf.rule_id,
});
}
}
// AST pattern queries
if cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Ast {
let root = tree.root_node();
let compiled = query_cache::for_lang(lang_slug, ts_lang);
let mut cursor = QueryCursor::new();
for cq in compiled.iter() {
// NOTE(review): queries are skipped when `min_severity <= severity`;
// whether that filters in the intended direction depends on how
// `Severity` is ordered — confirm against its definition.
if cfg.scanner.min_severity <= cq.meta.severity {
continue;
}
let mut matches = cursor.matches(&cq.query, root, bytes);
while let Some(m) = matches.next() {
// Only the capture with index 0 is used to locate the diagnostic;
// matches without such a capture produce nothing.
if let Some(cap) = m.captures.iter().find(|c| c.index == 0) {
let point = cap.node.start_position();
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: point.row + 1,
col: point.column + 1,
severity: cq.meta.severity,
id: cq.meta.id.to_owned(),
});
}
}
}
}
// Dedup
// Sort by (line, col, id, severity) so identical diagnostics are adjacent,
// then drop the adjacent duplicates.
out.sort_by(|a, b| (a.line, a.col, &a.id, a.severity).cmp(&(b.line, b.col, &b.id, b.severity)));
out.dedup_by(|a, b| {
a.line == b.line && a.col == b.col && a.id == b.id && a.severity == b.severity
});
// Downgrade severity for non-production paths unless opted out
// (runs after the dedup above, so it applies to the de-duplicated list).
if !cfg.scanner.include_nonprod && is_nonprod_path(path) {
for d in &mut out {
d.severity = downgrade_severity(d.severity);
}
}
Ok(FusedResult {
summaries,
diags: out,
})
}
#[test]
fn unknown_extension_returns_empty() {
let dir = tempfile::tempdir().unwrap();
@ -279,3 +536,65 @@ fn binary_file_guard_triggers() {
let diags = run_rules_on_file(&bin, &Config::default(), None, None).unwrap();
assert!(diags.is_empty(), "binary files are skipped");
}
#[test]
fn nonprod_path_detection() {
    // Small adapter so each case reads as a single string literal.
    let flagged = |p: &str| is_nonprod_path(Path::new(p));

    // Common non-production locations must be recognised.
    assert!(flagged("project/tests/test_main.py"));
    assert!(flagged("src/__tests__/foo.js"));
    assert!(flagged("benches/bench.rs"));
    assert!(flagged("vendor/lib/foo.py"));
    assert!(flagged("src/build.rs"));
    assert!(flagged("dist/app.min.js"));
    assert!(flagged("examples/demo.py"));
    assert!(flagged("fixtures/data.json"));

    // Ordinary production paths must NOT be flagged.
    assert!(!flagged("src/main.rs"));
    assert!(!flagged("lib/handler.py"));
    assert!(!flagged("app/views.py"));
}
#[test]
fn severity_downgrade_works() {
    // (input, expected) pairs: one tier down, saturating at Low.
    let cases = [
        (Severity::High, Severity::Medium),
        (Severity::Medium, Severity::Low),
        (Severity::Low, Severity::Low),
    ];
    for (input, expected) in cases {
        assert_eq!(downgrade_severity(input), expected);
    }
}
#[test]
fn nonprod_path_downgrades_findings() {
    let dir = tempfile::tempdir().unwrap();
    // Create a file under a "tests" directory so the path counts as non-production.
    let test_dir = dir.path().join("tests");
    std::fs::create_dir_all(&test_dir).unwrap();
    let test_file = test_dir.join("test_cmd.py");
    std::fs::write(
        &test_file,
        b"import os\ndef test():\n cmd = os.environ['X']\n os.system(cmd)\n",
    )
    .unwrap();

    // Default config: findings in tests/ are downgraded, so none may be HIGH.
    let default_cfg = Config::default();
    let diags = run_rules_on_file(&test_file, &default_cfg, None, None).unwrap();
    let high: Vec<_> = diags
        .iter()
        .filter(|d| d.severity == Severity::High)
        .collect();
    assert!(
        high.is_empty(),
        "Findings in tests/ should be downgraded from HIGH; got {:?}",
        high
    );

    // With include_nonprod=true the original severities are preserved.
    let mut prod_cfg = Config::default();
    prod_cfg.scanner.include_nonprod = true;
    let diags_prod = run_rules_on_file(&test_file, &prod_cfg, None, None).unwrap();

    // Both runs sort and de-duplicate before the optional downgrade, so they
    // must report the same findings in the same order and differ only by the
    // one-tier severity downgrade.
    assert_eq!(
        diags.len(),
        diags_prod.len(),
        "include_nonprod must not add or remove findings"
    );
    for (downgraded, original) in diags.iter().zip(&diags_prod) {
        assert_eq!(downgraded.line, original.line);
        assert_eq!(downgraded.col, original.col);
        assert_eq!(downgraded.id, original.id);
        assert_eq!(
            downgraded.severity,
            downgrade_severity(original.severity),
            "default run must be exactly one tier below the include_nonprod run"
        );
    }
}

View file

@ -3,7 +3,7 @@ use petgraph::prelude::*;
use tracing::debug;
use tree_sitter::{Node, Tree};
use crate::labels::{Cap, DataLabel, Kind, classify, lookup, param_config};
use crate::labels::{Cap, DataLabel, Kind, LangAnalysisRules, classify, lookup, param_config};
use crate::summary::FuncSummary;
use crate::symbol::{FuncKey, Lang};
use std::collections::{HashMap, HashSet};
@ -186,7 +186,12 @@ fn member_expr_text(n: Node, code: &[u8]) -> Option<String> {
}
/// Recursively search `n` for a member expression whose text classifies as a label.
fn first_member_label(n: Node, lang: &str, code: &[u8]) -> Option<DataLabel> {
fn first_member_label(
n: Node,
lang: &str,
code: &[u8],
extra_labels: Option<&[crate::labels::RuntimeLabelRule]>,
) -> Option<DataLabel> {
match n.kind() {
"member_expression" | "attribute" | "selector_expression" => {
if let Some(full) = member_expr_text(n, code) {
@ -194,7 +199,7 @@ fn first_member_label(n: Node, lang: &str, code: &[u8]) -> Option<DataLabel> {
// to match rules like "process.env" from "process.env.CMD".
let mut candidate = full.as_str();
loop {
if let Some(lbl) = classify(lang, candidate) {
if let Some(lbl) = classify(lang, candidate, extra_labels) {
return Some(lbl);
}
match candidate.rsplit_once('.') {
@ -208,7 +213,7 @@ fn first_member_label(n: Node, lang: &str, code: &[u8]) -> Option<DataLabel> {
}
let mut cursor = n.walk();
for child in n.children(&mut cursor) {
if let Some(lbl) = first_member_label(child, lang, code) {
if let Some(lbl) = first_member_label(child, lang, code, extra_labels) {
return Some(lbl);
}
}
@ -366,6 +371,7 @@ fn def_use(ast: Node, lang: &str, code: &[u8]) -> (Option<String>, Vec<String>)
}
/// Create a node in one short borrow and optionally attach a taint label.
#[allow(clippy::too_many_arguments)]
fn push_node<'a>(
g: &mut Cfg,
kind: StmtKind,
@ -374,6 +380,7 @@ fn push_node<'a>(
code: &'a [u8],
enclosing_func: Option<&str>,
call_ordinal: u32,
analysis_rules: Option<&LangAnalysisRules>,
) -> NodeIndex {
/* ── 1. IDENTIFIER EXTRACTION ─────────────────────────────────────── */
@ -427,7 +434,8 @@ fn push_node<'a>(
/* ── 2. LABEL LOOK-UP ───────────────────────────────────────────── */
let mut label = classify(lang, &text);
let extra = analysis_rules.map(|r| r.extra_labels.as_slice());
let mut label = classify(lang, &text, extra);
// For assignments like `element.innerHTML = value`, the inner-call heuristic
// above may have overridden `text` with a call on the RHS (e.g. getElementById).
@ -450,10 +458,20 @@ fn push_node<'a>(
if let Some(assign) = assign_node
&& let Some(lhs) = assign.child_by_field_name("left")
&& let Some(prop) = lhs.child_by_field_name("property")
&& let Some(prop_text) = text_of(prop, code)
{
label = classify(lang, &prop_text);
// Try full member expression first (e.g. "location.href") — more
// specific and avoids false positives on `a.href`.
if let Some(full) = member_expr_text(lhs, code) {
label = classify(lang, &full, extra);
}
// Fall back to property-only (e.g. "innerHTML") for sinks that
// don't need object context.
if label.is_none()
&& let Some(prop) = lhs.child_by_field_name("property")
&& let Some(prop_text) = text_of(prop, code)
{
label = classify(lang, &prop_text, extra);
}
}
}
@ -466,7 +484,7 @@ fn push_node<'a>(
lookup(lang, ast.kind()),
Kind::CallWrapper | Kind::Assignment
)
&& let Some(found) = first_member_label(ast, lang, code)
&& let Some(found) = first_member_label(ast, lang, code, extra)
{
label = Some(found);
// Update text so the callee name reflects the source
@ -564,6 +582,19 @@ fn extract_param_names<'a>(func_node: Node<'a>, lang: &str, code: &'a [u8]) -> V
names
}
/// Check if a callee name matches any configured terminator.
///
/// The whole `callee` string must equal a terminator entry; the comparison
/// ignores ASCII case. Returns `false` when no analysis rules are supplied.
fn is_configured_terminator(callee: &str, analysis_rules: Option<&LangAnalysisRules>) -> bool {
    analysis_rules.is_some_and(|rules| {
        // `eq_ignore_ascii_case` compares in place, so no lowercase copies of
        // the callee or of each terminator are allocated per call.
        rules
            .terminators
            .iter()
            .any(|t| t.eq_ignore_ascii_case(callee))
    })
}
/// Add the same edge (of the same kind) from every node in `froms` to `to`.
#[inline]
fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind: EdgeKind) {
@ -588,6 +619,9 @@ fn build_sub<'a>(
file_path: &str,
enclosing_func: Option<&str>,
call_ordinal: &mut u32,
analysis_rules: Option<&LangAnalysisRules>,
break_targets: &mut Vec<NodeIndex>,
continue_targets: &mut Vec<NodeIndex>,
) -> Vec<NodeIndex> {
match lookup(lang, ast.kind()) {
// ─────────────────────────────────────────────────────────────────
@ -595,7 +629,16 @@ fn build_sub<'a>(
// ─────────────────────────────────────────────────────────────────
Kind::If => {
// Condition node
let cond = push_node(g, StmtKind::If, ast, lang, code, enclosing_func, 0);
let cond = push_node(
g,
StmtKind::If,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, cond, EdgeKind::Seq);
// Locate then & else blocks using field-based lookup first,
@ -620,6 +663,7 @@ fn build_sub<'a>(
};
// THEN branch
let then_first_node = NodeIndex::new(g.node_count());
let then_exits = if let Some(b) = then_block {
let exits = build_sub(
b,
@ -631,9 +675,17 @@ fn build_sub<'a>(
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
);
// True edges leave the condition
if let Some(&first) = exits.first() {
// Add True edge from condition to first node of then-branch.
// We use the first node created (by index) rather than the
// exit, because the branch may terminate (return/break) and
// have no exits.
if then_first_node.index() < g.node_count() {
connect_all(g, &[cond], then_first_node, EdgeKind::True);
} else if let Some(&first) = exits.first() {
connect_all(g, &[cond], first, EdgeKind::True);
}
exits
@ -642,6 +694,7 @@ fn build_sub<'a>(
};
// ELSE branch
let else_first_node = NodeIndex::new(g.node_count());
let else_exits = if let Some(b) = else_block {
let exits = build_sub(
b,
@ -653,17 +706,30 @@ fn build_sub<'a>(
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
);
if let Some(&first) = exits.first() {
if else_first_node.index() < g.node_count() {
connect_all(g, &[cond], else_first_node, EdgeKind::False);
} else if let Some(&first) = exits.first() {
connect_all(g, &[cond], first, EdgeKind::False);
}
exits
} else {
// No explicit else → non-taken branch flows to the *then* exits
if let Some(&first) = then_exits.first() {
connect_all(g, &[cond], first, EdgeKind::False);
// No explicit else → if the then-branch falls through
// (non-empty exits), the false branch merges with those exits.
// If the then-branch terminates (break/return/continue →
// empty exits), the false branch flows from the condition
// to whatever comes next.
if then_exits.is_empty() {
vec![cond]
} else {
if let Some(&first) = then_exits.first() {
connect_all(g, &[cond], first, EdgeKind::False);
}
then_exits.clone()
}
then_exits.clone()
};
// Frontier = union of both branches
@ -672,9 +738,22 @@ fn build_sub<'a>(
Kind::InfiniteLoop => {
// Synthetic header node
let header = push_node(g, StmtKind::Loop, ast, lang, code, enclosing_func, 0);
let header = push_node(
g,
StmtKind::Loop,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, header, EdgeKind::Seq);
// Fresh break/continue targets scoped to this loop
let mut loop_breaks = Vec::new();
let mut loop_continues = Vec::new();
// The body is the single `block` child
let body = ast.child_by_field_name("body").expect("loop without body");
let body_exits = build_sub(
@ -687,23 +766,49 @@ fn build_sub<'a>(
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
&mut loop_breaks,
&mut loop_continues,
);
// Back-edge from every linear exit to header
for &e in &body_exits {
connect_all(g, &[e], header, EdgeKind::Back);
}
// `loop` may break → those exits are frontiers too
body_exits.into_iter().chain([header]).collect()
// Wire continue targets as back edges to header
for &c in &loop_continues {
connect_all(g, &[c], header, EdgeKind::Back);
}
// Break targets become exits of the loop
if loop_breaks.is_empty() {
// No break → infinite loop; header is the only exit for
// downstream code (fallthrough semantics)
vec![header]
} else {
loop_breaks
}
}
// ─────────────────────────────────────────────────────────────────
// WHILE / FOR: classic loop with a back edge.
// ─────────────────────────────────────────────────────────────────
Kind::While | Kind::For => {
let header = push_node(g, StmtKind::Loop, ast, lang, code, enclosing_func, 0);
let header = push_node(
g,
StmtKind::Loop,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, header, EdgeKind::Seq);
// Fresh break/continue targets scoped to this loop
let mut loop_breaks = Vec::new();
let mut loop_continues = Vec::new();
// Body = first (and usually only) block child.
let body = ast
.child_by_field_name("body")
@ -724,14 +829,24 @@ fn build_sub<'a>(
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
&mut loop_breaks,
&mut loop_continues,
);
// Backedge for every linear exit → header.
for &e in &body_exits {
connect_all(g, &[e], header, EdgeKind::Back);
}
// Falling out of the loop = headers false branch.
vec![header]
// Wire continue targets as back edges to header
for &c in &loop_continues {
connect_all(g, &[c], header, EdgeKind::Back);
}
// Falling out of the loop = header's false branch +
// any break targets that exit the loop.
let mut exits = vec![header];
exits.extend(loop_breaks);
exits
}
// ─────────────────────────────────────────────────────────────────
@ -743,25 +858,72 @@ fn build_sub<'a>(
// that callee labels (source/sanitizer/sink) are applied.
let ord = *call_ordinal;
*call_ordinal += 1;
let call_idx = push_node(g, StmtKind::Call, ast, lang, code, enclosing_func, ord);
let call_idx = push_node(
g,
StmtKind::Call,
ast,
lang,
code,
enclosing_func,
ord,
analysis_rules,
);
connect_all(g, preds, call_idx, EdgeKind::Seq);
let ret = push_node(g, StmtKind::Return, ast, lang, code, enclosing_func, 0);
let ret = push_node(
g,
StmtKind::Return,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, &[call_idx], ret, EdgeKind::Seq);
Vec::new()
} else {
let ret = push_node(g, StmtKind::Return, ast, lang, code, enclosing_func, 0);
let ret = push_node(
g,
StmtKind::Return,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, ret, EdgeKind::Seq);
Vec::new() // terminates this path
}
}
Kind::Break => {
let brk = push_node(g, StmtKind::Break, ast, lang, code, enclosing_func, 0);
let brk = push_node(
g,
StmtKind::Break,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, brk, EdgeKind::Seq);
break_targets.push(brk);
Vec::new()
}
Kind::Continue => {
let cont = push_node(g, StmtKind::Continue, ast, lang, code, enclosing_func, 0);
let cont = push_node(
g,
StmtKind::Continue,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, cont, EdgeKind::Seq);
continue_targets.push(cont);
Vec::new()
}
@ -774,6 +936,7 @@ fn build_sub<'a>(
// Track the last frontier before a function emptied it — used to
// keep subsequent functions reachable.
let mut last_live_frontier = preds.to_vec();
let mut prev_was_preproc = false;
for child in ast.children(&mut cursor) {
let child_is_fn = lookup(lang, child.kind()) == Kind::Function;
@ -782,7 +945,13 @@ fn build_sub<'a>(
// file-level predecessors. Without this, a preceding function
// that ends with `return` (frontier = []) would leave subsequent
// functions disconnected from the graph.
let child_preds = if child_is_fn && frontier.is_empty() {
//
// Similarly, when a preprocessor block (`#ifdef ... #endif`)
// contains an `if/else` whose else branch is on the other side
// of the `#endif`, tree-sitter parses a dangling else that
// empties the frontier. The code after the preproc block should
// remain reachable.
let child_preds = if frontier.is_empty() && (child_is_fn || prev_was_preproc) {
last_live_frontier.clone()
} else {
frontier.clone()
@ -798,12 +967,17 @@ fn build_sub<'a>(
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
);
let is_preproc = child.kind().starts_with("preproc_");
if !child_exits.is_empty() {
last_live_frontier = child_exits.clone();
}
frontier = child_exits;
prev_was_preproc = is_preproc;
}
frontier
}
@ -822,7 +996,16 @@ fn build_sub<'a>(
tmp.into_iter().next()
})
.unwrap_or_else(|| "<anon>".to_string());
let entry_idx = push_node(g, StmtKind::Seq, ast, lang, code, Some(&fn_name), 0);
let entry_idx = push_node(
g,
StmtKind::Seq,
ast,
lang,
code,
Some(&fn_name),
0,
analysis_rules,
);
connect_all(g, preds, entry_idx, EdgeKind::Seq);
// 1b) extract parameter names
@ -830,8 +1013,13 @@ fn build_sub<'a>(
let param_count = param_names.len();
// 2) build its body with a fresh call ordinal counter for this function scope
// Snapshot the current node count so we can iterate only over nodes
// created within this function (avoids O(N²) scan of the full graph).
let fn_first_node: NodeIndex = NodeIndex::new(g.node_count());
let body = ast.child_by_field_name("body").expect("fn w/o body");
let mut fn_call_ordinal: u32 = 0;
let mut fn_breaks = Vec::new();
let mut fn_continues = Vec::new();
let body_exits = build_sub(
body,
&[entry_idx],
@ -842,6 +1030,9 @@ fn build_sub<'a>(
file_path,
Some(&fn_name),
&mut fn_call_ordinal,
analysis_rules,
&mut fn_breaks,
&mut fn_continues,
);
// ───── 3) light-weight dataflow ──────────────────────────────────────
@ -862,11 +1053,12 @@ fn build_sub<'a>(
let param_set: HashSet<&str> = param_names.iter().map(|s| s.as_str()).collect();
for idx in g.node_indices() {
// Iterate only over nodes created within this function scope
// (entry_idx .. current end) instead of the entire graph.
let fn_node_range = entry_idx.index()..g.node_count();
for raw in fn_node_range {
let idx = NodeIndex::new(raw);
let info = &g[idx];
if info.span.0 < ast.start_byte() || info.span.1 > ast.end_byte() {
continue;
}
// collect callee names
if let Some(callee) = &info.callee
@ -1010,11 +1202,12 @@ fn build_sub<'a>(
// this edge, the synthetic exit node is unreachable whenever
// the function body ends with a `return` statement, which
// disconnects all subsequent functions at the module level.
for idx in g.node_indices() {
//
// Only scan nodes created within this function scope.
for raw in fn_first_node.index()..g.node_count() {
let idx = NodeIndex::new(raw);
let info = &g[idx];
if info.kind == StmtKind::Return
&& info.span.0 >= ast.start_byte()
&& info.span.1 <= ast.end_byte()
&& idx != exit_idx
&& !g.contains_edge(idx, exit_idx)
{
@ -1068,6 +1261,9 @@ fn build_sub<'a>(
file_path,
enclosing_func,
call_ordinal,
analysis_rules,
break_targets,
continue_targets,
);
}
@ -1085,8 +1281,25 @@ fn build_sub<'a>(
} else {
0
};
let node = push_node(g, kind, ast, lang, code, enclosing_func, ord);
let node = push_node(
g,
kind,
ast,
lang,
code,
enclosing_func,
ord,
analysis_rules,
);
connect_all(g, preds, node, EdgeKind::Seq);
// If the callee is a configured terminator, treat as a dead end
if kind == StmtKind::Call
&& let Some(callee) = &g[node].callee
&& is_configured_terminator(callee, analysis_rules)
{
return Vec::new();
}
vec![node]
}
@ -1095,8 +1308,24 @@ fn build_sub<'a>(
Kind::CallFn | Kind::CallMethod | Kind::CallMacro => {
let ord = *call_ordinal;
*call_ordinal += 1;
let n = push_node(g, StmtKind::Call, ast, lang, code, enclosing_func, ord);
let n = push_node(
g,
StmtKind::Call,
ast,
lang,
code,
enclosing_func,
ord,
analysis_rules,
);
connect_all(g, preds, n, EdgeKind::Seq);
// If the callee is a configured terminator, treat as a dead end
if let Some(callee) = &g[n].callee
&& is_configured_terminator(callee, analysis_rules)
{
return Vec::new();
}
vec![n]
}
@ -1115,7 +1344,16 @@ fn build_sub<'a>(
} else {
0
};
let n = push_node(g, kind, ast, lang, code, enclosing_func, ord);
let n = push_node(
g,
kind,
ast,
lang,
code,
enclosing_func,
ord,
analysis_rules,
);
connect_all(g, preds, n, EdgeKind::Seq);
vec![n]
}
@ -1127,7 +1365,16 @@ fn build_sub<'a>(
// Every other node = simple sequential statement
// ─────────────────────────────────────────────────────────────────
_ => {
let n = push_node(g, StmtKind::Seq, ast, lang, code, enclosing_func, 0);
let n = push_node(
g,
StmtKind::Seq,
ast,
lang,
code,
enclosing_func,
0,
analysis_rules,
);
connect_all(g, preds, n, EdgeKind::Seq);
vec![n]
}
@ -1150,6 +1397,7 @@ pub(crate) fn build_cfg<'a>(
code: &'a [u8],
lang: &str,
file_path: &str,
analysis_rules: Option<&LangAnalysisRules>,
) -> (Cfg, NodeIndex, FuncSummaries) {
debug!(target: "cfg", "Building CFG for {:?}", tree.root_node());
@ -1178,6 +1426,8 @@ pub(crate) fn build_cfg<'a>(
// Build the body below the synthetic ENTRY.
let mut top_ordinal: u32 = 0;
let mut top_breaks = Vec::new();
let mut top_continues = Vec::new();
let exits = build_sub(
tree.root_node(),
&[entry],
@ -1188,6 +1438,9 @@ pub(crate) fn build_cfg<'a>(
file_path,
None,
&mut top_ordinal,
analysis_rules,
&mut top_breaks,
&mut top_continues,
);
debug!(target: "cfg", "exits: {:?}", exits);
// Wire every real exit to our synthetic EXIT node.

View file

@ -2,15 +2,75 @@ use super::dominators::{self, dominates};
use super::rules;
use super::{AnalysisContext, CfgAnalysis, CfgFinding, Confidence, is_entry_point_func};
use crate::cfg::StmtKind;
use crate::labels::{Cap, DataLabel};
use crate::labels::{Cap, DataLabel, RuntimeLabelRule};
use crate::patterns::Severity;
use petgraph::graph::NodeIndex;
pub struct UnguardedSink;
/// Check whether **all** arguments to the sink are constants (no taint-capable
/// variable flows). Extends the inline callee-part check by tracing one hop
/// through the CFG: a used variable counts as constant only when it has at
/// least one definition in the sink's enclosing function and **every** such
/// definition has empty `uses` and no Source label
/// (e.g. `let cmd = "git"; Command::new(cmd)`).
///
/// The check is flow-insensitive, so a variable that is ever (re)assigned from
/// non-constant data in the same function is conservatively treated as
/// non-constant. A sink with no `uses` at all is reported as constant.
fn is_all_args_constant(ctx: &AnalysisContext, sink: NodeIndex) -> bool {
    let sink_info = &ctx.cfg[sink];
    let callee_desc = sink_info.callee.as_deref().unwrap_or("");
    let callee_parts: Vec<&str> = callee_desc.split(['.', ':']).collect();
    let sink_func = sink_info.enclosing_func.as_deref();

    sink_info.uses.iter().all(|used| {
        // Part of the callee name itself → constant
        if callee_parts.contains(&used.as_str()) {
            return true;
        }

        // One-hop trace over every definition of `used` in the same function.
        let mut saw_definition = false;
        for idx in ctx.cfg.node_indices() {
            let info = &ctx.cfg[idx];
            if info.enclosing_func.as_deref() != sink_func
                || info.defines.as_deref() != Some(used.as_str())
            {
                continue;
            }
            // A definition is constant when it reads no variables and is not
            // itself a Source.
            let is_constant_def =
                info.uses.is_empty() && !matches!(info.label, Some(DataLabel::Source(_)));
            if !is_constant_def {
                // A single non-constant definition is enough to disqualify
                // the variable; stopping at the first constant one would hide
                // later reassignments from non-constant data.
                return false;
            }
            saw_definition = true;
        }

        // No visible definition (e.g. a parameter or outer binding) → unknown,
        // therefore not constant.
        saw_definition
    })
}
/// Look up `callee` among the configured runtime label rules and return the
/// capability of the first sanitizer rule that matches it.
///
/// Matching is ASCII case-insensitive. A matcher ending in `_` is treated as
/// a prefix of the callee; any other matcher is treated as a suffix. Rules
/// whose label is not a sanitizer are ignored.
fn match_config_sanitizer(callee: &str, extra: &[RuntimeLabelRule]) -> Option<Cap> {
    let needle = callee.to_ascii_lowercase();

    extra.iter().find_map(|rule| {
        // Only sanitizer rules can act as guards here.
        let DataLabel::Sanitizer(cap) = rule.label else {
            return None;
        };

        let hit = rule.matchers.iter().any(|raw| {
            let pattern = raw.to_ascii_lowercase();
            if pattern.ends_with('_') {
                needle.starts_with(&pattern)
            } else {
                needle.ends_with(&pattern)
            }
        });

        hit.then_some(cap)
    })
}
/// Find all nodes in the CFG that are calls to guard functions.
fn find_guard_nodes(ctx: &AnalysisContext) -> Vec<(NodeIndex, Cap)> {
let guard_rules = rules::guard_rules(ctx.lang);
let config_rules = ctx
.analysis_rules
.map(|r| r.extra_labels.as_slice())
.unwrap_or(&[]);
let mut result = Vec::new();
for idx in ctx.cfg.node_indices() {
@ -19,6 +79,13 @@ fn find_guard_nodes(ctx: &AnalysisContext) -> Vec<(NodeIndex, Cap)> {
continue;
}
if let Some(callee) = &info.callee {
// Check config sanitizer rules first
if let Some(cap) = match_config_sanitizer(callee, config_rules) {
result.push((idx, cap));
continue;
}
// Then check built-in guard rules
let callee_lower = callee.to_ascii_lowercase();
for rule in guard_rules {
let matched = rule.matchers.iter().any(|m| {
@ -174,6 +241,13 @@ impl CfgAnalysis for UnguardedSink {
let has_taint = taint_confirms_sink(ctx, *sink);
let source_derived = sink_arg_is_source_derived(ctx, *sink);
// If sink args are all constants (including one-hop constant bindings)
// and taint didn't confirm, this is a false positive — skip it.
if is_all_args_constant(ctx, *sink) && !has_taint && !source_derived {
continue;
}
let param_only = sink_arg_is_parameter_only(ctx, *sink);
let in_entrypoint = sink_in_entrypoint(ctx, *sink);
@ -183,6 +257,9 @@ impl CfgAnalysis for UnguardedSink {
} else if param_only && !in_entrypoint {
// Wrapper function consuming only parameters → LOW
(Severity::Low, Confidence::Low)
} else if !ctx.taint_active && !source_derived {
// CFG-only mode without taint confirmation → LOW
(Severity::Low, Confidence::Low)
} else if in_entrypoint && !param_only {
// Entrypoint with non-parameter args but no taint confirmation → MEDIUM
(Severity::Medium, Confidence::Medium)

View file

@ -10,7 +10,7 @@ mod tests;
pub mod unreachable;
use crate::cfg::{FuncSummaries, NodeInfo, StmtKind};
use crate::labels::DataLabel;
use crate::labels::{DataLabel, LangAnalysisRules};
use crate::patterns::Severity;
use crate::summary::GlobalSummaries;
use crate::symbol::Lang;
@ -51,6 +51,11 @@ pub struct AnalysisContext<'a> {
#[allow(dead_code)]
pub global_summaries: Option<&'a GlobalSummaries>,
pub taint_findings: &'a [taint::Finding],
pub analysis_rules: Option<&'a LangAnalysisRules>,
/// Whether full taint analysis was active for this file (global summaries
/// existed and taint engine ran). When false, structural findings without
/// taint confirmation should be treated with lower confidence.
pub taint_active: bool,
}
pub trait CfgAnalysis {
@ -87,6 +92,20 @@ pub fn run_all(ctx: &AnalysisContext) -> Vec<CfgFinding> {
true
});
// ── Dedup: suppress cfg-unguarded-sink when cfg-unreachable-sink covers the span ──
let unreachable_spans: HashSet<(usize, usize)> = findings
.iter()
.filter(|f| f.rule_id == "cfg-unreachable-sink")
.map(|f| f.span)
.collect();
findings.retain(|f| {
if f.rule_id == "cfg-unguarded-sink" && unreachable_spans.contains(&f.span) {
return false;
}
true
});
scoring::score_findings(&mut findings, ctx);
findings.sort_by(|a, b| {
b.score
@ -97,11 +116,36 @@ pub fn run_all(ctx: &AnalysisContext) -> Vec<CfgFinding> {
}
/// Helper: check whether a node is a guard call (validate, sanitize, check, etc.).
pub(crate) fn is_guard_call(info: &NodeInfo, lang: Lang) -> bool {
pub(crate) fn is_guard_call(
info: &NodeInfo,
lang: Lang,
analysis_rules: Option<&LangAnalysisRules>,
) -> bool {
if info.kind != StmtKind::Call {
return false;
}
if let Some(callee) = &info.callee {
// Check config sanitizer rules
if let Some(extras) = analysis_rules {
let callee_lower = callee.to_ascii_lowercase();
for rule in &extras.extra_labels {
if !matches!(rule.label, DataLabel::Sanitizer(_)) {
continue;
}
for m in &rule.matchers {
let ml = m.to_ascii_lowercase();
if ml.ends_with('_') {
if callee_lower.starts_with(&ml) {
return true;
}
} else if callee_lower.ends_with(&ml) {
return true;
}
}
}
}
// Check built-in guard rules
let guard_rules = rules::guard_rules(lang);
let callee_lower = callee.to_ascii_lowercase();
for rule in guard_rules {

View file

@ -8,8 +8,13 @@ use std::collections::HashSet;
pub struct ResourceMisuse;
/// Find nodes matching acquire patterns for a given resource pair.
fn find_acquire_nodes(ctx: &AnalysisContext, acquire_patterns: &[&str]) -> Vec<NodeIndex> {
/// Find nodes matching acquire patterns for a given resource pair,
/// excluding any that match `exclude_patterns`.
fn find_acquire_nodes(
ctx: &AnalysisContext,
acquire_patterns: &[&str],
exclude_patterns: &[&str],
) -> Vec<NodeIndex> {
ctx.cfg
.node_indices()
.filter(|&idx| {
@ -19,6 +24,16 @@ fn find_acquire_nodes(ctx: &AnalysisContext, acquire_patterns: &[&str]) -> Vec<N
}
if let Some(callee) = &info.callee {
let callee_lower = callee.to_ascii_lowercase();
// Check exclusions first — if the callee matches an exclude
// pattern, it is NOT an acquire even if it also matches an
// acquire pattern (e.g. `freopen` ends with `fopen`).
let excluded = exclude_patterns.iter().any(|p| {
let pl = p.to_ascii_lowercase();
callee_lower.ends_with(&pl) || callee_lower == pl
});
if excluded {
return false;
}
acquire_patterns.iter().any(|p| {
let pl = p.to_ascii_lowercase();
callee_lower.ends_with(&pl) || callee_lower == pl
@ -113,6 +128,204 @@ fn all_paths_pass_through(
true
}
/// Decide whether the variable bound by `acquire` is handed over to a struct
/// field somewhere downstream in the CFG. Recognised shapes:
/// - `ptr->field = var`   (C arrow operator)
/// - `obj.field = var`    (C dot / generic field store)
/// - `list->next = ...`   (linked-list insertion)
///
/// A transferred resource is not reported as a leak, because the receiving
/// struct becomes responsible for its lifetime. Returns `false` when the
/// acquire node defines no variable.
fn is_ownership_transferred(ctx: &AnalysisContext, acquire: NodeIndex) -> bool {
    use std::collections::VecDeque;

    let Some(acquired_var) = &ctx.cfg[acquire].defines else {
        return false;
    };

    // Breadth-first walk over successors; `visited` doubles as the
    // "already queued" set so every node is examined at most once.
    let mut visited = HashSet::new();
    let mut queue: VecDeque<_> = ctx
        .cfg
        .neighbors(acquire)
        .filter(|succ| visited.insert(*succ))
        .collect();

    while let Some(node) = queue.pop_front() {
        let info = &ctx.cfg[node];
        let (start, end) = info.span;

        let defines_var = info.defines.as_ref().is_some_and(|d| d == acquired_var);
        let references_var = defines_var || info.uses.iter().any(|u| u == acquired_var);

        // The node's source text mentions the variable and looks like a field
        // store: `->` anywhere in the span, or `.field =` (but not `==`).
        let field_store = references_var
            && start < end
            && end <= ctx.source_bytes.len()
            && {
                let text = &ctx.source_bytes[start..end];
                text.windows(2).any(|w| w == b"->") || has_dot_field_assignment(text)
            };
        if field_store {
            return true;
        }

        // Reaching this point means the node is not a field write, so a node
        // that defines the variable is a genuine redefinition: stop this path.
        if defines_var {
            continue;
        }

        queue.extend(ctx.cfg.neighbors(node).filter(|succ| visited.insert(*succ)));
    }

    false
}
/// Check if `span_text` contains a dot-field assignment pattern like
/// `obj.field = var`.
///
/// The pattern is: a `.`, a **non-empty** run of identifier bytes
/// (`[A-Za-z0-9_]`), optional ASCII whitespace, then a single `=`.
///
/// Not treated as assignments:
/// - `obj.method(...)` — no `=` after the name
/// - `a.b == c`        — equality comparison
/// - `a.b => c`        — fat arrow (hash/match/array-key syntax)
/// - `0..=n`           — inclusive range; the `.` is followed by no field name
fn has_dot_field_assignment(span_text: &[u8]) -> bool {
    let len = span_text.len();

    for (dot, &byte) in span_text.iter().enumerate() {
        if byte != b'.' {
            continue;
        }

        // Consume the field name that follows the dot.
        let name_start = dot + 1;
        let mut cursor = name_start;
        while cursor < len
            && (span_text[cursor].is_ascii_alphanumeric() || span_text[cursor] == b'_')
        {
            cursor += 1;
        }
        // A dot without a field name (`..=`, `. =`) is not a field access.
        if cursor == name_start {
            continue;
        }

        // Skip whitespace between the field name and the operator.
        while cursor < len && span_text[cursor].is_ascii_whitespace() {
            cursor += 1;
        }

        // Accept a lone `=`; reject `==` and `=>`.
        let is_assignment = cursor < len
            && span_text[cursor] == b'='
            && !matches!(span_text.get(cursor + 1).copied(), Some(b'=' | b'>'));
        if is_assignment {
            return true;
        }
    }

    false
}
/// Decide whether the variable bound by `acquire` is passed, somewhere
/// downstream in the CFG, to a function that takes ownership of it
/// (e.g. `FileResponse(f)`, `send_file(f)`). Such a hand-off is not a leak.
///
/// Two signals are accepted on any reachable node that uses the variable:
/// - the node is a call whose callee ends (ASCII case-insensitively) with one
///   of the consuming names, or
/// - the node's source text contains one of the consuming names, which covers
///   calls embedded in another statement such as `return FileResponse(f)`.
///
/// Returns `false` when the acquire node defines no variable.
fn is_consumed_by_owner(ctx: &AnalysisContext, acquire: NodeIndex) -> bool {
    use std::collections::VecDeque;

    static CONSUMING_SINKS: &[&str] = &[
        "fileresponse",
        "streaminghttpresponse",
        "send_file",
        "make_response",
    ];

    let Some(acquired_var) = &ctx.cfg[acquire].defines else {
        return false;
    };

    // Breadth-first walk over successors of the acquire node.
    let mut visited = HashSet::new();
    let mut queue: VecDeque<_> = ctx
        .cfg
        .neighbors(acquire)
        .filter(|succ| visited.insert(*succ))
        .collect();

    while let Some(node) = queue.pop_front() {
        let info = &ctx.cfg[node];

        if info.uses.iter().any(|u| u == acquired_var) {
            // Direct call to a consuming function.
            let direct_call = info.kind == StmtKind::Call
                && info.callee.as_ref().is_some_and(|callee| {
                    let lowered = callee.to_ascii_lowercase();
                    CONSUMING_SINKS.iter().any(|s| lowered.ends_with(s))
                });
            if direct_call {
                return true;
            }

            // Consuming call embedded in the node's source text.
            let (start, end) = info.span;
            if start < end && end <= ctx.source_bytes.len() {
                let span_lower = ctx.source_bytes[start..end].to_ascii_lowercase();
                let mentioned = CONSUMING_SINKS
                    .iter()
                    .any(|s| span_lower.windows(s.len()).any(|w| w == s.as_bytes()));
                if mentioned {
                    return true;
                }
            }
        }

        queue.extend(ctx.cfg.neighbors(node).filter(|succ| visited.insert(*succ)));
    }

    false
}
/// For mutex pairs, check that an explicit lock operation exists for the
/// resource acquired at `acquire`.
///
/// Two situations count as an explicit lock:
/// - The acquire node is itself the lock call (`mu.Lock()`, `m.lock`,
///   `pthread_mutex_lock(&m)`). These calls need not bind a variable, so the
///   node's own callee is inspected before anything else.
/// - The acquire node is a constructor (e.g. `lock = threading.Lock()`) and
///   some call node in the CFG ending in `.acquire` / `.lock` (or equal to
///   `pthread_mutex_lock`) uses the bound variable.
///
/// If only a constructor is observed, or the constructor binds no variable,
/// the function returns `false` and the caller skips the finding.
fn has_explicit_lock_acquire(ctx: &AnalysisContext, acquire: NodeIndex) -> bool {
    /// True when the lower-cased callee names a lock operation.
    fn is_lock_call(callee_lower: &str) -> bool {
        callee_lower.ends_with(".acquire")
            || callee_lower.ends_with(".lock")
            || callee_lower == "pthread_mutex_lock"
    }

    /// True for lock *constructors*. `threading.Lock` lower-cases to a string
    /// ending in `.lock`, so it must be told apart from a real lock call.
    fn is_lock_constructor(callee_lower: &str) -> bool {
        callee_lower.ends_with("threading.lock") || callee_lower.ends_with("threading.rlock")
    }

    let acquire_info = &ctx.cfg[acquire];

    // The acquire node may already be the lock operation.
    let acquire_is_lock_call = acquire_info.callee.as_deref().is_some_and(|callee| {
        let lowered = callee.to_ascii_lowercase();
        is_lock_call(&lowered) && !is_lock_constructor(&lowered)
    });
    if acquire_is_lock_call {
        return true;
    }

    // Constructor case: look for a lock call that uses the bound variable.
    let Some(acquired_var) = &acquire_info.defines else {
        return false;
    };

    ctx.cfg.node_indices().any(|idx| {
        let info = &ctx.cfg[idx];
        info.kind == StmtKind::Call
            && info
                .callee
                .as_deref()
                .is_some_and(|callee| is_lock_call(&callee.to_ascii_lowercase()))
            && info.uses.iter().any(|u| u == acquired_var)
    })
}
impl CfgAnalysis for ResourceMisuse {
fn name(&self) -> &'static str {
"resource-misuse"
@ -128,11 +341,18 @@ impl CfgAnalysis for ResourceMisuse {
let mut findings = Vec::new();
for pair in pairs {
let acquire_nodes = find_acquire_nodes(ctx, pair.acquire);
let acquire_nodes = find_acquire_nodes(ctx, pair.acquire, pair.exclude_acquire);
let release_nodes = find_release_nodes(ctx, pair.release);
for &acquire in &acquire_nodes {
if !release_on_all_exit_paths(ctx, acquire, &release_nodes, exit) {
if !release_on_all_exit_paths(ctx, acquire, &release_nodes, exit)
&& !is_ownership_transferred(ctx, acquire)
&& !is_consumed_by_owner(ctx, acquire)
{
// For mutex pairs, require an explicit .acquire()/.lock() call
if pair.resource_name == "mutex" && !has_explicit_lock_acquire(ctx, acquire) {
continue;
}
let info = &ctx.cfg[acquire];
let callee_desc = info.callee.as_deref().unwrap_or("(acquire)");

View file

@ -21,6 +21,9 @@ pub struct EntryPointRule {
pub struct ResourcePair {
/// Callee patterns that mark a resource acquisition. `find_acquire_nodes`
/// compares them ASCII case-insensitively against the end of the callee.
pub acquire: &'static [&'static str],
/// Callee patterns that mark the matching release; `ResourceMisuse` passes
/// them to `find_release_nodes`.
pub release: &'static [&'static str],
/// Patterns that look like acquire calls (e.g. `freopen` ends with `fopen`)
/// but should NOT be treated as acquisitions.
pub exclude_acquire: &'static [&'static str],
/// Name of the resource kind (e.g. `"file handle"`). `ResourceMisuse` also
/// compares it against `"mutex"` to require an explicit lock call.
pub resource_name: &'static str,
}
@ -47,6 +50,16 @@ static COMMON_GUARDS: &[GuardRule] = &[
matchers: &["url_encode", "encode_uri", "urlencode"],
applies_to_sink_caps: Cap::URL_ENCODE,
},
GuardRule {
matchers: &[
"which",
"resolve_binary",
"find_program",
"lookup_path",
"shutil.which",
],
applies_to_sink_caps: Cap::SHELL_ESCAPE,
},
];
pub fn guard_rules(_lang: Lang) -> &'static [GuardRule] {
@ -168,21 +181,25 @@ static C_RESOURCES: &[ResourcePair] = &[
ResourcePair {
acquire: &["malloc", "calloc", "realloc"],
release: &["free"],
exclude_acquire: &[],
resource_name: "memory",
},
ResourcePair {
acquire: &["fopen"],
release: &["fclose"],
acquire: &["fopen", "fdopen", "curlx_fopen", "curlx_fdopen"],
release: &["fclose", "curlx_fclose"],
exclude_acquire: &["freopen", "curlx_freopen"],
resource_name: "file handle",
},
ResourcePair {
acquire: &["open"],
release: &["close"],
exclude_acquire: &["freopen", "curlx_freopen"],
resource_name: "file descriptor",
},
ResourcePair {
acquire: &["pthread_mutex_lock"],
release: &["pthread_mutex_unlock"],
exclude_acquire: &[],
resource_name: "mutex",
},
];
@ -191,11 +208,13 @@ static GO_RESOURCES: &[ResourcePair] = &[
ResourcePair {
acquire: &["os.Open", "os.Create", "os.OpenFile"],
release: &[".Close"],
exclude_acquire: &[],
resource_name: "file handle",
},
ResourcePair {
acquire: &[".Lock"],
release: &[".Unlock"],
exclude_acquire: &[],
resource_name: "mutex",
},
];
@ -205,6 +224,7 @@ static RUST_RESOURCES: &[ResourcePair] = &[
ResourcePair {
acquire: &["alloc"],
release: &["dealloc"],
exclude_acquire: &[],
resource_name: "raw memory",
},
];
@ -217,10 +237,93 @@ static JAVA_RESOURCES: &[ResourcePair] = &[ResourcePair {
"openConnection",
],
release: &[".close"],
exclude_acquire: &[],
resource_name: "stream/connection",
}];
static EMPTY_RESOURCES: &[ResourcePair] = &[];
/// Acquire/release pairs returned by `resource_pairs` for `Lang::Python`.
static PYTHON_RESOURCES: &[ResourcePair] = &[
// Files opened with `open(...)`, released by a `.close` call.
// NOTE(review): acquire patterns are matched as callee suffixes, so `open`
// also matches any callee that merely ends in "open" — confirm this is wanted.
ResourcePair {
acquire: &["open"],
release: &[".close"],
exclude_acquire: &[],
resource_name: "file handle",
},
// Sockets; under suffix matching `"socket"` already covers `"socket.socket"`.
ResourcePair {
acquire: &["socket.socket", "socket"],
release: &[".close"],
exclude_acquire: &[],
resource_name: "socket",
},
// Database connections and cursors; signal/event `connect` calls and
// `.register` are excluded so they are not counted as acquisitions.
ResourcePair {
acquire: &["connect", "cursor"],
release: &[".close"],
exclude_acquire: &["signal.connect", "event.connect", ".register"],
resource_name: "db connection",
},
// Lock constructors. Because the resource name is "mutex", `ResourceMisuse`
// only reports these when an explicit lock call on the variable is found.
ResourcePair {
acquire: &["threading.Lock", "threading.RLock"],
release: &[".release"],
exclude_acquire: &[],
resource_name: "mutex",
},
];
/// Acquire/release pairs returned by `resource_pairs` for `Lang::Ruby`.
static RUBY_RESOURCES: &[ResourcePair] = &[
// Files opened via `File.open` / `open`, released by `.close`.
ResourcePair {
acquire: &["File.open", "open"],
release: &[".close"],
exclude_acquire: &[],
resource_name: "file handle",
},
// TCP/UDP sockets created with `.new`, released by `.close`.
ResourcePair {
acquire: &["TCPSocket.new", "UDPSocket.new"],
release: &[".close"],
exclude_acquire: &[],
resource_name: "socket",
},
// Mutex `.lock` / `.unlock`; the "mutex" name triggers the explicit-lock
// check in `ResourceMisuse`.
ResourcePair {
acquire: &[".lock"],
release: &[".unlock"],
exclude_acquire: &[],
resource_name: "mutex",
},
];
/// Acquire/release pairs returned by `resource_pairs` for `Lang::Php`.
static PHP_RESOURCES: &[ResourcePair] = &[
// `fopen` / `fclose`; `freopen` is excluded because it ends with `fopen`
// and would otherwise match the acquire pattern.
ResourcePair {
acquire: &["fopen"],
release: &["fclose"],
exclude_acquire: &["freopen"],
resource_name: "file handle",
},
// MySQLi connections.
ResourcePair {
acquire: &["mysqli_connect"],
release: &["mysqli_close"],
exclude_acquire: &[],
resource_name: "db connection",
},
// cURL handles.
ResourcePair {
acquire: &["curl_init"],
release: &["curl_close"],
exclude_acquire: &[],
resource_name: "curl handle",
},
];
/// Acquire/release pairs returned by `resource_pairs` for both
/// `Lang::JavaScript` and `Lang::TypeScript`.
static JS_RESOURCES: &[ResourcePair] = &[
// File descriptors from `fs.open` / `fs.openSync`.
ResourcePair {
acquire: &["fs.open", "fs.openSync"],
release: &["fs.close", "fs.closeSync"],
exclude_acquire: &[],
resource_name: "file descriptor",
},
// Streams from `createReadStream` / `createWriteStream`, released by
// `.close` or `.destroy`.
ResourcePair {
acquire: &["createReadStream", "createWriteStream"],
release: &[".close", ".destroy"],
exclude_acquire: &[],
resource_name: "stream",
},
];
pub fn resource_pairs(lang: Lang) -> &'static [ResourcePair] {
match lang {
@ -229,6 +332,9 @@ pub fn resource_pairs(lang: Lang) -> &'static [ResourcePair] {
Lang::Go => GO_RESOURCES,
Lang::Rust => RUST_RESOURCES,
Lang::Java => JAVA_RESOURCES,
_ => EMPTY_RESOURCES,
Lang::Python => PYTHON_RESOURCES,
Lang::Ruby => RUBY_RESOURCES,
Lang::Php => PHP_RESOURCES,
Lang::JavaScript | Lang::TypeScript => JS_RESOURCES,
}
}

View file

@ -14,7 +14,7 @@ fn parse_and_analyse<A: CfgAnalysis>(
let mut parser = tree_sitter::Parser::new();
parser.set_language(&ts_lang).unwrap();
let tree = parser.parse(src, None).unwrap();
let (cfg, entry, summaries) = build_cfg(&tree, src, lang_str, "test.rs");
let (cfg, entry, summaries) = build_cfg(&tree, src, lang_str, "test.rs", None);
let lang = Lang::from_slug(lang_str).unwrap();
let ctx = AnalysisContext {
cfg: &cfg,
@ -25,6 +25,8 @@ fn parse_and_analyse<A: CfgAnalysis>(
func_summaries: &summaries,
global_summaries: None,
taint_findings: &[],
analysis_rules: None,
taint_active: true,
};
analysis.run(&ctx)
}
@ -34,7 +36,7 @@ fn parse_and_run_all(src: &[u8], lang_str: &str, ts_lang: Language) -> Vec<CfgFi
let mut parser = tree_sitter::Parser::new();
parser.set_language(&ts_lang).unwrap();
let tree = parser.parse(src, None).unwrap();
let (cfg, entry, summaries) = build_cfg(&tree, src, lang_str, "test.rs");
let (cfg, entry, summaries) = build_cfg(&tree, src, lang_str, "test.rs", None);
let lang = Lang::from_slug(lang_str).unwrap();
let ctx = AnalysisContext {
cfg: &cfg,
@ -45,6 +47,8 @@ fn parse_and_run_all(src: &[u8], lang_str: &str, ts_lang: Language) -> Vec<CfgFi
func_summaries: &summaries,
global_summaries: None,
taint_findings: &[],
analysis_rules: None,
taint_active: true,
};
run_all(&ctx)
}
@ -59,7 +63,7 @@ fn parse_and_run_all_with_taint(
let mut parser = tree_sitter::Parser::new();
parser.set_language(&ts_lang).unwrap();
let tree = parser.parse(src, None).unwrap();
let (cfg, entry, summaries) = build_cfg(&tree, src, lang_str, "test.rs");
let (cfg, entry, summaries) = build_cfg(&tree, src, lang_str, "test.rs", None);
let lang = Lang::from_slug(lang_str).unwrap();
let ctx = AnalysisContext {
cfg: &cfg,
@ -70,6 +74,8 @@ fn parse_and_run_all_with_taint(
func_summaries: &summaries,
global_summaries: None,
taint_findings,
analysis_rules: None,
taint_active: true,
};
run_all(&ctx)
}
@ -144,7 +150,7 @@ fn unreachable_detects_orphaned_nodes() {
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, _) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, entry, _) = build_cfg(&tree, src, "rust", "test.rs", None);
// All nodes in linear code should be reachable
let reachable = dominators::reachable_set(&cfg, entry);
@ -469,7 +475,7 @@ fn reachable_set_contains_all_connected_nodes() {
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, _) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, entry, _) = build_cfg(&tree, src, "rust", "test.rs", None);
let reachable = dominators::reachable_set(&cfg, entry);
@ -493,7 +499,7 @@ fn find_exit_node_exists() {
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, _, _) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, _, _) = build_cfg(&tree, src, "rust", "test.rs", None);
let exit = dominators::find_exit_node(&cfg);
assert!(exit.is_some(), "Should find an exit node");
@ -512,7 +518,7 @@ fn shortest_distance_basic() {
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, _) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, entry, _) = build_cfg(&tree, src, "rust", "test.rs", None);
let exit = dominators::find_exit_node(&cfg).unwrap();
let dist = dominators::shortest_distance(&cfg, entry, exit);
@ -656,7 +662,7 @@ fn taint_and_unguarded_sink_deduped() {
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg_graph, entry, _summaries) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg_graph, entry, _summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let _lang = Lang::from_slug("rust").unwrap();
// Find a sink node to create a synthetic taint finding
@ -674,6 +680,7 @@ fn taint_and_unguarded_sink_deduped() {
sink: sink_node,
source: entry,
path: vec![entry, sink_node],
source_kind: crate::labels::SourceKind::UserInput,
}];
let findings = parse_and_run_all_with_taint(
@ -719,3 +726,831 @@ fn process_star_without_web_params_no_auth_gap() {
auth_findings
);
}
// ─── Resource leak tests (additional languages) ────────────────────
/// Python: a file opened with `open()` and never closed must yield at least
/// one `cfg-resource-leak` finding from `ResourceMisuse`.
#[test]
fn resource_leak_python_open_without_close() {
let src = br#"
def process():
f = open("data.txt")
data = f.read()
"#;
let findings = parse_and_analyse(
&resources::ResourceMisuse,
src,
"python",
Language::from(tree_sitter_python::LANGUAGE),
);
// Keep only the leak rule; other rule ids are irrelevant to this test.
let leak_findings: Vec<_> = findings
.iter()
.filter(|f| f.rule_id == "cfg-resource-leak")
.collect();
assert!(
!leak_findings.is_empty(),
"Should detect open() without close() in Python"
);
}
/// PHP: `fopen()` with no matching `fclose()` must yield at least one
/// `cfg-resource-leak` finding from `ResourceMisuse`.
#[test]
fn resource_leak_php_fopen_without_fclose() {
let src = br#"<?php
function read_file() {
$fp = fopen("data.txt", "r");
$data = fread($fp, 1024);
}
"#;
let findings = parse_and_analyse(
&resources::ResourceMisuse,
src,
"php",
Language::from(tree_sitter_php::LANGUAGE_PHP),
);
// Keep only the leak rule.
let leak_findings: Vec<_> = findings
.iter()
.filter(|f| f.rule_id == "cfg-resource-leak")
.collect();
assert!(
!leak_findings.is_empty(),
"Should detect fopen() without fclose() in PHP"
);
}
/// JavaScript: `fs.openSync()` with no matching `fs.closeSync()` must yield
/// at least one `cfg-resource-leak` finding from `ResourceMisuse`.
#[test]
fn resource_leak_js_open_without_close() {
let src = br#"
function readFile() {
var fd = fs.openSync("data.txt", "r");
var data = fs.readSync(fd, buf, 0, 100, 0);
}
"#;
let findings = parse_and_analyse(
&resources::ResourceMisuse,
src,
"javascript",
Language::from(tree_sitter_javascript::LANGUAGE),
);
// Keep only the leak rule.
let leak_findings: Vec<_> = findings
.iter()
.filter(|f| f.rule_id == "cfg-resource-leak")
.collect();
assert!(
!leak_findings.is_empty(),
"Should detect fs.openSync() without fs.closeSync() in JS"
);
}
// ─── JS CFG precision tests ────────────────────────────────────────
/// JavaScript: a `throw` statement must produce a `Return`-kind CFG node, and
/// an `eval` call placed after it must not be reachable from the entry node.
#[test]
fn js_throw_terminates_block() {
// throw should act as a terminator — code directly after throw in the same
// block should be unreachable.
let src = br#"
function fail() {
throw new Error("fatal");
eval("dead code");
}
"#;
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let mut parser = tree_sitter::Parser::new();
parser.set_language(&ts_lang).unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, _) = build_cfg(&tree, src, "javascript", "test.js", None);
// Verify throw creates a Return-kind node
// (identified by the source text at the node's span start beginning with "throw").
let throw_nodes: Vec<_> = cfg
.node_indices()
.filter(|&idx| {
cfg[idx].kind == crate::cfg::StmtKind::Return
&& cfg[idx].span.0 > 0
&& src[cfg[idx].span.0..].starts_with(b"throw")
})
.collect();
assert!(
!throw_nodes.is_empty(),
"throw statement should create a Return-kind node"
);
// eval after throw should be unreachable
let reachable = crate::cfg_analysis::dominators::reachable_set(&cfg, entry);
let eval_nodes: Vec<_> = cfg
.node_indices()
.filter(|&idx| cfg[idx].callee.as_deref().is_some_and(|c| c == "eval"))
.collect();
// eval might not even be in the CFG, or if it is, it should be unreachable
if !eval_nodes.is_empty() {
assert!(
eval_nodes.iter().all(|n| !reachable.contains(n)),
"eval after throw should be unreachable"
);
}
}
/// JavaScript: with `process.exit` configured as a terminator in
/// `LangAnalysisRules`, an `eval` call placed after it must not be reachable.
#[test]
fn configured_terminator_stops_flow() {
let src = br#"
function handler() {
process.exit(1);
eval("dangerous");
}
"#;
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
// Only the terminator list is populated; no extra labels or event handlers.
let rules = crate::labels::LangAnalysisRules {
extra_labels: vec![],
terminators: vec!["process.exit".into()],
event_handlers: vec![],
};
let mut parser = tree_sitter::Parser::new();
parser.set_language(&ts_lang).unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, _) = build_cfg(&tree, src, "javascript", "test.js", Some(&rules));
let reachable = crate::cfg_analysis::dominators::reachable_set(&cfg, entry);
// eval should be unreachable since process.exit is a terminator
let eval_nodes: Vec<_> = cfg
.node_indices()
.filter(|&idx| cfg[idx].callee.as_deref().is_some_and(|c| c == "eval"))
.collect();
if !eval_nodes.is_empty() {
assert!(
eval_nodes.iter().all(|n| !reachable.contains(n)),
"eval should be unreachable after process.exit terminator"
);
}
// If eval_nodes is empty it means the node wasn't created (also acceptable —
// it's after a terminator so the CFG may not even emit it)
}
// ─── Href classification tests ─────────────────────────────────────
/// JavaScript: assigning to `location.href` must produce at least one CFG
/// node labelled as a `Sink`.
#[test]
fn location_href_assignment_is_sink() {
let src = br#"
function redirect(url) {
location.href = url;
}
"#;
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let mut parser = tree_sitter::Parser::new();
parser.set_language(&ts_lang).unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, _entry, _summaries) = build_cfg(&tree, src, "javascript", "test.js", None);
// Any Sink label anywhere in the graph satisfies the check.
let has_sink = cfg
.node_indices()
.any(|idx| matches!(cfg[idx].label, Some(crate::labels::DataLabel::Sink(_))));
assert!(has_sink, "location.href = url should produce a Sink node");
}
/// JavaScript: assigning a constant to `el.href` (an arbitrary element, not
/// `location`) must not produce any CFG node labelled as a `Sink`.
#[test]
fn a_href_assignment_is_not_sink() {
let src = br#"
function setLink(el) {
el.href = "/about";
}
"#;
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let mut parser = tree_sitter::Parser::new();
parser.set_language(&ts_lang).unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, _entry, _summaries) = build_cfg(&tree, src, "javascript", "test.js", None);
// No node in the graph may carry a Sink label.
let has_sink = cfg
.node_indices()
.any(|idx| matches!(cfg[idx].label, Some(crate::labels::DataLabel::Sink(_))));
assert!(
!has_sink,
"el.href = '/about' should NOT produce a Sink node"
);
}
// ─── Config sanitizer tests ────────────────────────────────────────
/// JavaScript: when `escapeHtml` is configured as an `HTML_ESCAPE` sanitizer,
/// `run_all` must report no `cfg-unguarded-sink` finding for an `innerHTML`
/// assignment fed by its result.
#[test]
fn config_sanitizer_suppresses_unguarded_sink() {
// JS snippet: escapeHtml(x) before innerHTML = ... should not trigger
// cfg-unguarded-sink when escapeHtml is configured as a sanitizer.
let src = br#"
function render(input) {
var safe = escapeHtml(input);
document.body.innerHTML = safe;
}
"#;
let ts_lang = Language::from(tree_sitter_javascript::LANGUAGE);
let lang_str = "javascript";
// Build with config sanitizer rules
let rules = crate::labels::LangAnalysisRules {
extra_labels: vec![crate::labels::RuntimeLabelRule {
matchers: vec!["escapeHtml".into()],
label: crate::labels::DataLabel::Sanitizer(crate::labels::Cap::HTML_ESCAPE),
}],
terminators: vec![],
event_handlers: vec![],
};
let mut parser = tree_sitter::Parser::new();
parser.set_language(&ts_lang).unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
// The same rules are passed to both CFG construction and the analysis context.
let (cfg, entry, summaries) = build_cfg(&tree, src, lang_str, "test.rs", Some(&rules));
let lang = Lang::from_slug(lang_str).unwrap();
let ctx = AnalysisContext {
cfg: &cfg,
entry,
lang,
file_path: "test.rs",
source_bytes: src,
func_summaries: &summaries,
global_summaries: None,
taint_findings: &[],
analysis_rules: Some(&rules),
taint_active: true,
};
let findings = run_all(&ctx);
let unguarded = findings
.iter()
.filter(|f| f.rule_id == "cfg-unguarded-sink")
.collect::<Vec<_>>();
assert!(
unguarded.is_empty(),
"escapeHtml config sanitizer should suppress cfg-unguarded-sink; got {:?}",
unguarded
);
}
// ─── Python precision tests ────────────────────────────────────────
/// Python: `subprocess.run` called with a literal argument list must not be
/// reported as `cfg-unguarded-sink`.
#[test]
fn python_constant_subprocess_no_finding() {
// subprocess.run(["make","clean"]) with constant args should produce no finding
let src = br#"
import subprocess
def build():
subprocess.run(["make", "clean"])
"#;
let findings = parse_and_run_all(src, "python", Language::from(tree_sitter_python::LANGUAGE));
let unguarded: Vec<_> = findings
.iter()
.filter(|f| f.rule_id == "cfg-unguarded-sink")
.collect();
assert!(
unguarded.is_empty(),
"subprocess.run with constant list args should not be flagged; got {:?}",
unguarded
);
}
/// Python: `subprocess.run(["git", "status"])` — another all-literal call —
/// must not be reported as `cfg-unguarded-sink`.
#[test]
fn python_constant_git_status_no_finding() {
let src = br#"
import subprocess
def check():
subprocess.run(["git", "status"])
"#;
let findings = parse_and_run_all(src, "python", Language::from(tree_sitter_python::LANGUAGE));
let unguarded: Vec<_> = findings
.iter()
.filter(|f| f.rule_id == "cfg-unguarded-sink")
.collect();
assert!(
unguarded.is_empty(),
"subprocess.run with constant git args should not be flagged; got {:?}",
unguarded
);
}
/// Python: a value read from `sys.argv` and passed to `os.system` must yield
/// at least one `cfg-unguarded-sink` finding with `High` severity.
#[test]
fn python_tainted_os_system_produces_finding() {
// Source (sys.argv) flowing to os.system → should produce a finding
let src = br#"
import sys
import os
def run():
cmd = sys.argv[1]
os.system(cmd)
"#;
let findings = parse_and_run_all(src, "python", Language::from(tree_sitter_python::LANGUAGE));
// Both the rule id and the severity must match.
let sink_findings: Vec<_> = findings
.iter()
.filter(|f| {
f.rule_id == "cfg-unguarded-sink" && f.severity == crate::patterns::Severity::High
})
.collect();
assert!(
!sink_findings.is_empty(),
"Source-derived os.system should produce a HIGH finding"
);
}
// ─── C++ precision tests ───────────────────────────────────────────
/// C++: streaming a literal to `std::cout` must not be reported as
/// `cfg-unguarded-sink`.
#[test]
fn cpp_cout_not_a_sink() {
let src = br#"
#include <iostream>
int main() {
std::cout << "hello" << std::endl;
return 0;
}
"#;
let findings = parse_and_run_all(src, "cpp", Language::from(tree_sitter_cpp::LANGUAGE));
let sink_findings: Vec<_> = findings
.iter()
.filter(|f| f.rule_id == "cfg-unguarded-sink")
.collect();
assert!(
sink_findings.is_empty(),
"std::cout should not produce an unguarded-sink finding; got {:?}",
sink_findings
);
}
/// `printf` with only a literal format string must not be reported as
/// `cfg-unguarded-sink`.
///
/// Despite the `cpp_` prefix, the snippet is parsed with the C grammar and
/// the `"c"` language slug.
#[test]
fn cpp_printf_constant_no_finding() {
// printf with constant args → FMT_STRING sink but constant-arg suppression
let src = br#"
#include <stdio.h>
int main() {
printf("hello\n");
return 0;
}
"#;
let findings = parse_and_run_all(src, "c", Language::from(tree_sitter_c::LANGUAGE));
let unguarded: Vec<_> = findings
.iter()
.filter(|f| f.rule_id == "cfg-unguarded-sink")
.collect();
assert!(
unguarded.is_empty(),
"printf with constant args should be suppressed; got {:?}",
unguarded
);
}
/// A `getenv` result passed to `system` must yield at least one
/// `cfg-unguarded-sink` finding.
///
/// Despite the `cpp_` prefix, the snippet is parsed with the C grammar and
/// the `"c"` language slug.
#[test]
fn cpp_system_with_getenv_produces_finding() {
let src = br#"
#include <stdlib.h>
int main() {
char* input = getenv("USER_CMD");
system(input);
return 0;
}
"#;
let findings = parse_and_run_all(src, "c", Language::from(tree_sitter_c::LANGUAGE));
let sink_findings: Vec<_> = findings
.iter()
.filter(|f| f.rule_id == "cfg-unguarded-sink")
.collect();
assert!(
!sink_findings.is_empty(),
"system(getenv(...)) should produce an unguarded-sink finding"
);
}
// ─── Unreachable + unguarded dedup test ─────────────────────────────
/// Rust: for a sink placed after `return`, no `cfg-unguarded-sink` finding may
/// share a span with a `cfg-unreachable-sink` finding.
#[test]
fn unreachable_sink_suppresses_unguarded() {
// If a sink is in unreachable code, only cfg-unreachable-sink should fire,
// NOT also cfg-unguarded-sink.
let src = br#"
fn main() {
return;
std::process::Command::new("sh").arg("x").status().unwrap();
}
"#;
let findings = parse_and_run_all(src, "rust", Language::from(tree_sitter_rust::LANGUAGE));
let unreachable: Vec<_> = findings
.iter()
.filter(|f| f.rule_id == "cfg-unreachable-sink")
.collect();
// Unguarded findings whose span coincides with any unreachable finding.
// This is empty (and the test passes) if no unreachable finding was produced.
let unguarded_at_same_span: Vec<_> = findings
.iter()
.filter(|f| {
f.rule_id == "cfg-unguarded-sink" && unreachable.iter().any(|u| u.span == f.span)
})
.collect();
assert!(
unguarded_at_same_span.is_empty(),
"cfg-unguarded-sink should be suppressed when cfg-unreachable-sink fires on same span; got {:?}",
unguarded_at_same_span
);
}
// ─── Fix 3: Wrapper resource names (curlx_fopen/curlx_fclose) ──────
/// C: the wrapper pair `curlx_fopen` / `curlx_fclose` must be recognised as
/// acquire and release, so no `cfg-resource-leak` finding is produced.
#[test]
fn curlx_fopen_with_curlx_fclose_no_leak() {
let src = br#"
void process() {
FILE *fp = curlx_fopen("file.txt", "r");
curlx_fclose(fp);
}
"#;
let findings = parse_and_analyse(
&resources::ResourceMisuse,
src,
"c",
Language::from(tree_sitter_c::LANGUAGE),
);
// Keep only the leak rule.
let leak_findings: Vec<_> = findings
.iter()
.filter(|f| f.rule_id == "cfg-resource-leak")
.collect();
assert!(
leak_findings.is_empty(),
"curlx_fopen + curlx_fclose should not produce a resource leak; got {:?}",
leak_findings
);
}
// ─── Fix 4: freopen exclusion ───────────────────────────────────────
#[test]
fn freopen_not_treated_as_acquire() {
    // `freopen` on an existing stream must not be counted as acquiring a new
    // resource, so the resource analysis should report no leak here.
    let c_source = br#"
void redirect_stderr() {
freopen("/dev/null", "w", stderr);
}
"#;
    let findings = parse_and_analyse(
        &resources::ResourceMisuse,
        c_source,
        "c",
        Language::from(tree_sitter_c::LANGUAGE),
    );
    let leak_findings = findings
        .iter()
        .filter(|finding| finding.rule_id == "cfg-resource-leak")
        .collect::<Vec<_>>();
    assert!(
        leak_findings.is_empty(),
        "freopen should not produce a resource leak finding; got {:?}",
        leak_findings
    );
}
// ─── Fix 5: Struct field ownership transfer ─────────────────────────
#[test]
fn struct_field_ownership_transfer_no_leak() {
    // The opened file is stored into a field of the caller-provided struct,
    // which hands ownership away; the analysis should not flag a leak.
    let c_source = br#"
void open_stream(struct session *s) {
FILE *fp = fopen("data.txt", "r");
s->stream = fp;
s->fopened = 1;
}
"#;
    let grammar = Language::from(tree_sitter_c::LANGUAGE);
    let findings = parse_and_analyse(&resources::ResourceMisuse, c_source, "c", grammar);

    let mut leak_findings = Vec::new();
    for finding in findings.iter() {
        if finding.rule_id == "cfg-resource-leak" {
            leak_findings.push(finding);
        }
    }

    assert!(
        leak_findings.is_empty(),
        "Struct field ownership transfer should suppress resource leak; got {:?}",
        leak_findings
    );
}
// ─── Fix 6: Linked-list / global insertion ──────────────────────────
#[test]
fn linked_list_insertion_no_leak() {
    // The freshly allocated node is linked into `cfg->variables`, so it stays
    // owned by the list and must not be reported as leaked.
    let c_source = br#"
void add_var(struct config *cfg, const char *name) {
struct var *p = malloc(sizeof(struct var));
p->next = cfg->variables;
cfg->variables = p;
}
"#;
    let findings = parse_and_analyse(
        &resources::ResourceMisuse,
        c_source,
        "c",
        Language::from(tree_sitter_c::LANGUAGE),
    );
    let leak_findings = findings
        .iter()
        .filter(|finding| finding.rule_id == "cfg-resource-leak")
        .collect::<Vec<_>>();
    assert!(
        leak_findings.is_empty(),
        "Linked-list insertion should suppress resource leak; got {:?}",
        leak_findings
    );
}
// ─── Fix 2: Preproc dangling-else CFG recovery ─────────────────────
#[test]
fn preproc_ifdef_does_not_orphan_subsequent_code() {
    // An `#ifdef` block that ends in a dangling `else` must not disconnect the
    // code that follows it: every CFG node should remain reachable from entry.
    let c_source = br#"
void process() {
int x = 1;
#ifdef _WIN32
if (x) {
x = 2;
} else
#endif
{
x = 3;
}
free(x);
}
"#;
    let grammar = Language::from(tree_sitter_c::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&grammar).unwrap();
    let tree = parser.parse(c_source as &[u8], None).unwrap();

    let (graph, entry, _) = build_cfg(&tree, c_source, "c", "test.c", None);
    let live_nodes = dominators::reachable_set(&graph, entry);

    // Whatever is in the graph but not in the reachable set is orphaned.
    let unreachable_count = graph.node_count() - live_nodes.len();
    assert!(
        unreachable_count == 0,
        "Expected all nodes reachable after preproc block, but {} nodes are unreachable",
        unreachable_count
    );
}
// ─── Fix 1: Break in loop keeps post-loop code reachable ────────────
#[test]
fn break_in_loop_post_loop_reachable() {
    // `break` is the only way out of the `while(1)` loop; the statement after
    // the loop (`free(x)`) must still be connected to the entry node.
    let c_source = br#"
void process() {
int x = 0;
while(1) {
if(x) break;
x = x + 1;
}
free(x);
}
"#;
    let grammar = Language::from(tree_sitter_c::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&grammar).unwrap();
    let tree = parser.parse(c_source as &[u8], None).unwrap();

    let (graph, entry, _) = build_cfg(&tree, c_source, "c", "test.c", None);
    let live_nodes = dominators::reachable_set(&graph, entry);

    // Whatever is in the graph but not in the reachable set is orphaned.
    let unreachable_count = graph.node_count() - live_nodes.len();
    assert!(
        unreachable_count == 0,
        "Expected all nodes reachable after break in loop, but {} nodes are unreachable",
        unreachable_count
    );
}
// ─── PART 2A: One-hop constant binding trace ────────────────────────
#[test]
fn python_one_hop_constant_binding_no_finding() {
    // The command name is a variable, but it is bound to a string literal one
    // assignment earlier, so cfg-unguarded-sink is expected to stay silent.
    let py_source = br#"
import subprocess
def check():
cmd = "git"
subprocess.run([cmd, "status"])
"#;
    let grammar = Language::from(tree_sitter_python::LANGUAGE);
    let findings = parse_and_run_all(py_source, "python", grammar);

    let mut unguarded = Vec::new();
    for finding in findings.iter() {
        if finding.rule_id == "cfg-unguarded-sink" {
            unguarded.push(finding);
        }
    }

    assert!(
        unguarded.is_empty(),
        "One-hop constant binding should suppress cfg-unguarded-sink; got {:?}",
        unguarded
    );
}
// ─── PART 2B: Exec-path guard rules ─────────────────────────────────
#[test]
fn exec_path_guard_suppresses_unguarded_sink() {
    // `resolve_binary(&bin)` runs before the command is spawned; that call is
    // expected to count as a guard, so no cfg-unguarded-sink finding.
    let rust_source = br#"
use std::process::Command;
fn main() {
let bin = std::env::var("BIN").unwrap();
resolve_binary(&bin);
Command::new("sh").arg(&bin).status().unwrap();
}"#;
    let grammar = Language::from(tree_sitter_rust::LANGUAGE);
    let findings = parse_and_analyse(&guards::UnguardedSink, rust_source, "rust", grammar);
    let unguarded = findings
        .iter()
        .filter(|finding| finding.rule_id == "cfg-unguarded-sink")
        .collect::<Vec<_>>();
    assert!(
        unguarded.is_empty(),
        "resolve_binary guard should suppress cfg-unguarded-sink; got {:?}",
        unguarded
    );
}
// ─── PART 2C: Evidence-based severity in cfg-only mode ──────────────
#[test]
fn cfg_only_no_taint_produces_low_severity() {
    // With taint analysis switched off (`taint_active: false`) and no
    // source-derived evidence, cfg-unguarded-sink must not be reported at
    // MEDIUM or HIGH severity.
    let rust_source = br#"
use std::process::Command;
fn process_data() {
let x = compute_something();
Command::new("sh").arg(&x).status().unwrap();
}"#;
    let grammar = Language::from(tree_sitter_rust::LANGUAGE);
    let mut parser = tree_sitter::Parser::new();
    parser.set_language(&grammar).unwrap();
    let tree = parser.parse(rust_source as &[u8], None).unwrap();
    let (graph, entry, summaries) = build_cfg(&tree, rust_source, "rust", "test.rs", None);

    // Run the rule directly so the context can force cfg-only mode.
    let ctx = AnalysisContext {
        cfg: &graph,
        entry,
        lang: Lang::from_slug("rust").unwrap(),
        file_path: "test.rs",
        source_bytes: rust_source,
        func_summaries: &summaries,
        global_summaries: None,
        taint_findings: &[],
        analysis_rules: None,
        taint_active: false, // cfg-only mode
    };
    let findings = guards::UnguardedSink.run(&ctx);

    let mut medium_or_high = Vec::new();
    for finding in findings.iter() {
        let elevated = finding.severity == crate::patterns::Severity::Medium
            || finding.severity == crate::patterns::Severity::High;
        if finding.rule_id == "cfg-unguarded-sink" && elevated {
            medium_or_high.push(finding);
        }
    }

    assert!(
        medium_or_high.is_empty(),
        "cfg-only mode without taint should produce LOW severity, not MEDIUM/HIGH; got {:?}",
        medium_or_high
    );
}
// ─── PART 4B: FileResponse ownership transfer ──────────────────────
#[test]
fn file_response_ownership_transfer_no_leak() {
    // The open file object is returned wrapped in `FileResponse`, which is
    // expected to take ownership; no cfg-resource-leak should be raised.
    let py_source = br#"
def serve_file():
f = open("report.pdf", "rb")
return FileResponse(f)
"#;
    let grammar = Language::from(tree_sitter_python::LANGUAGE);
    let findings = parse_and_analyse(&resources::ResourceMisuse, py_source, "python", grammar);

    let mut leak_findings = Vec::new();
    for finding in findings.iter() {
        if finding.rule_id == "cfg-resource-leak" {
            leak_findings.push(finding);
        }
    }

    assert!(
        leak_findings.is_empty(),
        "FileResponse should suppress cfg-resource-leak; got {:?}",
        leak_findings
    );
}
// ─── PART 4C: Lock-not-released refinement ──────────────────────────
#[test]
fn python_lock_constructor_only_no_finding() {
    // Constructing `threading.Lock()` without ever calling `.acquire()` holds
    // nothing, so cfg-lock-not-released must not be reported.
    let py_source = br#"
import threading
def setup():
lock = threading.Lock()
do_work()
"#;
    let findings = parse_and_analyse(
        &resources::ResourceMisuse,
        py_source,
        "python",
        Language::from(tree_sitter_python::LANGUAGE),
    );
    let lock_findings = findings
        .iter()
        .filter(|finding| finding.rule_id == "cfg-lock-not-released")
        .collect::<Vec<_>>();
    assert!(
        lock_findings.is_empty(),
        "Lock constructor without acquire should not produce cfg-lock-not-released; got {:?}",
        lock_findings
    );
}
// ─── PART 4A: signal.connect exclusion ──────────────────────────────
#[test]
fn python_signal_connect_not_treated_as_db_acquire() {
    // `signal.connect(handler)` registers a callback; it must not be mistaken
    // for a database connection being acquired and then leaked.
    let py_source = br#"
def setup():
signal.connect(handler)
do_work()
"#;
    let grammar = Language::from(tree_sitter_python::LANGUAGE);
    let findings = parse_and_analyse(&resources::ResourceMisuse, py_source, "python", grammar);

    let mut leak_findings = Vec::new();
    for finding in findings.iter() {
        if finding.rule_id == "cfg-resource-leak" {
            leak_findings.push(finding);
        }
    }

    assert!(
        leak_findings.is_empty(),
        "signal.connect should not be treated as db acquire; got {:?}",
        leak_findings
    );
}

View file

@ -3,9 +3,40 @@ use super::{AnalysisContext, CfgAnalysis, CfgFinding, Confidence};
use crate::cfg::StmtKind;
use crate::labels::DataLabel;
use crate::patterns::Severity;
use std::collections::HashSet;
pub struct UnreachableCode;
/// Collect the names referenced by calls to configured event-handler
/// registration functions.
///
/// A `StmtKind::Call` node counts as a handler registration when its callee
/// name ends with one of the configured `event_handlers` entries, compared
/// ASCII case-insensitively. Every entry in that node's `uses` is recorded as
/// a potential callback name.
///
/// Returns an empty set when no analysis rules are configured or the
/// `event_handlers` list is empty.
fn event_handler_callbacks(ctx: &AnalysisContext) -> HashSet<String> {
    let handlers = match ctx.analysis_rules {
        Some(rules) if !rules.event_handlers.is_empty() => &rules.event_handlers,
        _ => return HashSet::new(),
    };

    // Lowercase the configured suffixes once, instead of once per call node.
    let handler_suffixes: Vec<String> = handlers
        .iter()
        .map(|handler| handler.to_ascii_lowercase())
        .collect();

    let mut callbacks = HashSet::new();
    for idx in ctx.cfg.node_indices() {
        let info = &ctx.cfg[idx];
        if info.kind != StmtKind::Call {
            continue;
        }
        let Some(callee) = &info.callee else {
            continue;
        };

        let callee_lower = callee.to_ascii_lowercase();
        let is_handler = handler_suffixes
            .iter()
            .any(|suffix| callee_lower.ends_with(suffix.as_str()));
        if is_handler {
            // The callback function is typically used within the call — any
            // name that appears in `uses` of this call node is a potential
            // callback.
            callbacks.extend(info.uses.iter().cloned());
        }
    }
    callbacks
}
impl CfgAnalysis for UnreachableCode {
fn name(&self) -> &'static str {
"unreachable-code"
@ -13,6 +44,7 @@ impl CfgAnalysis for UnreachableCode {
fn run(&self, ctx: &AnalysisContext) -> Vec<CfgFinding> {
let reachable = dominators::reachable_set(ctx.cfg, ctx.entry);
let handler_callbacks = event_handler_callbacks(ctx);
let mut findings = Vec::new();
for idx in ctx.cfg.node_indices() {
@ -27,6 +59,13 @@ impl CfgAnalysis for UnreachableCode {
continue;
}
// Suppress findings for nodes inside event handler callbacks
if let Some(func_name) = &info.enclosing_func
&& handler_callbacks.contains(func_name)
{
continue;
}
let (rule_id, title, severity) = match info.label {
Some(DataLabel::Sanitizer(_)) => (
"cfg-unreachable-sanitizer",
@ -43,7 +82,9 @@ impl CfgAnalysis for UnreachableCode {
),
_ => {
// Check if it's a guard/auth call
if super::is_guard_call(info, ctx.lang) || super::is_auth_call(info, ctx.lang) {
if super::is_guard_call(info, ctx.lang, ctx.analysis_rules)
|| super::is_auth_call(info, ctx.lang)
{
(
"cfg-unreachable-guard",
"Unreachable guard/auth check",

View file

@ -9,6 +9,14 @@ pub struct Cli {
pub(crate) command: Commands,
}
impl Commands {
    /// Report whether this command writes machine-readable output to stdout,
    /// in which case human-oriented status messages have to be suppressed.
    ///
    /// Only a `Scan` whose explicit `format` argument is `"json"` or `"sarif"`
    /// qualifies; every other command returns `false`.
    pub fn is_structured_output(&self) -> bool {
        match self {
            Commands::Scan { format, .. } => format == "json" || format == "sarif",
            _ => false,
        }
    }
}
#[derive(Subcommand)]
pub enum Commands {
/// Scan project for vulnerabilities
@ -25,8 +33,8 @@ pub enum Commands {
#[arg(long)]
rebuild_index: bool,
/// Output format
#[arg(short, long, value_enum, default_value = "")]
/// Output format (console, json, sarif)
#[arg(short, long, default_value = "")]
format: String,
/// Show only high severity issues
@ -41,6 +49,11 @@ pub enum Commands {
#[arg(long)]
all_targets: bool,
/// Include findings from test/vendor/build paths at original severity
/// (by default these are downgraded)
#[arg(long)]
include_nonprod: bool,
},
/// Manage project indexes
@ -65,6 +78,51 @@ pub enum Commands {
#[arg(long)]
all: bool,
},
/// Manage analysis configuration
Config {
#[command(subcommand)]
action: ConfigAction,
},
}
// Actions available under the `config` subcommand (carried by
// `Commands::Config { action }`).
// NOTE(review): the `///` comments on the variants and fields below are
// presumably picked up by the `Subcommand` derive as user-facing help text —
// treat edits to them as changes to CLI output.
#[derive(Subcommand)]
pub enum ConfigAction {
/// Print effective merged configuration as TOML
Show,
/// Print configuration directory path
Path,
/// Add a label rule to nyx.local
AddRule {
/// Language slug (e.g. javascript, rust, python)
#[arg(long)]
lang: String,
/// Function or property name to match
#[arg(long)]
matcher: String,
// Accepted as a free-form string here; `commands::config::add_rule`
// rejects anything other than source/sanitizer/sink.
/// Rule kind: source, sanitizer, or sink
#[arg(long)]
kind: String,
// Accepted as a free-form string here; `commands::config::add_rule`
// rejects values that `crate::labels::parse_cap` does not recognise.
/// Capability: env_var, html_escape, shell_escape, url_encode, json_parse, file_io, or all
#[arg(long)]
cap: String,
},
/// Add a terminator function to nyx.local
AddTerminator {
/// Language slug (e.g. javascript, rust, python)
#[arg(long)]
lang: String,
/// Function name that terminates execution (e.g. process.exit)
#[arg(long)]
name: String,
},
}
#[derive(Subcommand)]

213
src/commands/config.rs Normal file
View file

@ -0,0 +1,213 @@
use crate::errors::NyxResult;
use crate::utils::config::{AnalysisRulesConfig, Config, ConfigLabelRule};
use console::style;
use std::fs;
use std::path::Path;
/// Print the effective merged configuration to stdout as pretty-printed TOML.
///
/// # Errors
/// Returns an error carrying the serializer's message when `config` cannot be
/// rendered as TOML.
pub fn show(config: &Config) -> NyxResult<()> {
    match toml::to_string_pretty(config) {
        Ok(rendered) => {
            println!("{rendered}");
            Ok(())
        }
        Err(e) => Err(format!("Failed to serialize config: {e}").into()),
    }
}
/// Write the configuration directory's path to stdout, followed by a newline.
///
/// Always returns `Ok(())`.
pub fn path(config_dir: &Path) -> NyxResult<()> {
    let shown = config_dir.display();
    println!("{shown}");
    Ok(())
}
/// Add a label rule to `nyx.local`.
///
/// Validates `kind` (must be `source`, `sanitizer`, or `sink`) and `cap`
/// (must be accepted by `crate::labels::parse_cap`), loads `nyx.local` from
/// `config_dir` when it exists (otherwise starts from `Config::default()`),
/// appends the rule under `lang` unless an identical rule is already there,
/// and rewrites the file. The status line says whether the rule was added or
/// was already present.
///
/// # Errors
/// Returns an error for an invalid `kind` or `cap`, or when reading, parsing,
/// or writing `nyx.local` fails.
pub fn add_rule(
    config_dir: &Path,
    lang: &str,
    matcher: &str,
    kind: &str,
    cap: &str,
) -> NyxResult<()> {
    const VALID_KINDS: [&str; 3] = ["source", "sanitizer", "sink"];

    // Validate kind
    if !VALID_KINDS.contains(&kind) {
        return Err(
            format!("Invalid kind '{kind}'. Must be one of: source, sanitizer, sink").into(),
        );
    }

    // Validate cap
    if crate::labels::parse_cap(cap).is_none() {
        return Err(format!(
            "Invalid cap '{cap}'. Must be one of: env_var, html_escape, shell_escape, url_encode, json_parse, file_io, all"
        )
        .into());
    }

    let local_path = config_dir.join("nyx.local");
    let mut config: Config = if local_path.exists() {
        let content = fs::read_to_string(&local_path)?;
        toml::from_str(&content)?
    } else {
        Config::default()
    };

    let lang_cfg = config
        .analysis
        .languages
        .entry(lang.to_string())
        .or_default();

    let new_rule = ConfigLabelRule {
        matchers: vec![matcher.to_string()],
        kind: kind.to_string(),
        cap: cap.to_string(),
    };

    // Dedup: remember whether anything changed so the status line is truthful.
    let newly_added = !lang_cfg.rules.contains(&new_rule);
    if newly_added {
        lang_cfg.rules.push(new_rule);
    }

    write_local_config(&local_path, &config)?;

    if newly_added {
        println!(
            "{}: Added {} rule for `{}` ({}) in {}",
            style("ok").green().bold(),
            kind,
            matcher,
            cap,
            lang
        );
    } else {
        println!(
            "{}: {} rule for `{}` ({}) already present in {}",
            style("ok").green().bold(),
            kind,
            matcher,
            cap,
            lang
        );
    }
    Ok(())
}
/// Add a terminator to `nyx.local`.
///
/// Loads `nyx.local` from `config_dir` when it exists (otherwise starts from
/// `Config::default()`), appends `name` to the terminators of `lang` unless it
/// is already listed, and rewrites the file. The status line says whether the
/// terminator was added or was already present.
///
/// # Errors
/// Returns an error when reading, parsing, or writing `nyx.local` fails.
pub fn add_terminator(config_dir: &Path, lang: &str, name: &str) -> NyxResult<()> {
    let local_path = config_dir.join("nyx.local");
    let mut config: Config = if local_path.exists() {
        let content = fs::read_to_string(&local_path)?;
        toml::from_str(&content)?
    } else {
        Config::default()
    };

    let lang_cfg = config
        .analysis
        .languages
        .entry(lang.to_string())
        .or_default();

    // Compare by reference; only allocate when the name is actually stored.
    let newly_added = !lang_cfg.terminators.iter().any(|existing| existing == name);
    if newly_added {
        lang_cfg.terminators.push(name.to_string());
    }

    write_local_config(&local_path, &config)?;

    if newly_added {
        println!(
            "{}: Added terminator `{}` for {}",
            style("ok").green().bold(),
            name,
            lang
        );
    } else {
        println!(
            "{}: Terminator `{}` already present for {}",
            style("ok").green().bold(),
            name,
            lang
        );
    }
    Ok(())
}
/// Serialize the analysis section of `config`, placed on top of an otherwise
/// default `Config`, and write it to `path` as pretty-printed TOML.
///
/// Language entries that have no rules, no terminators, and no event handlers
/// are dropped before writing. If that leaves no languages at all, the whole
/// analysis section is reset to `AnalysisRulesConfig::default()`.
///
/// NOTE(review): every section other than `analysis` comes from
/// `Config::default()`, so non-analysis settings carried in `config` (for
/// example ones previously customised in `nyx.local`) are not written back.
/// Confirm this is intended.
///
/// # Errors
/// Returns an error when serialization fails or the file cannot be written.
fn write_local_config(path: &Path, config: &Config) -> NyxResult<()> {
    let mut analysis = config.analysis.clone();

    // Strip language entries that carry no configuration at all.
    analysis.languages.retain(|_, entry| {
        !(entry.rules.is_empty()
            && entry.terminators.is_empty()
            && entry.event_handlers.is_empty())
    });

    // Nothing language-specific left: fall back to a pristine analysis section.
    if analysis.languages.is_empty() {
        analysis = AnalysisRulesConfig::default();
    }

    let local = Config {
        analysis,
        ..Config::default()
    };

    let toml_str = match toml::to_string_pretty(&local) {
        Ok(rendered) => rendered,
        Err(e) => return Err(format!("Failed to serialize config: {e}").into()),
    };
    fs::write(path, toml_str)?;
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse the `nyx.local` file that the command under test wrote into `dir`.
    fn read_back(dir: &Path) -> Config {
        let raw = fs::read_to_string(dir.join("nyx.local")).unwrap();
        toml::from_str(&raw).unwrap()
    }

    /// Register the same JavaScript HTML-escape sanitizer rule used by several tests.
    fn add_escape_html_rule(dir: &Path) {
        add_rule(dir, "javascript", "escapeHtml", "sanitizer", "html_escape").unwrap();
    }

    #[test]
    fn add_rule_writes_valid_toml() {
        let workspace = tempfile::tempdir().unwrap();
        add_escape_html_rule(workspace.path());

        let config = read_back(workspace.path());
        let js = config.analysis.languages.get("javascript").unwrap();
        assert_eq!(js.rules.len(), 1);
        assert_eq!(js.rules[0].matchers, vec!["escapeHtml"]);
        assert_eq!(js.rules[0].kind, "sanitizer");
        assert_eq!(js.rules[0].cap, "html_escape");
    }

    #[test]
    fn add_rule_deduplicates() {
        let workspace = tempfile::tempdir().unwrap();
        // Adding the identical rule twice must leave a single entry.
        add_escape_html_rule(workspace.path());
        add_escape_html_rule(workspace.path());

        let config = read_back(workspace.path());
        let js = config.analysis.languages.get("javascript").unwrap();
        assert_eq!(js.rules.len(), 1);
    }

    #[test]
    fn add_terminator_works() {
        let workspace = tempfile::tempdir().unwrap();
        add_terminator(workspace.path(), "javascript", "process.exit").unwrap();

        let config = read_back(workspace.path());
        let js = config.analysis.languages.get("javascript").unwrap();
        assert_eq!(js.terminators, vec!["process.exit"]);
    }

    #[test]
    fn add_rule_rejects_invalid_kind() {
        let workspace = tempfile::tempdir().unwrap();
        let outcome = add_rule(workspace.path(), "javascript", "foo", "invalid_kind", "all");
        assert!(outcome.is_err());
    }

    #[test]
    fn add_rule_rejects_invalid_cap() {
        let workspace = tempfile::tempdir().unwrap();
        let outcome = add_rule(
            workspace.path(),
            "javascript",
            "foo",
            "sanitizer",
            "invalid_cap",
        );
        assert!(outcome.is_err());
    }
}

View file

@ -5,10 +5,10 @@ use crate::patterns::Severity;
use crate::utils::Config;
use crate::utils::project::get_project_info;
use crate::walk::spawn_file_walker;
use blake3;
use bytesize::ByteSize;
use chrono::{DateTime, Local};
use console::style;
use indicatif::{ProgressBar, ProgressStyle};
use rayon::prelude::*;
use std::fs;
use std::path::PathBuf;
@ -25,7 +25,13 @@ pub fn handle(
let (project_name, db_path) = get_project_info(&build_path, database_dir)?;
if force || !db_path.exists() {
build_index(&project_name, &build_path, &db_path, config)?;
build_index(
&project_name,
&build_path,
&db_path,
config,
!config.output.quiet,
)?;
println!(
"✔ {} {}",
style("Index built:").green(),
@ -84,6 +90,7 @@ pub fn build_index(
project_path: &std::path::Path,
db_path: &std::path::Path,
config: &Config,
show_progress: bool,
) -> NyxResult<()> {
tracing::debug!("Building index for: {}", project_name);
fs::File::create(db_path)?;
@ -97,10 +104,27 @@ pub fn build_index(
tracing::debug!("Cleaned index for: {}", project_name);
let (rx, handle) = spawn_file_walker(project_path, config);
// Drain the channel BEFORE joining — the bounded channel will deadlock
// if we join first and the walker blocks on send.
let paths: Vec<PathBuf> = rx.into_iter().flatten().collect();
if let Err(err) = handle.join() {
tracing::error!("walker thread panicked: {:#?}", err);
}
let paths: Vec<PathBuf> = rx.into_iter().flatten().collect();
let pb = if show_progress {
let pb = ProgressBar::new(paths.len() as u64);
pb.set_style(
ProgressStyle::with_template(
"{spinner:.green} {msg} [{bar:30.cyan/blue}] {pos}/{len} ({eta})",
)
.unwrap()
.progress_chars("##-"),
);
pb.set_message("Indexing files");
pb
} else {
ProgressBar::hidden()
};
paths
.into_par_iter()
@ -108,18 +132,15 @@ pub fn build_index(
let mut idx = Indexer::from_pool(project_name, &pool)?;
// Read once, hash once — pass bytes to both rule execution and
// summary extraction.
// summary extraction. Use pre-computed hash for upsert to avoid
// a redundant file read inside upsert_file.
let bytes = std::fs::read(&path)?;
let hash = {
let mut hasher = blake3::Hasher::new();
hasher.update(&bytes);
hasher.finalize().as_bytes().to_vec()
};
let hash = Indexer::digest_bytes(&bytes);
// Run AST-only rules (no taint yet — summaries come later in scan)
let issues =
crate::commands::scan::run_rules_on_bytes(&bytes, &path, config, None, None)?;
let file_id = idx.upsert_file(&path)?;
let file_id = idx.upsert_file_with_hash(&path, &hash)?;
let rows: Vec<IssueRow> = issues
.iter()
@ -144,8 +165,10 @@ pub fn build_index(
idx.replace_summaries_for_file(&path, &hash, &sums)?;
}
pb.inc(1);
Ok(())
})?;
pb.finish_and_clear();
{
let idx = Indexer::from_pool(project_name, &pool)?;
@ -170,7 +193,7 @@ fn build_index_creates_db_and_registers_files() {
let db_path = td.path().join("proj.sqlite");
build_index("proj", &project_dir, &db_path, &cfg).expect("index build should succeed");
build_index("proj", &project_dir, &db_path, &cfg, false).expect("index build should succeed");
// ── Assert ────────────────────────────────────────────────────────────────
assert!(db_path.is_file(), "SQLite file must exist");

View file

@ -1,4 +1,5 @@
pub mod clean;
pub mod config;
pub mod index;
pub mod list;
pub mod scan;
@ -12,6 +13,7 @@ use std::path::Path;
pub fn handle_command(
command: Commands,
database_dir: &Path,
config_dir: &Path,
config: &mut Config,
) -> NyxResult<()> {
match command {
@ -24,6 +26,7 @@ pub fn handle_command(
ast_only,
cfg_only,
all_targets,
include_nonprod,
} => {
if high_only {
config.scanner.min_severity = Severity::High
@ -41,10 +44,37 @@ pub fn handle_command(
config.scanner.mode = AnalysisMode::Full
};
scan::handle(&path, no_index, rebuild_index, format, database_dir, config)
if include_nonprod {
config.scanner.include_nonprod = true
};
scan::handle(&path, no_index, rebuild_index, format, database_dir, config)?;
}
Commands::Index { action } => {
index::handle(action, database_dir, config)?;
}
Commands::List { verbose } => {
list::handle(verbose, database_dir)?;
}
Commands::Clean { project, all } => {
clean::handle(project, all, database_dir)?;
}
Commands::Config { action } => {
use crate::cli::ConfigAction;
match action {
ConfigAction::Show => self::config::show(config)?,
ConfigAction::Path => self::config::path(config_dir)?,
ConfigAction::AddRule {
lang,
matcher,
kind,
cap,
} => self::config::add_rule(config_dir, &lang, &matcher, &kind, &cap)?,
ConfigAction::AddTerminator { lang, name } => {
self::config::add_terminator(config_dir, &lang, &name)?
}
}
}
Commands::Index { action } => index::handle(action, database_dir, config),
Commands::List { verbose } => list::handle(verbose, database_dir),
Commands::Clean { project, all } => clean::handle(project, all, database_dir),
}
Ok(())
}

View file

@ -1,16 +1,16 @@
pub(crate) use crate::ast::{
extract_summaries_from_bytes, extract_summaries_from_file, run_rules_on_bytes,
run_rules_on_file,
analyse_file_fused, extract_summaries_from_bytes, run_rules_on_bytes, run_rules_on_file,
};
use crate::database::index::{Indexer, IssueRow};
use crate::errors::NyxResult;
use crate::patterns::Severity;
use crate::summary::{self, FuncSummary, GlobalSummaries};
use crate::summary::{self, GlobalSummaries};
use crate::utils::config::Config;
use crate::utils::project::get_project_info;
use crate::walk::spawn_file_walker;
use console::style;
use dashmap::DashMap;
use indicatif::{ProgressBar, ProgressStyle};
use r2d2::Pool;
use r2d2_sqlite::SqliteConnectionManager;
use rayon::prelude::*;
@ -18,6 +18,22 @@ use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use std::sync::Arc;
/// Build the progress bar used by the scan passes.
///
/// When `show` is false a hidden bar is returned, so callers can call
/// `inc`/`finish_and_clear` unconditionally. Otherwise the bar has length
/// `len`, the shared spinner/bar/ETA template, and `msg` as its message.
fn make_progress_bar(len: u64, msg: &str, show: bool) -> ProgressBar {
    if show {
        let bar = ProgressBar::new(len);
        let bar_style = ProgressStyle::with_template(
            "{spinner:.green} {msg} [{bar:30.cyan/blue}] {pos}/{len} ({eta})",
        )
        .unwrap()
        .progress_chars("##-");
        bar.set_style(bar_style);
        bar.set_message(msg.to_string());
        bar
    } else {
        ProgressBar::hidden()
    }
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct Diag {
pub path: String,
@ -39,22 +55,37 @@ pub fn handle(
let scan_path = Path::new(path).canonicalize()?;
let (project_name, db_path) = get_project_info(&scan_path, database_dir)?;
println!(
"{} {}...\n",
style("Checking").green().bold(),
&project_name
);
let suppress_status = config.output.quiet || format == "json" || format == "sarif";
if !suppress_status {
println!(
"{} {}...\n",
style("Checking").green().bold(),
&project_name
);
}
let show_progress = format != "json" && format != "sarif" && !config.output.quiet;
let diags: Vec<Diag> = if no_index {
scan_filesystem(&scan_path, config)?
scan_filesystem(&scan_path, config, show_progress)?
} else {
if rebuild_index || !db_path.exists() {
tracing::debug!("Scanning filesystem index filesystem");
crate::commands::index::build_index(&project_name, &scan_path, &db_path, config)?;
crate::commands::index::build_index(
&project_name,
&scan_path,
&db_path,
config,
show_progress,
)?;
}
let pool = Indexer::init(&db_path)?;
scan_with_index_parallel(&project_name, pool, config)?
if config.database.vacuum_on_startup {
let idx = Indexer::from_pool(&project_name, &pool)?;
idx.vacuum()?;
}
scan_with_index_parallel(&project_name, pool, config, show_progress)?
};
tracing::debug!("Found {:?} issues.", diags.len());
@ -66,6 +97,14 @@ pub fn handle(
return Ok(());
}
if format == "sarif" {
let sarif = crate::output::build_sarif(&diags, &scan_path);
let json = serde_json::to_string_pretty(&sarif)
.map_err(|e| crate::errors::NyxError::Msg(e.to_string()))?;
println!("{json}");
return Ok(());
}
if format == "console" || (format.is_empty() && config.output.default_format == "console") {
tracing::debug!("Printing to console");
let mut grouped: BTreeMap<&str, Vec<&Diag>> = BTreeMap::new();
@ -77,10 +116,10 @@ pub fn handle(
println!("{}", style(path).blue().underlined());
for d in issues {
println!(
" {:>4}:{:<4} [{:}] {:}",
" {:>4}:{:<4} {} {}",
d.line,
d.col,
d.severity,
d.severity.colored_tag(),
style(&d.id).bold()
);
}
@ -109,55 +148,144 @@ pub fn handle(
/// merged crossfile summaries.
///
/// AST pattern queries are run during pass 2 (they don't depend on summaries).
pub(crate) fn scan_filesystem(root: &Path, cfg: &Config) -> NyxResult<Vec<Diag>> {
pub(crate) fn scan_filesystem(
root: &Path,
cfg: &Config,
show_progress: bool,
) -> NyxResult<Vec<Diag>> {
// ── Collect file list ────────────────────────────────────────────────
let all_paths: Vec<PathBuf> = {
let _span = tracing::info_span!("walk_files").entered();
let (rx, handle) = spawn_file_walker(root, cfg);
// Drain the channel BEFORE joining the walker thread.
// The channel is bounded, so joining first would deadlock once
// the walker fills it and blocks on send.
let paths: Vec<PathBuf> = rx.into_iter().flatten().collect();
if let Err(err) = handle.join() {
tracing::error!("walker thread panicked: {:#?}", err);
}
rx.into_iter().flatten().collect()
paths
};
tracing::info!(file_count = all_paths.len(), "file walk complete");
// ── Pass 1: extract summaries ────────────────────────────────────────
let needs_taint = cfg.scanner.mode == crate::utils::config::AnalysisMode::Full
|| cfg.scanner.mode == crate::utils::config::AnalysisMode::Taint;
let global_summaries: Option<GlobalSummaries> = if needs_taint {
let _span = tracing::info_span!("pass1_summaries", files = all_paths.len()).entered();
if !needs_taint {
// ── AST-only: single fused pass (no cross-file context needed) ──
let _span = tracing::info_span!("ast_only_analysis", files = all_paths.len()).entered();
let pb = make_progress_bar(all_paths.len() as u64, "Running analysis", show_progress);
let collected: Vec<FuncSummary> = all_paths
let mut diags: Vec<Diag> = all_paths
.par_iter()
.flat_map_iter(|path| match extract_summaries_from_file(path, cfg) {
Ok(sums) => sums,
Err(e) => {
tracing::warn!("pass 1: failed to summarise {}: {e}", path.display());
vec![]
}
.flat_map_iter(|path| {
let result = match analyse_file_fused(
&std::fs::read(path).unwrap_or_default(),
path,
cfg,
None,
Some(root),
) {
Ok(r) => r.diags,
Err(e) => {
tracing::warn!("analysis: {}: {e}", path.display());
vec![]
}
};
pb.inc(1);
result
})
.collect();
pb.finish_and_clear();
tracing::info!(summaries = collected.len(), "pass 1 complete");
let _merge_span = tracing::info_span!("merge_summaries").entered();
if let Some(max) = cfg.output.max_results {
diags.truncate(max as usize);
}
return Ok(diags);
}
// ── Taint mode: two-pass with fused pass 1 ──────────────────────────
//
// Pass 1 (fused): parse + CFG (once!) → extract summaries + run
// AST queries + local taint + CFG structural analyses.
// Summaries are collected for the cross-file merge.
//
// Pass 2: re-run full analysis with global summaries injected.
// This requires a second parse+CFG, but ONLY for taint-mode files
// that need cross-file context. For repos where most functions
// don't have unresolved callees, pass 1 results are already correct.
// ── Pass 1: fused summary extraction + parallel merge ──────────────
//
// Each rayon thread builds a local `GlobalSummaries` from its chunk,
// then the per-thread maps are merged in a binary reduce tree.
// This eliminates the serial merge_summaries bottleneck.
let global_summaries: GlobalSummaries = {
let _span = tracing::info_span!("pass1_fused", files = all_paths.len()).entered();
let pb = make_progress_bar(
all_paths.len() as u64,
"Pass 1: Extracting summaries",
show_progress,
);
let root_str = root.to_string_lossy();
Some(summary::merge_summaries(collected, Some(&root_str)))
} else {
None
let gs = all_paths
.par_iter()
.fold(GlobalSummaries::new, |mut local_gs, path| {
if let Ok(bytes) = std::fs::read(path) {
match analyse_file_fused(&bytes, path, cfg, None, Some(root)) {
Ok(r) => {
for s in r.summaries {
let key = s.func_key(Some(&root_str));
local_gs.insert(key, s);
}
}
Err(e) => {
tracing::warn!("pass 1: {}: {e}", path.display());
}
}
} else {
tracing::warn!("pass 1: cannot read {}", path.display());
}
pb.inc(1);
local_gs
})
.reduce(GlobalSummaries::new, |mut a, b| {
a.merge(b);
a
});
pb.finish_and_clear();
tracing::info!("pass 1 complete");
gs
};
// ── Pass 2: full analysis with crossfile context ────────────────────
// ── Pass 2: re-run with cross-file global summaries ──────────────────
let mut diags: Vec<Diag> = {
let _span = tracing::info_span!("pass2_analysis", files = all_paths.len()).entered();
let pb = make_progress_bar(
all_paths.len() as u64,
"Pass 2: Running analysis",
show_progress,
);
all_paths
let result: Vec<Diag> = all_paths
.par_iter()
.map(|path| run_rules_on_file(path, cfg, global_summaries.as_ref(), Some(root)))
.try_reduce(Vec::new, |mut a, mut b| {
a.append(&mut b);
Ok(a)
})?
.flat_map_iter(|path| {
let result = match run_rules_on_file(path, cfg, Some(&global_summaries), Some(root))
{
Ok(d) => d,
Err(e) => {
tracing::warn!("pass 2: {}: {e}", path.display());
vec![]
}
};
pb.inc(1);
result
})
.collect();
pb.finish_and_clear();
result
};
tracing::info!(diags = diags.len(), "pass 2 complete");
@ -187,6 +315,7 @@ pub fn scan_with_index_parallel(
project: &str,
pool: Arc<Pool<SqliteConnectionManager>>,
cfg: &Config,
show_progress: bool,
) -> NyxResult<Vec<Diag>> {
let files = {
let idx = Indexer::from_pool(project, &pool)?;
@ -199,39 +328,37 @@ pub fn scan_with_index_parallel(
// ── Pass 1: ensure summaries are uptodate ──────────────────────────
if needs_taint {
let _span = tracing::info_span!("pass1_indexed", files = files.len()).entered();
let pb = make_progress_bar(
files.len() as u64,
"Pass 1: Extracting summaries",
show_progress,
);
files.par_iter().for_each_init(
|| Indexer::from_pool(project, &pool).expect("db pool"),
|idx, path| {
let needs_scan = idx.should_scan(path).unwrap_or(true);
if !needs_scan {
return; // summaries in DB are still valid
}
// Read once, hash once, extract summaries from bytes.
let bytes = match std::fs::read(path) {
Ok(b) => b,
Err(e) => {
tracing::warn!("pass 1: cannot read {}: {e}", path.display());
return;
}
};
let hash = {
let mut h = blake3::Hasher::new();
h.update(&bytes);
h.finalize().as_bytes().to_vec()
};
match extract_summaries_from_bytes(&bytes, path, cfg) {
Ok(sums) => {
idx.replace_summaries_for_file(path, &hash, &sums).ok();
}
Err(e) => {
tracing::warn!("pass 1: {}: {e}", path.display());
}
// Read once, hash once — use the hash for the change check
// to avoid a second file read inside should_scan.
if let Ok(bytes) = std::fs::read(path) {
let hash = Indexer::digest_bytes(&bytes);
let needs_scan = idx.should_scan_with_hash(path, &hash).unwrap_or(true);
if needs_scan {
match extract_summaries_from_bytes(&bytes, path, cfg) {
Ok(sums) => {
idx.replace_summaries_for_file(path, &hash, &sums).ok();
}
Err(e) => {
tracing::warn!("pass 1: {}: {e}", path.display());
}
}
}
} else {
tracing::warn!("pass 1: cannot read {}", path.display());
}
pb.inc(1);
},
);
pb.finish_and_clear();
}
// ── Load global summaries ────────────────────────────────────────────
@ -247,26 +374,47 @@ pub fn scan_with_index_parallel(
// ── Pass 2: full analysis ────────────────────────────────────────────
let _span = tracing::info_span!("pass2_indexed").entered();
let pb2 = make_progress_bar(
files.len() as u64,
"Pass 2: Running analysis",
show_progress,
);
let diag_map: DashMap<String, Vec<Diag>> = DashMap::new();
files.into_par_iter().for_each_init(
|| Indexer::from_pool(project, &pool).expect("db pool"),
|idx, path| {
// Read file once for both change-detection and analysis.
let bytes_opt = std::fs::read(&path).ok();
let hash = bytes_opt.as_ref().map(|b| Indexer::digest_bytes(b));
// In pass 2 we always re-analyse when taint is enabled because
// global summaries may have changed even if this file didn't.
// For AST-only mode, we can still use the cached issues.
let needs_scan = if needs_taint {
true // conservative: always re-analyse in taint mode
} else {
idx.should_scan(&path).unwrap_or(true)
match (&hash, &bytes_opt) {
(Some(h), _) => idx.should_scan_with_hash(&path, h).unwrap_or(true),
_ => true,
}
};
let mut diags = if needs_scan {
let d = run_rules_on_file(&path, cfg, global_summaries.as_ref(), None)
.unwrap_or_default();
let d = match &bytes_opt {
Some(bytes) => {
run_rules_on_bytes(bytes, &path, cfg, global_summaries.as_ref(), None)
.unwrap_or_default()
}
None => run_rules_on_file(&path, cfg, global_summaries.as_ref(), None)
.unwrap_or_default(),
};
// Persist issues + update file record
let file_id = idx.upsert_file(&path).unwrap_or_default();
// Persist issues + update file record (use pre-computed hash)
let file_id = match &hash {
Some(h) => idx.upsert_file_with_hash(&path, h).unwrap_or_default(),
None => idx.upsert_file(&path).unwrap_or_default(),
};
idx.replace_issues(
file_id,
d.iter().map(|d| IssueRow {
@ -298,8 +446,10 @@ pub fn scan_with_index_parallel(
.or_default()
.append(&mut diags);
}
pb2.inc(1);
},
);
pb2.finish_and_clear();
let mut diags: Vec<Diag> = diag_map.into_iter().flat_map(|(_, v)| v).collect();
@ -323,7 +473,8 @@ fn scan_with_index_parallel_uses_existing_index_without_rescanning() {
std::fs::write(project_dir.join("foo.txt"), "abc").unwrap();
let (project_name, db_path) = get_project_info(&project_dir, td.path()).unwrap();
crate::commands::index::build_index(&project_name, &project_dir, &db_path, &cfg).unwrap();
crate::commands::index::build_index(&project_name, &project_dir, &db_path, &cfg, false)
.unwrap();
let pool = Indexer::init(&db_path).unwrap();
@ -336,7 +487,7 @@ fn scan_with_index_parallel_uses_existing_index_without_rescanning() {
1
);
let diags = scan_with_index_parallel(&project_name, Arc::clone(&pool), &cfg)
let diags = scan_with_index_parallel(&project_name, Arc::clone(&pool), &cfg, false)
.expect("scan should succeed");
assert!(diags.is_empty());

View file

@ -68,9 +68,13 @@ pub mod index {
impl Indexer {
pub fn init(database_path: &Path) -> NyxResult<Arc<Pool<SqliteConnectionManager>>> {
let _span = tracing::info_span!("db_init", path = %database_path.display()).entered();
// NO_MUTEX is safe because r2d2 ensures each pooled connection
// is only ever used by one thread at a time. Combined with WAL
// mode this allows concurrent readers + a single writer without
// the global serialization that FULL_MUTEX causes.
let flags = OpenFlags::SQLITE_OPEN_READ_WRITE
| OpenFlags::SQLITE_OPEN_CREATE
| OpenFlags::SQLITE_OPEN_FULL_MUTEX;
| OpenFlags::SQLITE_OPEN_NO_MUTEX;
let manager = SqliteConnectionManager::file(database_path).with_flags(flags);
let pool = Arc::new(Pool::new(manager)?);
@ -132,10 +136,13 @@ pub mod index {
}
/// Return true when the file *content* or *mtime* changed since the last scan.
///
/// Short-circuits on mtime: if the stored mtime matches the
/// filesystem mtime, the file is assumed unchanged (skip hash).
#[allow(dead_code)] // used in tests and by should_scan_with_hash callers may fall back
pub fn should_scan(&self, path: &Path) -> NyxResult<bool> {
let meta = fs::metadata(path)?;
let mtime = meta.modified()?.duration_since(UNIX_EPOCH)?.as_secs() as i64;
let digest = Self::digest_file(path)?;
let row: Option<(Vec<u8>, i64)> = self
.conn
@ -147,18 +154,56 @@ pub mod index {
.optional()?;
Ok(match row {
Some((stored_hash, stored_mtime)) => stored_hash != digest || stored_mtime != mtime,
Some((stored_hash, stored_mtime)) => {
if stored_mtime != mtime {
// mtime changed — must re-scan
true
} else {
// mtime matches — compare hash only if cheap
// (the caller already read the file and can use
// should_scan_with_hash instead for full accuracy)
let digest = Self::digest_file(path)?;
stored_hash != digest
}
}
None => true,
})
}
/// Decide whether `path` needs re-scanning, given a hash the caller has
/// already computed for its current contents.
///
/// Only the stored `hash` column is consulted (no file read, no mtime
/// comparison). Returns `Ok(true)` when the file has no row for this
/// project or when the stored hash differs from `hash`, and `Ok(false)`
/// when they are equal.
///
/// # Errors
/// Propagates any database error raised by the lookup.
pub fn should_scan_with_hash(&self, path: &Path, hash: &[u8]) -> NyxResult<bool> {
    let previous: Option<Vec<u8>> = self
        .conn
        .query_row(
            "SELECT hash FROM files WHERE project = ?1 AND path = ?2",
            params![self.project, path.to_string_lossy()],
            |r| r.get(0),
        )
        .optional()?;

    // No row at all counts as "changed": the file has never been indexed.
    Ok(previous.map_or(true, |stored| stored.as_slice() != hash))
}
/// Insert or update the `files` row for `path` and return its id.
///
/// Reads the file once, hashes the bytes, and delegates the actual write
/// to [`upsert_file_with_hash`](Self::upsert_file_with_hash).
///
/// # Errors
/// Fails if the file cannot be read, or with whatever error the delegated
/// upsert reports.
pub fn upsert_file(&self, path: &Path) -> NyxResult<i64> {
    let digest = fs::read(path).map(|contents| Self::digest_bytes(&contents))?;
    self.upsert_file_with_hash(path, &digest)
}
/// Insert or update the `files` row using a pre-computed hash.
/// Avoids redundant file reads when the caller already has the hash.
pub fn upsert_file_with_hash(&self, path: &Path, hash: &[u8]) -> NyxResult<i64> {
let meta = fs::metadata(path)?;
let mtime = meta.modified()?.duration_since(UNIX_EPOCH)?.as_secs() as i64;
let scanned_at = SystemTime::now().duration_since(UNIX_EPOCH)?.as_secs() as i64;
let digest = Self::digest_file(path)?;
let path_str = path.to_string_lossy();
// Use a single statement: upsert then query the id.
self.c().execute(
"INSERT INTO files (project, path, hash, mtime, scanned_at)
VALUES (?1, ?2, ?3, ?4, ?5)
@ -166,18 +211,12 @@ pub mod index {
SET hash = excluded.hash,
mtime = excluded.mtime,
scanned_at = excluded.scanned_at",
params![
self.project,
path.to_string_lossy(),
digest,
mtime,
scanned_at
],
params![self.project, path_str, hash, mtime, scanned_at],
)?;
let id: i64 = self.c().query_row(
"SELECT id FROM files WHERE project = ?1 AND path = ?2",
params![self.project, path.to_string_lossy()],
params![self.project, path_str],
|r| r.get(0),
)?;
Ok(id)
@ -287,24 +326,38 @@ pub mod index {
}
/// Load every function summary for this project.
///
/// Reads all JSON strings from SQLite in one pass, then
/// deserializes them in parallel with rayon for large result sets.
pub fn load_all_summaries(&self) -> NyxResult<Vec<crate::summary::FuncSummary>> {
let mut stmt = self
.c()
.prepare("SELECT summary FROM function_summaries WHERE project = ?1")?;
let iter = stmt.query_map([&self.project], |row| {
let json: String = row.get(0)?;
Ok(json)
})?;
let jsons: Vec<String> = stmt
.query_map([&self.project], |row| row.get::<_, String>(0))?
.filter_map(Result::ok)
.collect();
let mut out = Vec::new();
for row in iter {
let json = row?;
let s: crate::summary::FuncSummary = serde_json::from_str(&json)
.map_err(|e| rusqlite::Error::ToSqlConversionFailure(Box::new(e)))?;
out.push(s);
// Parallel JSON deserialization for large sets
if jsons.len() > 256 {
use rayon::prelude::*;
let results: Vec<_> = jsons
.par_iter()
.filter_map(|json| {
serde_json::from_str::<crate::summary::FuncSummary>(json).ok()
})
.collect();
Ok(results)
} else {
let mut out = Vec::with_capacity(jsons.len());
for json in &jsons {
if let Ok(s) = serde_json::from_str::<crate::summary::FuncSummary>(json) {
out.push(s);
}
}
Ok(out)
}
Ok(out)
}
/// gets files from the database
@ -351,12 +404,20 @@ pub mod index {
// -------------------------------------------------------------------------
// Helpers
// -------------------------------------------------------------------------
/// Stream the file at `path` through a BLAKE3 hasher and return the digest
/// bytes.
///
/// # Errors
/// Fails if the file cannot be opened or read.
#[allow(dead_code)] // used by should_scan() and tests
fn digest_file(path: &Path) -> NyxResult<Vec<u8>> {
    let mut state = blake3::Hasher::new();
    let mut source = fs::File::open(path)?;
    // The hasher is fed through `io::copy`, so the file is never held in
    // memory as a whole.
    std::io::copy(&mut source, &mut state)?;
    let digest = state.finalize();
    Ok(digest.as_bytes().to_vec())
}
/// Compute the BLAKE3 digest of bytes that are already in memory, so the
/// caller does not have to touch the disk again.
pub fn digest_bytes(bytes: &[u8]) -> Vec<u8> {
    blake3::hash(bytes).as_bytes().to_vec()
}
}
}

View file

@ -24,9 +24,13 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
},
LabelRule {
matchers: &["printf", "fprintf", "sprintf", "strcpy", "strcat"],
matchers: &["sprintf", "strcpy", "strcat"],
label: DataLabel::Sink(Cap::HTML_ESCAPE),
},
LabelRule {
matchers: &["printf", "fprintf"],
label: DataLabel::Sink(Cap::FMT_STRING),
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {

View file

@ -22,16 +22,13 @@ pub static RULES: &[LabelRule] = &[
label: DataLabel::Sink(Cap::SHELL_ESCAPE),
},
LabelRule {
matchers: &[
"printf",
"fprintf",
"sprintf",
"strcpy",
"strcat",
"std::cout",
],
matchers: &["sprintf", "strcpy", "strcat"],
label: DataLabel::Sink(Cap::HTML_ESCAPE),
},
LabelRule {
matchers: &["printf", "fprintf"],
label: DataLabel::Sink(Cap::FMT_STRING),
},
];
pub static KINDS: Map<&'static str, Kind> = phf_map! {

View file

@ -38,6 +38,14 @@ pub static RULES: &[LabelRule] = &[
matchers: &["innerHTML"],
label: DataLabel::Sink(Cap::HTML_ESCAPE),
},
LabelRule {
matchers: &[
"location.href",
"window.location.href",
"document.location.href",
],
label: DataLabel::Sink(Cap::URL_ENCODE),
},
LabelRule {
matchers: &[
"child_process.exec",
@ -56,6 +64,7 @@ pub static KINDS: Map<&'static str, Kind> = phf_map! {
"for_in_statement" => Kind::For,
"return_statement" => Kind::Return,
"throw_statement" => Kind::Return,
"break_statement" => Kind::Break,
"continue_statement" => Kind::Continue,

View file

@ -31,7 +31,7 @@ bitflags! {
const URL_ENCODE = 0b0000_1000;
const JSON_PARSE = 0b0001_0000;
const FILE_IO = 0b0010_0000;
// todo: add more if needed
const FMT_STRING = 0b0100_0000;
}
}
@ -195,6 +195,147 @@ pub fn lookup(lang: &str, raw: &str) -> Kind {
.unwrap_or(Kind::Other)
}
/// The kind of taint source, used to refine finding severity.
///
/// Values are produced by [`infer_source_kind`] and translated into a
/// severity by [`severity_for_source_kind`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SourceKind {
/// Direct user input (request params, argv, stdin, form data)
UserInput,
/// Environment variables and configuration
EnvironmentConfig,
/// File system reads
FileSystem,
/// Database query results
Database,
/// Could not determine — treat conservatively
/// (`severity_for_source_kind` maps this to `High`).
Unknown,
}
/// Guess what kind of source a callee represents from its name and caps.
///
/// The callee name is lower-cased (ASCII) and tested for substrings. The
/// groups are tried in a fixed order and the first hit wins:
///
/// 1. user-input needles → [`SourceKind::UserInput`]
/// 2. environment/config needles → [`SourceKind::EnvironmentConfig`]
/// 3. read/open needles, but only when `caps` contains `Cap::FILE_IO`
///    → [`SourceKind::FileSystem`]
/// 4. database needles → [`SourceKind::Database`]
///
/// Anything else is [`SourceKind::Unknown`].
pub fn infer_source_kind(caps: Cap, callee: &str) -> SourceKind {
    const USER_INPUT: &[&str] = &[
        "argv", "stdin", "request", "form", "query", "params", "input", "body", "header",
        "cookie",
    ];
    const ENVIRONMENT: &[&str] = &["env", "getenv", "environ", "config"];
    const FILE_READ: &[&str] = &["read", "fopen", "open"];
    // "query" can never decide this group: the user-input group above
    // already claims every name that contains it.
    const DATABASE: &[&str] = &["fetchone", "fetchall", "fetch_row", "query", "execute"];

    let lowered = callee.to_ascii_lowercase();
    let mentions = |needles: &[&str]| needles.iter().any(|needle| lowered.contains(*needle));

    if mentions(USER_INPUT) {
        SourceKind::UserInput
    } else if mentions(ENVIRONMENT) {
        SourceKind::EnvironmentConfig
    } else if caps.contains(Cap::FILE_IO) && mentions(FILE_READ) {
        // A read-like name without the FILE_IO cap is not treated as a file
        // read; it falls through to the database check.
        SourceKind::FileSystem
    } else if mentions(DATABASE) {
        SourceKind::Database
    } else {
        SourceKind::Unknown
    }
}
/// Map a source kind to its appropriate severity level.
pub fn severity_for_source_kind(kind: SourceKind) -> crate::patterns::Severity {
match kind {
SourceKind::UserInput => crate::patterns::Severity::High,
SourceKind::EnvironmentConfig => crate::patterns::Severity::High,
SourceKind::FileSystem => crate::patterns::Severity::Medium,
SourceKind::Database => crate::patterns::Severity::Medium,
SourceKind::Unknown => crate::patterns::Severity::High,
}
}
/// A runtime (config-derived) label rule with owned matchers.
///
/// Built by `build_lang_rules` from the per-language configuration and
/// consulted by `classify` before the built-in static rules.
#[derive(Debug, Clone)]
pub struct RuntimeLabelRule {
// Names to match against. In `classify`, a matcher ending in `_` is
// treated as a case-insensitive prefix; any other matcher must equal the
// text or be a suffix of it preceded by `.` or `:`.
pub matchers: Vec<String>,
// Label returned by `classify` when one of `matchers` hits.
pub label: DataLabel,
}
/// Translate a capability name into its `Cap` flag.
///
/// Matching ignores ASCII case. The special name `"all"` yields
/// `Cap::all()`; a name that is not recognised yields `None`.
pub fn parse_cap(s: &str) -> Option<Cap> {
    if s.eq_ignore_ascii_case("all") {
        return Some(Cap::all());
    }

    let named = [
        ("env_var", Cap::ENV_VAR),
        ("html_escape", Cap::HTML_ESCAPE),
        ("shell_escape", Cap::SHELL_ESCAPE),
        ("url_encode", Cap::URL_ENCODE),
        ("json_parse", Cap::JSON_PARSE),
        ("file_io", Cap::FILE_IO),
        ("fmt_string", Cap::FMT_STRING),
    ];

    named
        .iter()
        .find(|(name, _)| s.eq_ignore_ascii_case(name))
        .map(|&(_, cap)| cap)
}
/// Pre-built analysis rules for a specific language, derived from config.
/// Built once per file and threaded through the pipeline.
///
/// The `Default` value (all lists empty) is what `build_lang_rules` returns
/// when the config has no entry for the language.
#[derive(Debug, Clone, Default)]
pub struct LangAnalysisRules {
// Config-defined source/sanitizer/sink rules, already parsed into labels.
pub extra_labels: Vec<RuntimeLabelRule>,
// Copied verbatim from the language config.
// NOTE(review): presumably names of calls that end control flow — confirm.
pub terminators: Vec<String>,
// Copied verbatim from the language config.
// NOTE(review): presumably names treated as event-handler entry points — confirm.
pub event_handlers: Vec<String>,
}
/// Build `LangAnalysisRules` from a `Config` for a given language slug.
pub fn build_lang_rules(
config: &crate::utils::config::Config,
lang_slug: &str,
) -> LangAnalysisRules {
let Some(lang_cfg) = config.analysis.languages.get(lang_slug) else {
return LangAnalysisRules::default();
};
let extra_labels = lang_cfg
.rules
.iter()
.filter_map(|r| {
let cap = parse_cap(&r.cap)?;
let label = match r.kind.as_str() {
"source" => DataLabel::Source(cap),
"sanitizer" => DataLabel::Sanitizer(cap),
"sink" => DataLabel::Sink(cap),
_ => return None,
};
Some(RuntimeLabelRule {
matchers: r.matchers.clone(),
label,
})
})
.collect();
LangAnalysisRules {
extra_labels,
terminators: lang_cfg.terminators.clone(),
event_handlers: lang_cfg.event_handlers.clone(),
}
}
/// Case-insensitive suffix check (ASCII).
#[inline]
fn ends_with_ignore_case(haystack: &[u8], needle: &[u8]) -> bool {
@ -223,29 +364,58 @@ fn starts_with_ignore_case(haystack: &[u8], needle: &[u8]) -> bool {
/// Try to classify a piece of syntax text.
/// `lang` is the canonicalised language key ("rust", "javascript", ...).
///
/// If `extra` runtime rules are provided, they are checked **first** (config
/// takes priority over built-in rules).
///
/// **Two-pass matching** -- exact / suffix matches are checked across *all*
/// rules before any prefix (`foo_`) match is attempted. This prevents a
/// greedy prefix like `sanitize_` from shadowing a more specific exact
/// match like `sanitize_shell`.
pub fn classify(lang: &str, text: &str) -> Option<DataLabel> {
// Lang slugs are already lowercase; try direct lookup first to avoid
// allocating a lowercased copy.
pub fn classify(lang: &str, text: &str, extra: Option<&[RuntimeLabelRule]>) -> Option<DataLabel> {
let head = text.split(['(', '<']).next().unwrap_or("");
let trimmed = head.trim().as_bytes();
// ── Check runtime (config) rules first — they take priority ──────
if let Some(extras) = extra {
// Pass 1: exact / suffix
for rule in extras {
for raw in &rule.matchers {
let m = raw.as_bytes();
if m.last() == Some(&b'_') {
continue;
}
if ends_with_ignore_case(trimmed, m) {
let start = trimmed.len() - m.len();
let ok = start == 0 || matches!(trimmed[start - 1], b'.' | b':');
if ok {
return Some(rule.label);
}
}
}
}
// Pass 2: prefix
for rule in extras {
for raw in &rule.matchers {
let m = raw.as_bytes();
if m.last() == Some(&b'_') && starts_with_ignore_case(trimmed, m) {
return Some(rule.label);
}
}
}
}
// ── Built-in static rules ────────────────────────────────────────
let rules = REGISTRY.get(lang).or_else(|| {
let key = lang.to_ascii_lowercase();
REGISTRY.get(key.as_str())
})?;
let head = text.split(['(', '<']).next().unwrap_or("");
let trimmed = head.trim().as_bytes();
// Pass 1: exact / suffix matches (high confidence)
// Matchers are already lowercase &'static str, so we compare with
// case-insensitive byte helpers — zero heap allocations.
for rule in *rules {
for raw in rule.matchers {
let m = raw.as_bytes();
if m.last() == Some(&b'_') {
continue; // skip prefix matchers in pass 1
continue;
}
if ends_with_ignore_case(trimmed, m) {
let start = trimmed.len() - m.len();
@ -269,3 +439,72 @@ pub fn classify(lang: &str, text: &str) -> Option<DataLabel> {
None
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Passing no runtime rules must leave the built-in behaviour intact.
    #[test]
    fn classify_none_extra_unchanged() {
        // Built-in rule: innerHTML → Sink(HTML_ESCAPE)
        let result = classify("javascript", "innerHTML", None);
        assert_eq!(result, Some(DataLabel::Sink(Cap::HTML_ESCAPE)));

        // Non-existent should still be None
        let result = classify("javascript", "myCustomFunc", None);
        assert_eq!(result, None);
    }

    /// Runtime rules are honoured, and names they do not cover still fall
    /// through to the built-in rules.
    #[test]
    fn classify_extra_rules_take_priority() {
        let extras = vec![RuntimeLabelRule {
            matchers: vec!["escapeHtml".into()],
            label: DataLabel::Sanitizer(Cap::HTML_ESCAPE),
        }];

        let result = classify("javascript", "escapeHtml", Some(&extras));
        assert_eq!(result, Some(DataLabel::Sanitizer(Cap::HTML_ESCAPE)));

        // Built-in rules still work
        let result = classify("javascript", "innerHTML", Some(&extras));
        assert_eq!(result, Some(DataLabel::Sink(Cap::HTML_ESCAPE)));
    }

    /// A runtime rule that names a built-in matcher wins over the built-in.
    #[test]
    fn classify_extra_overrides_builtin() {
        // Override innerHTML to be a sanitizer (contrived but tests priority)
        let extras = vec![RuntimeLabelRule {
            matchers: vec!["innerHTML".into()],
            label: DataLabel::Sanitizer(Cap::HTML_ESCAPE),
        }];

        let result = classify("javascript", "innerHTML", Some(&extras));
        assert_eq!(result, Some(DataLabel::Sanitizer(Cap::HTML_ESCAPE)));
    }

    #[test]
    fn classify_location_href_is_sink() {
        let result = classify("javascript", "location.href", None);
        assert_eq!(result, Some(DataLabel::Sink(Cap::URL_ENCODE)));
    }

    #[test]
    fn classify_bare_href_is_none() {
        // Bare "href" should NOT be a sink — only "location.href" and variants
        let result = classify("javascript", "href", None);
        assert_eq!(result, None);
    }

    /// Every name accepted by `parse_cap` is covered, including
    /// `fmt_string`, plus case-insensitivity and the rejection path.
    #[test]
    fn parse_cap_works() {
        assert_eq!(parse_cap("html_escape"), Some(Cap::HTML_ESCAPE));
        assert_eq!(parse_cap("shell_escape"), Some(Cap::SHELL_ESCAPE));
        assert_eq!(parse_cap("url_encode"), Some(Cap::URL_ENCODE));
        assert_eq!(parse_cap("json_parse"), Some(Cap::JSON_PARSE));
        assert_eq!(parse_cap("env_var"), Some(Cap::ENV_VAR));
        assert_eq!(parse_cap("file_io"), Some(Cap::FILE_IO));
        assert_eq!(parse_cap("fmt_string"), Some(Cap::FMT_STRING));
        assert_eq!(parse_cap("all"), Some(Cap::all()));
        assert_eq!(parse_cap("ALL"), Some(Cap::all()));
        assert_eq!(parse_cap("invalid"), None);
    }
}

View file

@ -22,6 +22,19 @@ pub static RULES: &[LabelRule] = &[
matchers: &["sys.argv"],
label: DataLabel::Source(Cap::all()),
},
LabelRule {
matchers: &["open"],
label: DataLabel::Source(Cap::all()),
},
LabelRule {
matchers: &[
"argparse.parse_args",
"urllib.request.urlopen",
"requests.get",
"requests.post",
],
label: DataLabel::Source(Cap::all()),
},
// ───────── Sanitizers ──────────
LabelRule {
matchers: &["html.escape"],

View file

@ -11,6 +11,7 @@ pub mod database;
pub mod errors;
pub mod interop;
pub mod labels;
pub mod output;
pub mod patterns;
pub mod summary;
pub mod symbol;
@ -25,5 +26,5 @@ use utils::config::Config;
/// Run a two-pass scan without index (filesystem only).
/// This is the primary entry point for integration tests.
pub fn scan_no_index(root: &Path, cfg: &Config) -> NyxResult<Vec<commands::scan::Diag>> {
commands::scan::scan_filesystem(root, cfg)
commands::scan::scan_filesystem(root, cfg, false)
}

View file

@ -7,6 +7,7 @@ mod database;
mod errors;
mod interop;
mod labels;
mod output;
mod patterns;
mod summary;
mod symbol;
@ -65,19 +66,28 @@ fn main() -> NyxResult<()> {
let database_dir = proj_dirs.data_local_dir();
fs::create_dir_all(database_dir)?;
let mut config = Config::load(config_dir)?;
let (mut config, config_note) = Config::load(config_dir)?;
rayon::ThreadPoolBuilder::new()
.stack_size(config.performance.rayon_thread_stack_size)
.build_global()
.expect("set rayon stack size");
commands::handle_command(cli.command, database_dir, &mut config)?;
let quiet = config.output.quiet || cli.command.is_structured_output();
println!(
"{} in {:.3}s.",
style("Finished").green().bold(),
now.elapsed().as_secs_f32()
);
// Print config note before scanning (human-readable mode only).
if let Some(note) = config_note.filter(|_| !quiet) {
eprint!("{note}");
}
commands::handle_command(cli.command, database_dir, config_dir, &mut config)?;
if !quiet {
println!(
"{} in {:.3}s.",
style("Finished").green().bold(),
now.elapsed().as_secs_f32()
);
}
Ok(())
}

152
src/output.rs Normal file
View file

@ -0,0 +1,152 @@
use crate::commands::scan::Diag;
use crate::patterns::{self, Severity};
use once_cell::sync::Lazy;
use serde_json::{Value, json};
use std::collections::HashMap;
use std::path::Path;
/// Pattern ID → description, gathered lazily from every language's pattern
/// registry on first access.
///
/// Languages are visited in the order listed below; when the same ID shows
/// up more than once, the description seen first is kept.
static PATTERN_DESCRIPTIONS: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {
    const LANGUAGES: [&str; 10] = [
        "rust",
        "c",
        "cpp",
        "java",
        "go",
        "php",
        "python",
        "ruby",
        "javascript",
        "typescript",
    ];

    let mut descriptions = HashMap::new();
    for lang in LANGUAGES.iter() {
        for pattern in patterns::load(lang) {
            descriptions
                .entry(pattern.id)
                .or_insert(pattern.description);
        }
    }
    descriptions
});
/// Description text for the CFG rules, which are not part of the pattern
/// registry. Returns `None` for any ID that is not one of those rules.
fn cfg_rule_description(id: &str) -> Option<&'static str> {
    const DESCRIPTIONS: [(&str, &str); 6] = [
        (
            "cfg-unguarded-sink",
            "Dangerous sink reachable without prior guard or sanitizer",
        ),
        ("cfg-unreachable-sink", "Sink in unreachable code"),
        (
            "cfg-auth-gap",
            "Entry-point handler reaches sink without authentication check",
        ),
        (
            "cfg-error-fallthrough",
            "Error check does not terminate; dangerous call follows on error path",
        ),
        (
            "cfg-resource-leak",
            "Resource acquired but not released on all exit paths",
        ),
        (
            "cfg-lock-not-released",
            "Lock acquired but not released on all exit paths",
        ),
    ];

    DESCRIPTIONS
        .iter()
        .find(|(rule, _)| *rule == id)
        .map(|&(_, text)| text)
}
fn rule_description(id: &str) -> &str {
// Strip taint-specific suffix for lookup (e.g. "taint-unsanitised-flow:foo.rs:42" → base)
let base_id = if id.starts_with("taint-") {
"taint-unsanitised-flow"
} else {
id
};
if let Some(desc) = PATTERN_DESCRIPTIONS.get(base_id) {
return desc;
}
if let Some(desc) = cfg_rule_description(base_id) {
return desc;
}
if base_id == "taint-unsanitised-flow" {
return "Unsanitised data flows from source to sink";
}
id
}
/// Translate a finding severity into the string that `build_sarif` emits as
/// the `level` of a SARIF result: `High` → `"error"`, `Medium` →
/// `"warning"`, `Low` → `"note"`.
fn severity_to_level(sev: Severity) -> &'static str {
match sev {
Severity::High => "error",
Severity::Medium => "warning",
Severity::Low => "note",
}
}
pub fn build_sarif(diags: &[Diag], scan_root: &Path) -> Value {
// Deduplicate rule IDs and build rules array.
let mut rule_ids: Vec<String> = Vec::new();
let mut rule_index_map: HashMap<String, usize> = HashMap::new();
for d in diags {
let base = if d.id.starts_with("taint-") {
"taint-unsanitised-flow".to_string()
} else {
d.id.clone()
};
if !rule_index_map.contains_key(&base) {
let idx = rule_ids.len();
rule_index_map.insert(base.clone(), idx);
rule_ids.push(base);
}
}
let rules: Vec<Value> = rule_ids
.iter()
.map(|id| {
json!({
"id": id,
"shortDescription": { "text": rule_description(id) },
})
})
.collect();
let results: Vec<Value> = diags
.iter()
.map(|d| {
let base = if d.id.starts_with("taint-") {
"taint-unsanitised-flow"
} else {
&d.id
};
let rule_index = rule_index_map[base];
// Make path relative to scan root if possible
let uri = Path::new(&d.path)
.strip_prefix(scan_root)
.map(|p| p.to_string_lossy().to_string())
.unwrap_or_else(|_| d.path.clone());
json!({
"ruleId": base,
"ruleIndex": rule_index,
"level": severity_to_level(d.severity),
"message": { "text": rule_description(base) },
"locations": [{
"physicalLocation": {
"artifactLocation": { "uri": uri },
"region": {
"startLine": d.line,
"startColumn": d.col
}
}
}]
})
})
.collect();
json!({
"$schema": "https://raw.githubusercontent.com/oasis-tcs/sarif-spec/main/sarif-2.1/schema/sarif-schema-2.1.0.json",
"version": "2.1.0",
"runs": [{
"tool": {
"driver": {
"name": "nyx",
"version": env!("CARGO_PKG_VERSION"),
"informationUri": env!("CARGO_PKG_HOMEPAGE"),
"rules": rules
}
},
"results": results
}]
})
}

View file

@ -54,9 +54,10 @@ pub const PATTERNS: &[Pattern] = &[
description: "Assignment to window.location / location.href",
query: "(assignment_expression
left: (member_expression
object: (identifier)? @obj
object: (identifier) @obj
(#match? @obj \"^(window|location|document|self|top|parent|frames)$\")
property: (property_identifier) @prop
(#match? @prop \"location|href\"))) @vuln",
(#match? @prop \"^(location|href)$\"))) @vuln",
severity: Severity::High,
},
Pattern {
@ -77,7 +78,7 @@ pub const PATTERNS: &[Pattern] = &[
left: (member_expression
property: (property_identifier) @prop
(#eq? @prop \"__proto__\"))) @vuln",
severity: Severity::High,
severity: Severity::Low,
},
Pattern {
id: "weak_hash_md5",

View file

@ -23,14 +23,33 @@ pub enum Severity {
Low,
}
impl Severity {
    /// Bracketed, colored severity tag padded for column alignment.
    ///
    /// Only the label inside the brackets is styled. Trailing spaces pad
    /// the tag to 8 visible characters, the width of `"[MEDIUM]"`, so
    /// `"[HIGH]"` gets two spaces, `"[LOW]"` three and `"[MEDIUM]"` none.
    pub fn colored_tag(self) -> String {
        /// Visible width of the widest tag, `"[MEDIUM]"`.
        const TAG_WIDTH: usize = 8;

        let (label, painted) = match self {
            Severity::High => ("HIGH", style("HIGH").red().bold().to_string()),
            Severity::Medium => ("MEDIUM", style("MEDIUM").yellow().bold().to_string()),
            Severity::Low => ("LOW", style("LOW").cyan().bold().to_string()),
        };

        // Width is measured on the plain label: the styled string may carry
        // escape sequences that do not occupy columns.
        let visible = label.len() + 2; // "[" + label + "]"
        let padding = " ".repeat(TAG_WIDTH.saturating_sub(visible));
        format!("[{painted}]{padding}")
    }
}
impl fmt::Display for Severity {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let s = match *self {
let styled = match *self {
Severity::High => style("HIGH").red().bold().to_string(),
Severity::Medium => style("MEDIUM").yellow().bold().to_string(),
Severity::Low => style("LOW").cyan().bold().to_string(),
};
f.write_str(&s)
f.write_str(&styled)
}
}

View file

@ -209,6 +209,13 @@ impl GlobalSummaries {
.unwrap_or_default()
}
/// Fold every entry of `other` into `self` (for parallel fold/reduce).
///
/// Each `(key, summary)` pair is handed to `insert`, so a key present on
/// both sides is resolved however `insert` resolves it.
pub fn merge(&mut self, other: GlobalSummaries) {
    other.by_key.into_iter().for_each(|(key, summary)| {
        self.insert(key, summary);
    });
}
#[allow(dead_code)]
pub fn is_empty(&self) -> bool {
self.by_key.is_empty()

View file

@ -1,6 +1,6 @@
use crate::cfg::{Cfg, FuncSummaries, NodeInfo, StmtKind};
use crate::interop::InteropEdge;
use crate::labels::{Cap, DataLabel};
use crate::labels::{Cap, DataLabel, SourceKind};
use crate::summary::GlobalSummaries;
use crate::symbol::Lang;
use petgraph::graph::NodeIndex;
@ -18,18 +18,28 @@ pub struct Finding {
/// The full path from source to sink through the CFG.
#[allow(dead_code)] // used for future detailed diagnostics / path display
pub path: Vec<NodeIndex>,
/// The kind of source that originated the taint.
pub source_kind: SourceKind,
}
/// Order-independent hash of a taint map.
///
/// Uses XOR of per-entry hashes so the result is the same regardless of
/// iteration order — no allocation or sorting required.
fn taint_hash(taint: &HashMap<String, Cap>) -> u64 {
let mut v: Vec<_> = taint.iter().collect();
v.sort_by_key(|(k, _)| k.as_str());
let mut hasher = blake3::Hasher::new();
for (k, bits) in v {
hasher.update(k.as_bytes());
hasher.update(&bits.bits().to_le_bytes());
let mut h: u64 = 0;
for (k, bits) in taint {
// Per-entry hash: FNV-1a-style mixing of key bytes + cap bits.
let mut entry_h: u64 = 0xcbf2_9ce4_8422_2325; // FNV offset basis
for b in k.as_bytes() {
entry_h ^= *b as u64;
entry_h = entry_h.wrapping_mul(0x0100_0000_01b3); // FNV prime
}
entry_h ^= bits.bits() as u64;
entry_h = entry_h.wrapping_mul(0x0100_0000_01b3);
h ^= entry_h;
}
let digest = hasher.finalize();
u64::from_le_bytes(digest.as_bytes()[0..8].try_into().unwrap())
h
}
/// Resolved summary for a callee — a uniform view regardless of whether the
@ -140,18 +150,21 @@ fn resolve_callee(
None
}
/// Apply taint transfer for a single node, mutating `out` in place.
///
/// Callers should clone the taint map before calling if they need
/// the original state preserved.
fn apply_taint(
node: &NodeInfo,
taint: &HashMap<String, Cap>,
out: &mut HashMap<String, Cap>,
local_summaries: &FuncSummaries,
global_summaries: Option<&GlobalSummaries>,
caller_lang: Lang,
caller_namespace: &str,
interop_edges: &[InteropEdge],
) -> HashMap<String, Cap> {
) {
debug!(target: "taint", "Applying taint to node: {:?}", node);
debug!(target: "taint", "Taint: {:?}", taint);
let mut out = taint.clone();
debug!(target: "taint", "Taint: {:?}", out);
let caller_func = node.enclosing_func.as_deref().unwrap_or("");
@ -236,7 +249,7 @@ fn apply_taint(
// ── Sink behaviour: handled in the main analysis loop
// (checked via node.label or resolved summary) ──
return out;
return;
}
// Unresolved call — fall through to default gen/kill below
@ -264,8 +277,6 @@ fn apply_taint(
out.insert(d.clone(), combined);
}
}
out
}
/// Run taint analysis on a single file's CFG.
@ -309,9 +320,10 @@ pub fn analyse_file(
while let Some(Item { node, taint }) = q.pop_front() {
let caller_func = cfg[node].enclosing_func.as_deref().unwrap_or("");
let out = apply_taint(
let mut out = taint.clone();
apply_taint(
&cfg[node],
&taint,
&mut out,
local_summaries,
global_summaries,
caller_lang,
@ -398,26 +410,44 @@ pub fn analyse_file(
}
path.reverse();
// Infer the source kind from the source node's label and callee
let source_kind = match cfg[source_node].label {
Some(DataLabel::Source(caps)) => {
let callee = cfg[source_node].callee.as_deref().unwrap_or("");
crate::labels::infer_source_kind(caps, callee)
}
_ => SourceKind::Unknown,
};
findings.push(Finding {
sink: sink_node,
source: source_node,
path,
source_kind,
});
}
}
// enqueue successors
for succ in cfg.neighbors(node) {
let h = taint_hash(&out);
let key = (succ, h);
// enqueue successors — cache hashes to avoid recomputation
let out_h = taint_hash(&out);
let in_h = taint_hash(&taint);
let succs: Vec<_> = cfg.neighbors(node).collect();
for (i, succ) in succs.iter().enumerate() {
let key = (*succ, out_h);
if !seen.contains(&key) {
seen.insert(key);
pred.insert(key, (node, taint_hash(&taint)));
let item = Item {
node: succ,
taint: out.clone(),
pred.insert(key, (node, in_h));
// Move the map into the last successor to avoid a clone
let taint_for_succ = if i + 1 == succs.len() {
std::mem::take(&mut out)
} else {
out.clone()
};
q.push_back(item);
q.push_back(Item {
node: *succ,
taint: taint_for_succ,
});
}
}
}

View file

@ -20,7 +20,7 @@ fn env_to_arg_is_flagged() {
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
assert_eq!(findings.len(), 1); // exactly one unsanitised Source→Sink
@ -49,7 +49,7 @@ fn taint_through_if_else() {
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
// exactly one path (via the True branch) should be flagged
@ -76,7 +76,7 @@ fn taint_through_while_loop() {
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
assert_eq!(findings.len(), 1);
}
@ -102,7 +102,7 @@ fn taint_killed_by_matching_sanitizer() {
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
assert!(
findings.is_empty(),
@ -131,7 +131,7 @@ fn wrong_sanitizer_preserves_taint() {
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
assert_eq!(
findings.len(),
@ -160,7 +160,7 @@ fn taint_breaks_out_of_loop() {
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
assert_eq!(findings.len(), 1);
}
@ -189,7 +189,7 @@ fn test_two_sources_one_sanitised() {
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
assert_eq!(
findings.len(),
@ -222,7 +222,7 @@ fn test_two_sources_wrong_sanitiser_both_flagged() {
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
assert_eq!(
findings.len(),
@ -250,7 +250,7 @@ fn test_should_not_panic_on_empty_function() {
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs");
let (cfg, entry, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let findings = analyse_file(&cfg, entry, &summaries, None, Lang::Rust, "test.rs", &[]);
assert!(findings.is_empty());
}
@ -374,7 +374,7 @@ fn parse_rust(src: &[u8]) -> (Cfg, NodeIndex, FuncSummaries) {
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
.unwrap();
let tree = parser.parse(src, None).unwrap();
build_cfg(&tree, src, "rust", "test.rs")
build_cfg(&tree, src, "rust", "test.rs", None)
}
/// Parse Rust source bytes, build CFG, and export cross-file summaries.
@ -1089,7 +1089,7 @@ fn parse_lang(
"ruby" => "test.rb",
_ => "test.txt",
};
build_cfg(&tree, src, slug, ext)
build_cfg(&tree, src, slug, ext, None)
}
#[test]
@ -2206,7 +2206,7 @@ fn return_call_recognized_as_source() {
.set_language(&Language::from(tree_sitter_rust::LANGUAGE))
.unwrap();
let tree = parser.parse(src as &[u8], None).unwrap();
let (_, _, summaries) = build_cfg(&tree, src, "rust", "test.rs");
let (_, _, summaries) = build_cfg(&tree, src, "rust", "test.rs", None);
let exported = export_summaries(&summaries, "test.rs", "rust");
let foo = exported

View file

@ -2,6 +2,7 @@ use crate::errors::NyxResult;
use crate::patterns::Severity;
use console::style;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fs;
use std::path::Path;
use toml;
@ -55,6 +56,11 @@ pub struct ScannerConfig {
/// Whether to scan hidden files or not.
pub scan_hidden_files: bool,
/// Whether to include findings from non-production paths (tests, vendor,
/// benchmarks, etc.) at their original severity. When false (default),
/// findings in these paths are downgraded by one severity tier.
pub include_nonprod: bool,
}
impl Default for ScannerConfig {
fn default() -> Self {
@ -87,6 +93,7 @@ impl Default for ScannerConfig {
one_file_system: false,
follow_symlinks: false,
scan_hidden_files: false,
include_nonprod: false,
}
}
}
@ -103,7 +110,7 @@ pub struct DatabaseConfig {
/// The maximum size of the database, in megabytes. TODO: IMPLEMENT
pub max_db_size_mb: u64,
/// Whether to run a VACUUM on startup or not. TODO: IMPLEMENT
/// Whether to run a VACUUM on startup or not.
pub vacuum_on_startup: bool,
}
impl Default for DatabaseConfig {
@ -120,10 +127,10 @@ impl Default for DatabaseConfig {
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(default)]
pub struct OutputConfig {
/// The default output format. TODO: IMPLEMENT others
/// The default output format.
pub default_format: String,
/// Whether to print anything to the console or not. TODO: IMPLEMENT
/// Whether to print anything to the console or not.
pub quiet: bool,
/// The maximum number of results to show.
@ -147,10 +154,10 @@ pub struct PerformanceConfig {
///
/// A depth of `1` includes all files under the current directory, a depth of `2` also includes
/// all files under subdirectories of the current directory, etc.
pub max_depth: Option<usize>, // TODO: IMPLEMENT
pub max_depth: Option<usize>,
/// The minimum depth for reported entries, or `None`.
pub min_depth: Option<usize>, // TODO: IMPLEMENT
pub min_depth: Option<usize>,
/// Whether to stop traversing into matching directories.
pub prune: bool,
@ -190,6 +197,33 @@ impl Default for PerformanceConfig {
}
}
/// A single user-defined label rule from config.
///
/// The struct carries no `#[serde(default)]`, so `matchers`, `kind` and `cap`
/// must all be present for a rule to deserialize. `PartialEq`/`Eq` are derived
/// so that two rules can be compared field-by-field; `merge_configs` uses that
/// comparison to skip rules that are already present.
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
pub struct ConfigLabelRule {
/// Names this rule applies to, e.g. `"escapeHtml"`.
/// NOTE(review): presumably callee/function names — confirm the matching
/// semantics against the code that consumes these rules.
pub matchers: Vec<String>,
/// Role assigned to the matched names: "source", "sanitizer", or "sink".
/// Stored as a free-form string; no validation happens in this struct.
pub kind: String,
/// Capability name: "html_escape", "shell_escape", "url_encode", "json_parse",
/// "env_var", "file_io", or "all"
pub cap: String,
}
/// Per-language analysis configuration from config file.
///
/// `#[serde(default)]` together with the derived `Default` means any field
/// omitted from the config file deserializes to an empty list.
#[derive(Debug, Serialize, Deserialize, Clone, Default, PartialEq, Eq)]
#[serde(default)]
pub struct LanguageAnalysisConfig {
/// User-defined source / sanitizer / sink rules for this language.
pub rules: Vec<ConfigLabelRule>,
/// Extra terminator names, e.g. `"process.exit"`.
/// NOTE(review): presumably calls that end control flow — confirm with the
/// CFG builder that reads this list.
pub terminators: Vec<String>,
/// Extra event-handler names, e.g. `"addEventListener"`.
/// NOTE(review): exact effect on the analysis is not visible here — confirm.
pub event_handlers: Vec<String>,
}
/// Top-level analysis rules config, keyed by language slug.
///
/// Exposed as the `analysis` field of `Config`, so in TOML a language's
/// settings live under `[analysis.languages.<slug>]` (for example
/// `[analysis.languages.javascript]`). With `#[serde(default)]`, a config file
/// without an `analysis` section yields an empty map.
#[derive(Debug, Serialize, Deserialize, Clone, Default, PartialEq, Eq)]
#[serde(default)]
pub struct AnalysisRulesConfig {
/// Per-language settings; the key is the language slug (e.g. `"javascript"`).
pub languages: HashMap<String, LanguageAnalysisConfig>,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(default)]
#[derive(Default)]
@ -198,10 +232,16 @@ pub struct Config {
pub database: DatabaseConfig,
pub output: OutputConfig,
pub performance: PerformanceConfig,
pub analysis: AnalysisRulesConfig,
}
impl Config {
pub fn load(config_dir: &Path) -> NyxResult<Self> {
/// Load config and return `(config, optional_note)`.
///
/// The note is a formatted status message about which config file was
/// loaded (or that defaults are in use). The caller decides whether to
/// print it based on output format / quiet mode.
pub fn load(config_dir: &Path) -> NyxResult<(Self, Option<String>)> {
let mut config = Config::default();
let default_config_path = config_dir.join("nyx.conf");
@ -210,33 +250,33 @@ impl Config {
}
let user_config_path = config_dir.join("nyx.local");
if user_config_path.exists() {
let note = if user_config_path.exists() {
let user_config_content = fs::read_to_string(&user_config_path)?;
let user_config: Config = toml::from_str(&user_config_content)?;
config = merge_configs(config, user_config);
println!(
Some(format!(
"{}: Loaded user config from: {}\n",
style("note").green().bold(),
style(user_config_path.display())
.underlined()
.white()
.bold()
);
))
} else {
println!(
"{}: Using {} configuration.\n Create file in '{}'to customize.\n",
Some(format!(
"{}: Using {} configuration.\n Create file in '{}' to customize.\n",
style("note").green().bold(),
style("default").bold(),
style(user_config_path.display())
.underlined()
.white()
.bold()
);
}
))
};
Ok(config)
Ok((config, note))
}
}
@ -262,6 +302,7 @@ fn merge_configs(mut default: Config, user: Config) -> Config {
default.scanner.one_file_system = user.scanner.one_file_system;
default.scanner.follow_symlinks = user.scanner.follow_symlinks;
default.scanner.scan_hidden_files = user.scanner.scan_hidden_files;
default.scanner.include_nonprod = user.scanner.include_nonprod;
// Merge exclusion lists (default ⊔ user), then sort & dedupe
default
@ -299,6 +340,32 @@ fn merge_configs(mut default: Config, user: Config) -> Config {
default.performance.scan_timeout_secs = user.performance.scan_timeout_secs;
default.performance.memory_limit_mb = user.performance.memory_limit_mb;
// --- AnalysisRulesConfig ---
for (lang, user_lang_cfg) in user.analysis.languages {
let entry = default.analysis.languages.entry(lang).or_default();
// Union-merge rules with dedup
for rule in user_lang_cfg.rules {
if !entry.rules.contains(&rule) {
entry.rules.push(rule);
}
}
// Union-merge terminators with dedup
for t in user_lang_cfg.terminators {
if !entry.terminators.contains(&t) {
entry.terminators.push(t);
}
}
// Union-merge event_handlers with dedup
for eh in user_lang_cfg.event_handlers {
if !entry.event_handlers.contains(&eh) {
entry.event_handlers.push(eh);
}
}
}
default
}
@ -318,6 +385,72 @@ fn merge_configs_dedupes_and_keeps_order() {
);
}
#[test]
fn merge_analysis_rules_unions_and_dedupes() {
    // Every rule in this test is a sanitizer with a single matcher, so build
    // them through one small constructor.
    let sanitizer = |matcher: &str, cap: &str| ConfigLabelRule {
        matchers: vec![matcher.to_string()],
        kind: "sanitizer".to_string(),
        cap: cap.to_string(),
    };

    // Base ("default") side: one rule, one terminator, no event handlers.
    let base_js = LanguageAnalysisConfig {
        rules: vec![sanitizer("escapeHtml", "html_escape")],
        terminators: vec!["process.exit".to_string()],
        event_handlers: Vec::new(),
    };

    // User side: repeats the base rule and terminator, and adds one new rule,
    // one new terminator and one event handler.
    let user_js = LanguageAnalysisConfig {
        rules: vec![
            sanitizer("escapeHtml", "html_escape"),
            sanitizer("sanitizeUrl", "url_encode"),
        ],
        terminators: vec!["process.exit".to_string(), "abort".to_string()],
        event_handlers: vec!["addEventListener".to_string()],
    };

    let mut base = Config::default();
    base.analysis
        .languages
        .insert("javascript".to_string(), base_js);

    let mut user = Config::default();
    user.analysis
        .languages
        .insert("javascript".to_string(), user_js);

    let merged = merge_configs(base, user);
    let js = merged
        .analysis
        .languages
        .get("javascript")
        .expect("javascript entry should survive the merge");

    // The duplicated rule is kept once, the new one is appended.
    assert_eq!(js.rules.len(), 2);
    // Base entries come first; user-only entries follow in their own order.
    assert_eq!(js.terminators, ["process.exit", "abort"]);
    assert_eq!(js.event_handlers, ["addEventListener"]);
}
#[test]
fn analysis_config_toml_roundtrip() {
    // Only the TOML -> `Config` direction is exercised: the document below is
    // parsed and every analysis field is checked against the literal values.
    let document = r#"
[analysis.languages.javascript]
terminators = ["process.exit"]
event_handlers = ["addEventListener"]
[[analysis.languages.javascript.rules]]
matchers = ["escapeHtml"]
kind = "sanitizer"
cap = "html_escape"
"#;

    let parsed = toml::from_str::<Config>(document).unwrap();
    let js = parsed.analysis.languages.get("javascript").unwrap();

    // Exactly one rule, carrying the three fields from the `rules` table.
    assert_eq!(js.rules.len(), 1);
    let rule = &js.rules[0];
    assert_eq!(rule.matchers, ["escapeHtml"]);
    assert_eq!(rule.kind, "sanitizer");
    assert_eq!(rule.cap, "html_escape");

    // The plain string lists on the language table.
    assert_eq!(js.terminators, ["process.exit"]);
    assert_eq!(js.event_handlers, ["addEventListener"]);
}
#[test]
fn load_creates_example_and_reads_user_overrides() {
let cfg_dir = tempfile::tempdir().unwrap();
@ -333,7 +466,7 @@ fn load_creates_example_and_reads_user_overrides() {
"#;
fs::write(cfg_path.join("nyx.local"), user_toml).unwrap();
let cfg = Config::load(cfg_path).expect("Config::load should succeed");
let (cfg, _note) = Config::load(cfg_path).expect("Config::load should succeed");
assert!(cfg_path.join("nyx.conf").is_file());

View file

@ -61,6 +61,11 @@ fn build_overrides(root: &Path, cfg: &Config) -> ignore::overrides::Override {
tracing::warn!("invalid excludedir pattern {dir}: {e}");
}
}
for file in &cfg.scanner.excluded_files {
if let Err(e) = ob.add(&format!("!{file}")) {
tracing::warn!("invalid excludefile pattern {file}: {e}");
}
}
ob.build().unwrap_or_else(|e| {
tracing::error!("failed to build ignore overrides: {e}");
@ -83,6 +88,9 @@ pub fn spawn_file_walker(root: &Path, cfg: &Config) -> (Receiver<Paths>, JoinHan
let follow = cfg.scanner.follow_symlinks;
let max_bytes = cfg.scanner.max_file_size_mb.unwrap_or(0) * 1_048_576;
let batch_size = cfg.performance.batch_size;
let max_depth = cfg.performance.max_depth;
let same_file_system = cfg.scanner.one_file_system;
let require_git = cfg.scanner.require_git_to_read_vcsignore;
// ----- 3 the background walker thread ---------------------------------
let handle = thread::spawn(move || {
@ -96,11 +104,18 @@ pub fn spawn_file_walker(root: &Path, cfg: &Config) -> (Receiver<Paths>, JoinHan
"starting directory walk"
);
WalkBuilder::new(root)
let mut builder = WalkBuilder::new(root);
builder
.hidden(!scan_hidden)
.follow_links(follow)
.threads(workers)
.overrides(overrides)
.same_file_system(same_file_system)
.require_git(require_git);
if let Some(depth) = max_depth {
builder.max_depth(Some(depth));
}
builder
.filter_entry(|e| {
e.file_type()
.map(|ft| ft.is_dir() || ft.is_file())