Feat/full cfg (#30)

* feat: Enhance control flow analysis with function summaries and taint analysis * feat: Update taint analysis to utilize function summaries for enhanced tracking * Refactor `walk.rs` batch processing and override handling: - Renamed `Batcher` to `BatchSender` for clarity. - Added `BatchSender::new` constructor for cleaner initialization. - Simplified batch size management in `BatchSender`. - Extracted `build_overrides` function for reusable override construction. - Improved error handling and validation in override building. - Enhanced performance with directory and file type filtering in `walk`. * Improve logging and streamline directory walk process: - Added detailed `tracing` logs for debugging batch flushes, override construction, and walk initialization/completion. - Optimized and simplified `filter_entry` logic for directory and file type filters. - Improved metadata checks and max file size enforcement during the scan. * Refactor and optimize taint tracking, label rules, and directory walk process: - Replaced `DefaultHasher` with `blake3::Hasher` for improved taint hashing. - Enhanced sorting and hashing logic in `taint.rs` for consistency and efficiency. - Removed unused `set_hash` function and redundant imports across files. - Improved batch sender logic in `walk.rs`, renaming key components for clarity. - Unified `spawn_senders` and `spawn_file_walker` with thread handling and channel tuple return. - Expanded label rules with additional matchers for sources, sanitizers, and sinks. - Deprecated `dump_cfg` and specific logging utilities in `cfg.rs` for code cleanup. 
* fix: fixed let chains error in walk.rs * fix: updated dependencies * fix: updated dependencies * chore: Remove standard error in scan.rs * feat: Introduce function summaries for enhanced taint and control flow analysis * feat: Enhance taint analysis with interop support and function summaries * feat: Add configuration analysis module and enhance matcher rules * feat: Add arity column to function_summaries and handle schema migration * fix: fixed clippy &PathBuf warnings * chore: Update dependencies and versioning in Cargo files * docs: Update README to enhance clarity and detail on features and analysis modes * chore: Update CHANGELOG for version 0.2.0 with new features, changes, and fixes * docs: Update SECURITY.md to clarify version support status --------- Co-authored-by: elipeter <eli.peter@es.fcm.travel>
2026-07-03 20:41:00 +02:00 · 2026-02-24 23:44:07 -05:00 · 2026-02-24 23:44:07 -05:00 · f96a89e7c1
commit f96a89e7c1
parent 8cbbec7d90
87 changed files with 11505 additions and 1099 deletions
--- a/examples/cfg_analysis/example.js
+++ b/examples/cfg_analysis/example.js
@ -0,0 +1,74 @@
+/**
+ EXPECTED OUTPUT (high-level):
+
+ 1) cfg-unguarded-sink (High / High confidence)
+ - handler(req,res): source req.body.cmd flows to child_process.exec(cmd) without sanitizer/guard.
+ - Should rank high (entry-point-ish function name 'handler', close to entry).
+
+ 2) cfg-auth-gap (High / Medium)
+ - handler is entry-point-ish (name matches handler/route/api conventions).
+ - No auth guard dominates sink (require_auth / is_authenticated / is_admin / authorize).
+
+ 3) cfg-error-fallthrough (Medium / Medium)
+ - Example: if (err) { console.log(err); } then exec(...) still runs.
+ - This is the JS analogue of your Go heuristic. If your implementation only targets Go, this should be NO finding.
+ If you later generalize, this file includes a pattern you can test against.
+
+ 4) cfg-unguarded-sink (HTML) (Medium/High)
+ - req.query.html is written into innerHTML without DOMPurify.sanitize
+
+ 5) No findings for safe paths:
+ - safeHandler (not yet included in this fixture) would use encodeURIComponent before exec (URL_ENCODE sanitizer) OR a dedicated sanitizer you map to SHELL_ESCAPE.
+ NOTE: encodeURIComponent is URL_ENCODE, not SHELL_ESCAPE — so for SHELL_ESCAPE sinks, it may still be flagged depending on your caps logic.
+ The “definitely safe” case would use a dummy sanitize_shell() wrapper to match your Rust-style naming if you add it for JS later.
+ - safeHtml uses DOMPurify.sanitize before innerHTML (HTML_ESCAPE).
+
+ Taint / dataflow:
+ - should find taint from req.body / req.query / process.env sources to exec/eval/innerHTML sinks.
+ */
+
+const child_process = require("child_process");
+
+// ─── Entry-point-ish + unguarded shell sink + auth gap ────────────────────────────
+function handler(req, res) {
+    // Source (Cap::all): req.body
+    const cmd = req.body.cmd;
+
+    // Vulnerable sink (Cap::SHELL_ESCAPE): child_process.exec
+    child_process.exec(cmd);
+
+    res.end("ok");
+}
+
+// ─── Guarded HTML sink (should NOT be flagged) ────────────────────────────────────
+function safeHtml(req, res, DOMPurify) {
+    const html = req.query.html; // Source
+    const cleaned = DOMPurify.sanitize(html); // Sanitizer(HTML_ESCAPE)
+    document.getElementById("app").innerHTML = cleaned; // Sink(HTML_ESCAPE)
+    res.end("ok");
+}
+
+// ─── Unguarded HTML sink (should be flagged) ─────────────────────────────────────
+function unsafeHtml(req, res) {
+    const html = req.query.html; // Source
+    document.getElementById("app").innerHTML = html; // Sink(HTML_ESCAPE) without sanitizer
+    res.end("ok");
+}
+
+// ─── Heuristic error fallthrough pattern (JS analogue) ───────────────────────────
+// If your error-handling analysis is Go-only, ignore this for now.
+// If generalized later, it should be flagged.
+function errFallthrough(req, res) {
+    const err = req.query.err;
+    if (err) {
+        console.log(err);
+    }
+    child_process.exec(req.body.cmd);
+    res.end("ok");
+}
+
+// ─── Optional: eval sink (should be flagged) ─────────────────────────────────────
+function evalSink(req) {
+    const payload = process.env.PAYLOAD; // Source
+    eval(payload); // Sink(SHELL_ESCAPE) per your rules
+}
--- a/examples/cfg_analysis/example.rs
+++ b/examples/cfg_analysis/example.rs
@ -0,0 +1,99 @@
+/*!
+EXPECTED OUTPUT (high-level):
+
+1) cfg-unguarded-sink (High / High confidence)
+   - In handle_request(): user input from std::env::var("INPUT") flows to std::process::Command::new("sh").arg(&input)
+   - No dominating SHELL_ESCAPE sanitizer or validation guard for that value.
+   - This should rank very high in scoring (entry-point-ish name + close to entry + shell sink).
+
+2) cfg-auth-gap (High / Medium confidence)
+   - handle_request() looks like an entry-point (name matches handle_*)
+   - Contains a shell sink without an auth guard (require_auth / is_authenticated / is_admin etc.)
+
+3) cfg-resource-leak (Medium / High or Medium confidence)
+   - alloc_then_return_leak(): malloc without free on an early return path.
+
+4) cfg-unreachable-sanitizer or cfg-unreachable-guard (Medium/Low)
+   - unreachable_sanitizer(): sanitizer call in unreachable block.
+
+5) taint / dataflow (existing BFS taint engine):
+   - should detect at least one taint finding for:
+       env::var source -> Command sink
+   - should NOT flag safe_shell() because it uses shell_escape::unix::escape(&input) and passes `safe`.
+
+Notes:
+- This fixture intentionally contains both vulnerable and safe patterns, plus unreachable code and resource misuse,
+  to exercise cfg_analysis::{unreachable, guards, auth, resources, scoring}.
+*/
+
+use std::process::Command;
+
+// ─── CFG: Entry-point-ish + unguarded sink + auth gap ─────────────────────────────
+
+pub fn handle_request() {
+  // Source (Cap::all)
+  let input = std::env::var("INPUT").unwrap();
+
+  // Vulnerable sink (Cap::SHELL_ESCAPE)
+  Command::new("sh").arg(&input).status().unwrap();
+}
+
+// ─── CFG: Guarded sink (should NOT produce cfg-unguarded-sink) ────────────────────
+
+pub fn safe_shell() {
+  let input = std::env::var("INPUT").unwrap();
+
+  // Sanitizer (Cap::SHELL_ESCAPE)
+  let safe = shell_escape::unix::escape(&input);
+
+  // Sink, but guarded by dominating sanitizer
+  Command::new("sh").arg(&safe).status().unwrap();
+}
+
+// ─── CFG: Unreachable sanitizer (should report unreachable sanitizer/guard) ───────
+
+pub fn unreachable_sanitizer() {
+  let input = std::env::var("INPUT").unwrap();
+
+  return;
+
+  // This block is unreachable; should produce an unreachable finding for sanitizer call.
+  let _safe = shell_escape::unix::escape(&input);
+}
+
+// ─── CFG: Resource misuse (malloc without free on some exit path) ─────────────────
+
+extern "C" {
+  fn malloc(size: usize) -> *mut u8;
+  fn free(ptr: *mut u8);
+}
+
+pub fn alloc_then_return_leak(flag: bool) {
+  unsafe {
+    let p = malloc(128);
+
+    // Early return leaks `p` on this path.
+    if flag {
+      return;
+    }
+
+    free(p);
+  }
+}
+
+// ─── Extra: HTML sink labeling sanity (optional) ──────────────────────────────────
+
+// `sink_html` is a test marker recognized as Sink(HTML_ESCAPE) by the label rules.
+// In real code this would be something like response.body(), template.render(), etc.
+fn sink_html(_s: &str) {}
+
+pub fn html_print() {
+  let raw = std::env::var("HTML").unwrap();
+  sink_html(&raw);
+}
+
+pub fn html_print_sanitized() {
+  let raw = std::env::var("HTML").unwrap();
+  let safe = html_escape::encode_safe(&raw);
+  sink_html(&safe);
+}
--- a/examples/cross-file/config.rs
+++ b/examples/cross-file/config.rs
@ -0,0 +1,36 @@
+// ─────────────────────────────────────────────────────────────────────────────
+// examples/cross-file/config.rs — Sources
+//
+// This module reads untrusted data from the environment and filesystem.
+// Every public function here acts as a **source** — its return value
+// carries taint.
+//
+// ┌─────────────────────────────────────────────────────────────────────────┐
+// │  FuncSummary produced by pass 1:                                       │
+// │                                                                        │
+// │  get_user_command  → source_caps: ALL, sink: 0, sanitizer: 0           │
+// │  get_config_path   → source_caps: ALL, sink: 0, sanitizer: 0           │
+// │  load_template     → source_caps: ALL, sink: 0, sanitizer: 0           │
+// └─────────────────────────────────────────────────────────────────────────┘
+// ─────────────────────────────────────────────────────────────────────────────
+
+use std::env;
+use std::fs;
+
+/// Reads a user-supplied command from the environment.
+/// Taint: SOURCE(ALL) — caller must sanitise before passing to any sink.
+pub fn get_user_command() -> String {
+    env::var("USER_CMD").unwrap_or_default()
+}
+
+/// Reads a path from the environment.
+/// Taint: SOURCE(ALL)
+pub fn get_config_path() -> String {
+    env::var("CONFIG_PATH").unwrap_or_default()
+}
+
+/// Reads an HTML template from disk (path is trusted, *content* is not).
+/// Taint: SOURCE(ALL)
+pub fn load_template(path: &str) -> String {
+    fs::read_to_string(path).unwrap_or_default()
+}
--- a/examples/cross-file/exec.rs
+++ b/examples/cross-file/exec.rs
@ -0,0 +1,41 @@
+// ─────────────────────────────────────────────────────────────────────────────
+// examples/cross-file/exec.rs — Sinks
+//
+// Functions that perform dangerous operations.  Passing tainted data to
+// these without the matching sanitiser is a vulnerability.
+//
+// ┌─────────────────────────────────────────────────────────────────────────┐
+// │  FuncSummary produced by pass 1:                                       │
+// │                                                                        │
+// │  run_command      → sink_caps: SHELL_ESCAPE, tainted_sink_params: [0]  │
+// │  render_page      → sink_caps: HTML_ESCAPE,  tainted_sink_params: [0]  │
+// │  log_and_execute  → sink_caps: SHELL_ESCAPE, source_caps: ALL          │
+// │                     (both a source AND a sink!)                         │
+// └─────────────────────────────────────────────────────────────────────────┘
+// ─────────────────────────────────────────────────────────────────────────────
+
+use std::env;
+use std::process::Command;
+
+/// Executes a shell command.
+/// Taint: SINK(SHELL_ESCAPE) on `cmd` (param 0).
+pub fn run_command(cmd: &str) {
+    Command::new("sh").arg(cmd).status().unwrap();
+}
+
+/// Renders user content into an HTML page.
+/// Taint: SINK(HTML_ESCAPE) on `body` (param 0).
+pub fn render_page(body: &str) {
+    println!("<html><body>{body}</body></html>");
+}
+
+/// Reads an env var *and* shells out — a function that is simultaneously
+/// a source (return value) and a sink (cmd parameter).
+///
+/// This exercises the "independent caps" design: source_caps and sink_caps
+/// are both non-zero on the same summary.
+pub fn log_and_execute(cmd: &str) -> String {
+    let log_path = env::var("LOG_PATH").unwrap_or_default();
+    Command::new("sh").arg(cmd).status().unwrap();
+    log_path
+}
--- a/examples/cross-file/main.rs
+++ b/examples/cross-file/main.rs
@ -0,0 +1,148 @@
+// ─────────────────────────────────────────────────────────────────────────────
+// examples/cross-file/main.rs — The caller
+//
+// This file calls functions from config.rs, sanitize.rs, and exec.rs.
+// It never directly touches std::env, std::fs, or std::process — every
+// source, sanitiser, and sink lives in another file.
+//
+// Nyx's two-pass cross-file taint analysis should:
+//   • Pass 1: summarise config.rs, sanitize.rs, exec.rs
+//   • Pass 2: resolve calls in main.rs against those summaries
+//
+// ─────────────────────────────────────────────────────────────────────────────
+//
+//  EXPECTED NYX OUTPUT
+//  ===================
+//
+//  examples/cross-file/main.rs
+//    42:5   [High]  taint-unsanitised-flow       ← case_1_direct_source_to_sink
+//    66:5   [High]  taint-unsanitised-flow       ← case_3_wrong_sanitiser
+//    90:5   [High]  taint-unsanitised-flow       ← case_5_passthrough_preserves_taint
+//   101:9   [High]  taint-unsanitised-flow       ← case_6_taint_through_branch
+//   134:5   [High]  taint-unsanitised-flow       ← case_8_source_and_sink_same_fn
+//
+//  examples/cross-file/exec.rs
+//    39:5   [High]  taint-unsanitised-flow       ← log_and_execute internal vuln
+//
+//  NO findings expected for:
+//    case_2  (correct sanitiser applied)
+//    case_4  (correct html sanitiser applied)
+//    case_7  (sanitised before branch)
+//
+// ─────────────────────────────────────────────────────────────────────────────
+
+// ─── Case 1: Direct source → sink (UNSAFE) ──────────────────────────────────
+//
+//   get_user_command() returns tainted(ALL)
+//   run_command() is a sink(SHELL_ESCAPE)
+//   No sanitiser in between → FINDING
+//
+fn case_1_direct_source_to_sink() {
+    let cmd = get_user_command();           // tainted(ALL) via cross-file source
+    run_command(&cmd);                      // FINDING: taint reaches shell sink
+}
+
+// ─── Case 2: Correctly sanitised (SAFE) ─────────────────────────────────────
+//
+//   get_user_command() returns tainted(ALL)
+//   sanitize_shell() strips SHELL_ESCAPE
+//   run_command() sinks SHELL_ESCAPE → bit is gone → no finding
+//
+fn case_2_sanitised_before_sink() {
+    let cmd = get_user_command();           // tainted(ALL)
+    let safe = sanitize_shell(&cmd);        // SHELL_ESCAPE bit stripped
+    run_command(&safe);                     // SAFE — no finding
+}
+
+// ─── Case 3: Wrong sanitiser for the sink (UNSAFE) ──────────────────────────
+//
+//   get_user_command() returns tainted(ALL)
+//   sanitize_html() strips HTML_ESCAPE — but NOT SHELL_ESCAPE
+//   run_command() sinks SHELL_ESCAPE → bit still set → FINDING
+//
+fn case_3_wrong_sanitiser() {
+    let cmd = get_user_command();           // tainted(ALL)
+    let wrong = sanitize_html(&cmd);        // strips HTML_ESCAPE only
+    run_command(&wrong);                    // FINDING: SHELL_ESCAPE still set
+}
+
+// ─── Case 4: Correct HTML sanitiser (SAFE) ──────────────────────────────────
+//
+//   load_template() returns tainted(ALL) from file read
+//   sanitize_html() strips HTML_ESCAPE
+//   render_page() sinks HTML_ESCAPE → bit is gone → no finding
+//
+fn case_4_html_sanitised() {
+    let tpl = load_template("page.html");   // tainted(ALL) via cross-file source
+    let safe = sanitize_html(&tpl);         // HTML_ESCAPE bit stripped
+    render_page(&safe);                     // SAFE — no finding
+}
+
+// ─── Case 5: Passthrough preserves taint (UNSAFE) ───────────────────────────
+//
+//   get_user_command() returns tainted(ALL)
+//   passthrough() propagates taint unchanged (propagates_taint = true)
+//   run_command() sinks SHELL_ESCAPE → still tainted → FINDING
+//
+fn case_5_passthrough_preserves_taint() {
+    let cmd = get_user_command();           // tainted(ALL)
+    let same = passthrough(&cmd);           // taint flows through
+    run_command(&same);                     // FINDING: still tainted
+}
+
+// ─── Case 6: Taint flows through only one branch (UNSAFE) ───────────────────
+//
+//   One branch sanitises, the other does not.
+//   The unsanitised branch reaches the sink → FINDING on that path.
+//
+fn case_6_taint_through_branch() {
+    let cmd = get_user_command();           // tainted(ALL)
+    if cmd.len() > 10 {
+        run_command(&cmd);                  // FINDING: unsanitised path
+    } else {
+        let safe = sanitize_shell(&cmd);
+        run_command(&safe);                 // SAFE path
+    }
+}
+
+// ─── Case 7: Sanitised before branch (SAFE) ─────────────────────────────────
+//
+//   Sanitisation happens before the branch → both paths are clean.
+//
+fn case_7_sanitised_before_branch() {
+    let cmd = get_user_command();           // tainted(ALL)
+    let safe = sanitize_shell(&cmd);        // SHELL_ESCAPE stripped
+    if safe.len() > 10 {
+        run_command(&safe);                 // SAFE
+    } else {
+        run_command(&safe);                 // SAFE
+    }
+}
+
+// ─── Case 8: Source-and-sink function (UNSAFE) ──────────────────────────────
+//
+//   log_and_execute() is both:
+//     • a SINK(SHELL_ESCAPE) on its cmd parameter
+//     • a SOURCE(ALL) in its return value (reads env var)
+//
+//   Passing tainted data to it → FINDING for the sink.
+//   Its return value is freshly tainted, but we don't pass it anywhere
+//   dangerous here — so only one finding.
+//
+fn case_8_source_and_sink_same_fn() {
+    let cmd = get_user_command();           // tainted(ALL)
+    let _log = log_and_execute(&cmd);       // FINDING: tainted arg hits shell sink
+    // _log is now tainted(ALL) from log_and_execute's source behaviour,
+    // but we don't use it — no second finding.
+}
+
+fn main() {
+    case_1_direct_source_to_sink();
+    case_2_sanitised_before_sink();
+    case_3_wrong_sanitiser();
+    case_4_html_sanitised();
+    case_5_passthrough_preserves_taint();
+    case_6_taint_through_branch();
+    case_7_sanitised_before_branch();
+    case_8_source_and_sink_same_fn();
+}
--- a/examples/cross-file/sanitize.rs
+++ b/examples/cross-file/sanitize.rs
@ -0,0 +1,30 @@
+// ─────────────────────────────────────────────────────────────────────────────
+// examples/cross-file/sanitize.rs — Sanitizers
+//
+// Functions that clean specific taint capabilities.  After passing through
+// one of these, the corresponding Cap bit is stripped.
+//
+// ┌─────────────────────────────────────────────────────────────────────────┐
+// │  FuncSummary produced by pass 1:                                       │
+// │                                                                        │
+// │  sanitize_shell  → sanitizer_caps: SHELL_ESCAPE, propagates: true      │
+// │  sanitize_html   → sanitizer_caps: HTML_ESCAPE,  propagates: true      │
+// │  passthrough     → sanitizer: 0, source: 0, sink: 0, propagates: true  │
+// └─────────────────────────────────────────────────────────────────────────┘
+// ─────────────────────────────────────────────────────────────────────────────
+
+/// Escapes shell metacharacters.  Strips the SHELL_ESCAPE cap bit.
+pub fn sanitize_shell(input: &str) -> String {
+    shell_escape::unix::escape(input.into()).to_string()
+}
+
+/// Escapes HTML entities.  Strips the HTML_ESCAPE cap bit.
+pub fn sanitize_html(input: &str) -> String {
+    html_escape::encode_safe(input).to_string()
+}
+
+/// Does nothing security-relevant — just returns a copy.
+/// Taint passes straight through (propagates_taint = true).
+pub fn passthrough(input: &str) -> String {
+    input.to_string()
+}
--- a/examples/single-func/example.rs
+++ b/examples/single-func/example.rs
@ -0,0 +1,8 @@
+fn source_env(var: &str) -> String {
+    env::var(var).unwrap_or_default()                          // Source(env-var)
+}
+
+fn main() {
+    let raw = source_env("USER_CMD");
+    Command::new("sh").arg(raw).status().unwrap();
+}
--- a/examples/standard/test.rs
+++ b/examples/standard/test.rs
@ -1,9 +1,30 @@
-use std::{env, process::Command};
-fn main() {
-  let y = env::var("SAFE").unwrap();
+fn source_env(var: &str) -> String {
+    env::var(var).unwrap_or_default()                          // Source(env-var)
+}

-  let x = env::var("DANGEROUS").unwrap();
-  let clean = html_escape::encode_safe(&y);
-  Command::new("sh").arg(x).status().unwrap();
-  Command::new("sh").arg(clean).status().unwrap();
+fn source_file(path: &str) -> String {
+    fs::read_to_string(path).unwrap_or_default()               // Source(file-io)
+}
+
+fn sink_shell(arg: &str) {
+    Command::new("sh").arg(arg).status().unwrap();             // Sink(process-spawn)
+}
+
+fn sink_html(out: &str) {
+    println!("{out}");                                         // Sink(html-out)
+}
+
+fn main() {
+    let raw = source_env("USER_CMD");       // tainted: env-var source
+    let raw2 = source_file("ANOTHER");      // tainted: file-io source
+    let x = source_env("ANOTHER");          // tainted: env-var source
+    if x.len() > 5 {
+        sink_shell(&x);                     // EXPECT: UNSAFE
+        return;
+    } else {
+        let escaped = sanitize_shell(&x);   // NOTE(review): sanitize_shell is not defined in this file — assumed to be a shell sanitiser; confirm
+        sink_shell(&escaped);               // safe
+    }
+    sink_shell(&raw);                       // EXPECT: UNSAFE
+    sink_html(&raw2);                       // NOTE(review): unsanitised file content reaches the HTML sink — presumably UNSAFE too; confirm
 }