diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ded1a99..05ea48fd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,18 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [Unreleased] +## [0.2.0-alpha] - 2025-06-28 + +### Added +- Experimental intra‑procedural CFG + taint analysis for Rust. Nyx now builds a control‑flow graph, applies data‑flow rules, and flags unsanitised Source → Sink paths (e.g. env::var → Command::new). +- O(1) node‑kind lookup via per‑language PHF tables for zero‑cost dispatch. +- Six unit tests covering conditionals, loops, sanitizers, and multiple sources. +- Debug channel target=cfg (use RUST_LOG=cfg=debug) to inspect generated graphs. ### Fixed - Fixed a bug in the release pipeline where Windows was trying to call the zip, PowerShell doesn't have a zip command -## [0.1.1] - 2025-06-25 +## [0.1.1-alpha] - 2025-06-25 ### Fixed - Fixed a bug where the `scan --no-index` command would not respect the `max_results` config setting (#1) @@ -18,7 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - Integration tests covering indexing and scanning pipelines (#3, #4, #5, #8) -## [0.1.0] - 2025-06-25 +## [0.1.0-alpha] - 2025-06-25 ### Added - Initial alpha release of **Nyx** CLI tool diff --git a/Cargo.lock b/Cargo.lock index 824ce5c2..e570effc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + [[package]] name = "android-tzdata" version = "0.1.1" @@ -62,7 +68,7 @@ version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -73,7 +79,7 @@ checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -210,15 +216,15 @@ checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "console" -version = "0.15.11" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8" +checksum = "2e09ced7ebbccb63b4c65413d821f2e00ce54c5ca4514ddc6b3c892fdbcbc69d" dependencies = [ "encode_unicode", "libc", "once_cell", "unicode-width", - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -308,7 +314,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -336,7 +342,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" dependencies = [ "libc", - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -357,6 +363,12 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + [[package]] name = "foldhash" version = "0.1.5" @@ -405,6 +417,8 @@ version = "0.15.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" dependencies = [ + "allocator-api2", + "equivalent", "foldhash", ] @@ -608,8 +622,9 @@ dependencies = [ [[package]] name = "nyx-scanner" -version = 
"0.1.1" +version = "0.2.0-alpha" dependencies = [ + "bitflags", "blake3", "bytesize", "chrono", @@ -621,6 +636,8 @@ dependencies = [ "ignore", "num_cpus", "once_cell", + "petgraph", + "phf", "r2d2", "r2d2_sqlite", "rayon", @@ -688,7 +705,62 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "petgraph" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54acf3a685220b533e437e264e4d932cfbdc4cc7ec0cd232ed73c08d03b8a7ca" +dependencies = [ + "fixedbitset", + "hashbrown", + "indexmap", + "serde", +] + +[[package]] +name = "phf" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" +dependencies = [ + "phf_macros", + "phf_shared", + "serde", +] + +[[package]] +name = "phf_generator" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cbb1126afed61dd6368748dae63b1ee7dc480191c6262a3b4ff1e29d86a6c5b" +dependencies = [ + "fastrand", + "phf_shared", +] + +[[package]] +name = "phf_macros" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d713258393a82f091ead52047ca779d37e5766226d009de21696c4e667044368" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" +dependencies = [ + "siphasher", ] [[package]] @@ -901,7 +973,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -997,6 +1069,12 @@ version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" 
+[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + [[package]] name = "smallvec" version = "1.15.1" @@ -1036,7 +1114,7 @@ dependencies = [ "getrandom 0.3.3", "once_cell", "rustix", - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -1482,7 +1560,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys", + "windows-sys 0.59.0", ] [[package]] @@ -1556,7 +1634,16 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.2", ] [[package]] @@ -1565,14 +1652,30 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c66f69fcc9ce11da9966ddb31a40968cad001c5bedeb5c2b82ede4253ab48aef" +dependencies = [ + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] [[package]] @@ -1581,48 +1684,96 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + 
[[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + [[package]] name = "winnow" version = "0.7.11" diff --git a/Cargo.toml b/Cargo.toml index b1ed089f..6a4d1e22 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "nyx-scanner" -version = "0.1.1" +version = "0.2.0-alpha" edition = "2024" description = "A CLI security scanner for automating vulnerability checks" license = 
"GPL-3.0" @@ -49,10 +49,13 @@ tree-sitter-ruby = "0.23.1" crossbeam-channel = "0.5.15" blake3 = "1.8.2" once_cell = "1.21.3" -console = "0.15.11" +console = "0.16.0" rayon = "1.10.0" r2d2 = "0.8.10" bytesize = "2.0.1" chrono = { version = "0.4.41", default-features = false, features = ["std", "clock"] } thiserror = "2.0.12" dashmap = "7.0.0-rc2" +petgraph = "0.8.2" +bitflags = "2.9.1" +phf = { version = "0.12.1", features = ["macros"] } diff --git a/README.md b/README.md index 0d78268d..401ea902 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,11 @@
nyx logo -# Nyx - **Fast, cross-language cli vulnerability scanner.** [![crates.io](https://img.shields.io/crates/v/nyx-scanner.svg)](https://crates.io/crates/nyx-scanner) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) -[![Rust](https://img.shields.io/badge/rust-1.70+-orange.svg)](https://www.rust-lang.org) +[![Rust 1.85+](https://img.shields.io/badge/rust-1.85%2B-orange)](https://www.rust-lang.org) [![CI](https://img.shields.io/github/actions/workflow/status/ecpeter23/nyx/ci.yml?branch=master)](https://github.com/ecpeter23/nyx/actions)
@@ -18,7 +16,7 @@ **Nyx** is a lightweight lightning-fast Rust‑native command‑line tool that detects potentially dangerous code patterns across several programming languages. It combines the accuracy of [`tree‑sitter`](https://tree-sitter.github.io/) parsing with a curated rule set and an optional SQLite‑backed index to deliver fast, repeatable scans on projects of any size. > **Project status – Alpha** -> Nyx is under active development. The public interface, rule set, and output formats may change without notice while we stabilize the core. Please pin exact versions in production environments. +> Nyx is under active development. The public interface, rule set, and output formats may change without notice while we stabilise the core. The new CFG + taint engine is experimental and Rust-only for now – please report any crashes or false positives. Pin exact versions in production environments. --- @@ -50,17 +48,49 @@ ## Installation +### Install crate +```bash +$ cargo install nyx-scanner +``` + +### Install GitHub release +1. Navigate to the [Releases](https://github.com/ecpeter23/nyx/releases) page of the repository. +2. Download the appropriate binary for your system: + + ```nyx-x86_64-unknown-linux-gnu.zip``` for Linux + + ```nyx-x86_64-pc-windows-msvc.zip``` for Windows + + ```nyx-x86_64-apple-darwin.zip``` or ```nyx-aarch64-apple-darwin.zip``` for macOS (Intel or Apple Silicon) + +3. Unzip the file and move the executable to a directory in your system PATH: + ```bash + # Example for Unix systems + unzip nyx-x86_64-unknown-linux-gnu.zip + chmod +x nyx + sudo mv nyx /usr/local/bin/ + ``` + ```bash + # Example for Windows in PowerShell + Expand-Archive -Path nyx-x86_64-pc-windows-msvc.zip -DestinationPath . + Move-Item -Path .\nyx.exe -Destination "C:\Program Files\Nyx\" # Add to PATH manually if needed + ``` + +4. 
Verify the installation: + ```bash + nyx --version + ``` ### Build from source ```bash -$ git clone https://github.com//nyx.git +$ git clone https://github.com/ecpeter23/nyx.git $ cd nyx $ cargo build --release # optional – copy the binary into PATH $ cargo install --path . ``` -Nyx targets **stable Rust 1.78 or later**. +Nyx targets **stable Rust 1.85 or later**. --- @@ -142,18 +172,29 @@ A fully documented `nyx.conf` is generated automatically on first run. ## Roadmap -| Area | Planned Improvements | -|-----------------------|---------------------------------------------------------------------------| -| More language support | Plans to create rule sets for over 100 languages for maximum coverage | -| Control‑flow analysis | Generation of CFGs for deeper reasoning about execution paths | -| Taint tracking | Intra‑ / inter‑procedural tracing of untrusted data from sources to sinks | -| Output formats | Full SARIF 2.1.0, JUnit XML, HTML report generator | -| Rule updates | Remote rule feed with signature verification | +| Area | Planned Improvements | +|-----------------------|-------------------------------------------------------------------------------------------------------| +| More language support | Plans to create rule sets for over 100 languages for maximum coverage | +| Control‑flow analysis | Inter‑procedural function summaries. Cap label propagation & bit‑flag checks. Loop/branch sensitivity | +| Taint tracking | Intra‑ / inter‑procedural tracing of untrusted data from sources to sinks | +| Output formats | Full SARIF 2.1.0, JUnit XML, HTML report generator | +| Rule updates | Remote rule feed with signature verification | +| Performance & UX | Incremental CFG cache, progress‑bar UX, smart file‑watch re‑scan | Community feedback will help shape priorities; please open an issue to discuss proposed changes. --- +## Experimental Features & Feedback + +The new Rust intra‑procedural CFG + taint engine is enabled by default. 
+ +Expect rough edges: slightly slower scans, occasional false positives, limited language coverage. + +Please open an issue for every crash, panic, or suspicious result – attach the minimal code snippet and mention the Nyx version. + +--- + ## Contributing Pull requests are welcome. To contribute: diff --git a/default-nyx.conf b/default-nyx.conf index 9a6d32e7..e996772a 100644 --- a/default-nyx.conf +++ b/default-nyx.conf @@ -8,6 +8,10 @@ [scanner] +## Analysis mode: `full` runs both AST pattern rules and CFG taint analysis +## Possible values: full | ast | cfg +mode = "full" + ## Minimum severity level to include in the report ## Possible values: Low | Medium | High | Critical min_severity = "Low" @@ -96,6 +100,9 @@ batch_size = 100 ## Channel capacity multiplier (capacity = threads × this) channel_multiplier = 4 +## Maximum stack size for Rayon threads (bytes) +rayon_thread_stack_size = 8388608 # 8 MiB (8 * 1024 * 1024) + ## Timeout on individual files (seconds); null = none (UNIMPLEMENTED) scan_timeout_secs = null diff --git a/examples/sanatize/example.rs b/examples/sanatize/example.rs new file mode 100644 index 00000000..c01f2923 --- /dev/null +++ b/examples/sanatize/example.rs @@ -0,0 +1,96 @@ +//! example.rs — realistic taint-tracking playground +//! `cargo add html-escape shell-escape` before compiling. 
+ +use std::{env, process::Command, fs}; + +#[derive(Default)] +struct UserCtx { + query: String, // potentially tainted + sanitized: String, // should remain clean +} + +/// ---------- helper wrappers so we get nice Source / Sink labels ---------- +fn source_env(var: &str) -> String { + env::var(var).unwrap_or_default() // Source(env-var) +} + +fn source_file(path: &str) -> String { + fs::read_to_string(path).unwrap_or_default() // Source(file-io) +} + +fn sink_shell(arg: &str) { + Command::new("sh").arg(arg).status().unwrap(); // Sink(process-spawn) +} + +fn sink_html(out: &str) { + println!("{out}"); // Sink(html-out) +} + +fn sanitize_html(s: &str) -> String { + html_escape::encode_safe(s) // Sanitizer(html-escape) +} + +fn sanitize_shell(s: &str) -> String { + shell_escape::unix::escape(s.into()).into_owned() // Sanitizer(shell-escape) +} + +/// ---------- 1. Main demo fuction ---------- +fn main() { + // FLOW A ──────────────────────────────────────────────────────────────── + // env → sanitized → safe shell + let raw = source_env("USER_CMD"); + let clean = sanitize_shell(&raw); + sink_shell(&clean); // EXPECT: SAFE + + // FLOW B ──────────────────────────────────────────────────────────────── + // env → if-else, only one branch escapes + let arg = source_env("ANOTHER"); + if arg.len() > 5 { + sink_shell(&arg); // EXPECT: UNSAFE (branch tainted) + } else { + let escaped = sanitize_shell(&arg); + sink_shell(&escaped); // safe + } + + // FLOW C ──────────────────────────────────────────────────────────────── + // file → while loop → HTML sanitizer cleared + let mut data = source_file("/tmp/input.txt"); + while data.len() < 32 { + data.push('x'); + } + let html_ok = sanitize_html(&data); + sink_html(&html_ok); // safe + + // FLOW D ──────────────────────────────────────────────────────────────── + // file → struct field → match → unsanitised HTML + let mut ctx = UserCtx::default(); + ctx.query = source_file("/tmp/q.txt"); + // overwrite the clean field; 
`ctx.sanitized` is *not* tainted + ctx.sanitized = sanitize_html("constant"); + match ctx { + UserCtx { query, sanitized } if query.contains("DROP") => { + sink_html(&query); // EXPECT: UNSAFE + } + _ => { + sink_html(&ctx.sanitized); // safe + } + } + + // FLOW E ──────────────────────────────────────────────────────────────── + // source → function call → reassignment clears taint + let mut name = source_env("USER"); // tainted + greet(&name); // just prints + name = "anonymous".into(); // kills taint + greet(&name); // safe + + // FLOW F ──────────────────────────────────────────────────────────────── + // Multiple sanitizers, only the *right* one matters + let cmd = source_env("MIXED"); + let partly = sanitize_html(&cmd); // wrong sanitizer + sink_shell(&partly); // EXPECT: UNSAFE +} + +/// helper (non-sink) function +fn greet(who: &str) { + println!("Hello, {who}"); +} \ No newline at end of file diff --git a/examples/standard/test.rs b/examples/standard/test.rs new file mode 100644 index 00000000..ff89b18e --- /dev/null +++ b/examples/standard/test.rs @@ -0,0 +1,9 @@ +use std::{env, process::Command}; +fn main() { + let y = env::var("SAFE").unwrap(); + + let x = env::var("DANGEROUS").unwrap(); + let clean = html_escape::encode_safe(&y); + Command::new("sh").arg(x).status().unwrap(); + Command::new("sh").arg(clean).status().unwrap(); +} \ No newline at end of file diff --git a/src/ast.rs b/src/ast.rs index 6b3ac445..6fdadb12 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -1,5 +1,8 @@ +use crate::cfg::{analyse_function, build_cfg}; use crate::commands::scan::Diag; use crate::errors::{NyxError, NyxResult}; +use crate::patterns::Severity; +use crate::utils::config::AnalysisMode; use crate::utils::ext::lowercase_ext; use crate::utils::{Config, query_cache}; use std::cell::RefCell; @@ -10,6 +13,16 @@ thread_local! { static PARSER: RefCell = RefCell::new(tree_sitter::Parser::new()); } +/// Map a byte offset to the start `Point` (0-based row/column) of the smallest AST node covering it; falls back to (0, 0). 
+fn byte_offset_to_point(tree: &tree_sitter::Tree, byte: usize) -> tree_sitter::Point { + // `descendant_for_byte_range` gives us *some* node that starts at `byte`, + // `start_position` turns that into rows & columns (both 0-based) + tree.root_node() + .descendant_for_byte_range(byte, byte) + .map(|n| n.start_position()) + .unwrap_or_else(|| tree_sitter::Point { row: 0, column: 0 }) +} + pub(crate) fn run_rules_on_file(path: &Path, cfg: &Config) -> NyxResult> { tracing::debug!("Running rules on: {}", path.display()); let bytes = std::fs::read(path)?; @@ -47,30 +60,58 @@ pub(crate) fn run_rules_on_file(path: &Path, cfg: &Config) -> NyxResult, // taint classification if any + pub defines: Option, // variable written by this stmt + pub uses: Vec, // variables read +} + +pub type Cfg = Graph; + +// ------------------------------------------------------------------------- +// Utility helpers +// ------------------------------------------------------------------------- + +/// Return the text of a node. +#[inline] +fn text_of<'a>(n: Node<'a>, code: &'a [u8]) -> Option { + std::str::from_utf8(&code[n.start_byte()..n.end_byte()]) + .ok() + .map(|s| s.to_string()) +} + +/// Return the callee identifier for the first call / method / macro inside `n`. 
+fn first_call_ident<'a>(n: Node<'a>, lang: &str, code: &'a [u8]) -> Option { + let mut cursor = n.walk(); + for c in n.children(&mut cursor) { + match lookup(lang, c.kind()) { + Kind::CallFn | Kind::CallMethod | Kind::CallMacro => { + // Re-use the same logic we have in `push_node` + return match lookup(lang, c.kind()) { + Kind::CallFn => c + .child_by_field_name("function") + .and_then(|f| text_of(f, code)), + Kind::CallMethod => { + let func = c + .child_by_field_name("method") + .or_else(|| c.child_by_field_name("name")) + .and_then(|f| text_of(f, code)); + let recv = c + .child_by_field_name("object") + .and_then(|f| text_of(f, code)); + match (recv, func) { + (Some(r), Some(f)) => Some(format!("{r}::{f}")), + (_, Some(f)) => Some(f.to_string()), + _ => None, + } + } + Kind::CallMacro => c + .child_by_field_name("macro") + .and_then(|f| text_of(f, code)), + _ => None, + }; + } + _ => {} + } + } + None +} + +/// Create a node in one short borrow and optionally attach a taint label. +fn push_node<'a>( + g: &mut Cfg, + kind: StmtKind, + ast: Node<'a>, + lang: &str, + code: &'a [u8], +) -> NodeIndex { + /* ── 1. 
IDENTIFIER EXTRACTION ─────────────────────────────────────── */ + + // Primary guess (varies by AST kind) + let mut text = match lookup(lang, ast.kind()) { + // plain `foo(bar)` style call + Kind::CallFn => ast + .child_by_field_name("function") + .and_then(|n| text_of(n, code)) + .unwrap_or_default(), + + // method / UFCS call `recv.method()` or `Type::func()` + Kind::CallMethod => { + let func = ast + .child_by_field_name("method") + .or_else(|| ast.child_by_field_name("name")) + .and_then(|n| text_of(n, code)); + let recv = ast + .child_by_field_name("object") + .and_then(|n| text_of(n, code)); + match (recv, func) { + (Some(r), Some(f)) => format!("{r}::{f}"), + (_, Some(f)) => f, + _ => String::new(), + } + } + + // `my_macro!(…)` + Kind::CallMacro => ast + .child_by_field_name("macro") + .and_then(|n| text_of(n, code)) + .unwrap_or_default(), + + // everything else – fallback to raw slice + _ => text_of(ast, code).unwrap_or_default(), + }; + + // If this is a `let` or `expression_statement` that *contains* a call, + // prefer the first inner call identifier instead of the whole line. + if matches!(lookup(lang, ast.kind()), Kind::CallWrapper) { + if let Some(inner) = first_call_ident(ast, lang, code) { + text = inner; + } + } + + /* ── 2. LABEL LOOK-UP ───────────────────────────────────────────── */ + + let label = classify(lang, &text); + let span = (ast.start_byte(), ast.end_byte()); + + /* ── 3. GRAPH INSERTION + DEBUG ──────────────────────────────────── */ + + let (defines, uses) = def_use(ast, code); + + let idx = g.add_node(NodeInfo { + kind, + span, + label, + defines, + uses, + }); + + debug!( + target: "cfg", + "node {} ← {:?} txt=`{}` span={:?} label={:?}", + idx.index(), + kind, + text, + span, + label + ); + idx +} + +/// Add the same edge (of the same kind) from every node in `froms` to `to`. 
+#[inline] +fn connect_all(g: &mut Cfg, froms: &[NodeIndex], to: NodeIndex, kind: EdgeKind) { + for &f in froms { + debug!(target: "cfg", "edge {} → {} ({:?})", f.index(), to.index(), kind); + g.add_edge(f, to, kind); + } +} + +// ------------------------------------------------------------------------- +// The recursive *work‑horse* that converts an AST node into a CFG slice. +// Returns the set of *exit* nodes that need to be wired further. +// ------------------------------------------------------------------------- +fn build_sub<'a>( + ast: Node<'a>, + preds: &[NodeIndex], // predecessor frontier + g: &mut Cfg, + lang: &str, + code: &'a [u8], +) -> Vec { + match lookup(lang, ast.kind()) { + // ───────────────────────────────────────────────────────────────── + // IF‑/ELSE: two branches that re‑merge afterwards + // ───────────────────────────────────────────────────────────────── + Kind::If => { + // Condition node + let cond = push_node(g, StmtKind::If, ast, lang, code); + connect_all(g, preds, cond, EdgeKind::Seq); + + // Locate then & else blocks + let (then_block, else_block) = { + let mut cursor = ast.walk(); + let blocks: Vec<_> = ast + .children(&mut cursor) + .filter(|n| n.kind() == "block") + .collect(); + (blocks.first().copied(), blocks.get(1).copied()) + }; + + // THEN branch + let then_exits = if let Some(b) = then_block { + let exits = build_sub(b, &[cond], g, lang, code); + // True edges leave the condition + if let Some(&first) = exits.first() { + connect_all(g, &[cond], first, EdgeKind::True); + } + exits + } else { + vec![cond] + }; + + // ELSE branch + let else_exits = if let Some(b) = else_block { + let exits = build_sub(b, &[cond], g, lang, code); + if let Some(&first) = exits.first() { + connect_all(g, &[cond], first, EdgeKind::False); + } + exits + } else { + // No explicit else → non-taken branch flows to the *then* exits + if let Some(&first) = then_exits.first() { + connect_all(g, &[cond], first, EdgeKind::False); + } + 
then_exits.clone() + }; + + // Frontier = union of both branches + then_exits.into_iter().chain(else_exits).collect() + } + + Kind::InfiniteLoop => { + // Synthetic header node + let header = push_node(g, StmtKind::Loop, ast, lang, code); + connect_all(g, preds, header, EdgeKind::Seq); + + // The body is the single `block` child + let body = ast.child_by_field_name("body").expect("loop without body"); + let body_exits = build_sub(body, &[header], g, lang, code); + + // Back-edge from every linear exit to header + for &e in &body_exits { + connect_all(g, &[e], header, EdgeKind::Back); + } + // `loop` may break → those exits are frontiers too + body_exits.into_iter().chain([header]).collect() + } + + // ───────────────────────────────────────────────────────────────── + // WHILE / FOR: classic loop with a back edge. + // ───────────────────────────────────────────────────────────────── + Kind::While | Kind::For => { + let header = push_node(g, StmtKind::Loop, ast, lang, code); + connect_all(g, preds, header, EdgeKind::Seq); + + // Body = first (and usually only) block child. + let body = ast + .child_by_field_name("body") + .or_else(|| { + let mut c = ast.walk(); + ast.children(&mut c).find(|n| n.kind() == "block") + }) + .expect("loop without body"); + + let body_exits = build_sub(body, &[header], g, lang, code); + + // Back‑edge for every linear exit → header. + for &e in &body_exits { + connect_all(g, &[e], header, EdgeKind::Back); + } + // Falling out of the loop = header’s false branch. + vec![header] + } + + // ───────────────────────────────────────────────────────────────── + // Control-flow sinks (return / break / continue). 
+ // ───────────────────────────────────────────────────────────────── + Kind::Return => { + let ret = push_node(g, StmtKind::Return, ast, lang, code); + connect_all(g, preds, ret, EdgeKind::Seq); + Vec::new() // terminates this path + } + Kind::Break => { + let brk = push_node(g, StmtKind::Break, ast, lang, code); + connect_all(g, preds, brk, EdgeKind::Seq); + Vec::new() + } + Kind::Continue => { + let cont = push_node(g, StmtKind::Continue, ast, lang, code); + connect_all(g, preds, cont, EdgeKind::Seq); + Vec::new() + } + + // ───────────────────────────────────────────────────────────────── + // BLOCK: statements execute sequentially + // ───────────────────────────────────────────────────────────────── + Kind::SourceFile | Kind::Block => { + let mut cursor = ast.walk(); + let mut frontier = preds.to_vec(); + for child in ast.children(&mut cursor) { + frontier = build_sub(child, &frontier, g, lang, code); + } + frontier + } + + // Function item – create a header and dive into its body + Kind::Function => { + let header = push_node(g, StmtKind::Seq, ast, lang, code); + connect_all(g, preds, header, EdgeKind::Seq); + + if let Some(body) = ast.child_by_field_name("body") { + build_sub(body, &[header], g, lang, code) + } else { + vec![header] // declaration w/o body + } + } + + // Statements that **may** contain a call --------------------------------- + Kind::CallWrapper => { + let mut cursor = ast.walk(); + + if let Some(inner) = ast.children(&mut cursor).find(|c| { + matches!( + lookup(lang, c.kind()), + Kind::InfiniteLoop | Kind::While | Kind::For | Kind::If + ) + }) { + return build_sub(inner, preds, g, lang, code); + } + + let has_call = ast.children(&mut cursor).any(|c| { + matches!( + lookup(lang, c.kind()), + Kind::CallFn | Kind::CallMethod | Kind::CallMacro + ) + }); + + let kind = if has_call { + StmtKind::Call + } else { + StmtKind::Seq + }; + let node = push_node(g, kind, ast, lang, code); + connect_all(g, preds, node, EdgeKind::Seq); + vec![node] + } + 
+ // Trivia we drop completely --------------------------------------------- + // "line_comment" | "block_comment" + // | ";" | "," | "(" | ")" | "{" | "}" | "\n" + // | "use_declaration" + // | "attribute_item" + // | "mod_item" | "type_item" + Kind::Trivia => preds.to_vec(), + + // ───────────────────────────────────────────────────────────────── + // Every other node = simple sequential statement + // ───────────────────────────────────────────────────────────────── + _ => { + let n = push_node(g, StmtKind::Seq, ast, lang, code); + connect_all(g, preds, n, EdgeKind::Seq); + vec![n] + } + } +} + +// ------------------------------------------------------------------------- +// === PUBLIC ENTRY POINT ================================================= +// ------------------------------------------------------------------------- + +/// Build an intraprocedural CFG and return (graph, entry_node). +/// +/// * Walks the Tree‑Sitter AST. +/// * Creates `StmtKind::*` nodes only for *statement‑level* constructs to keep +/// the graph compact. +/// * Wires a synthetic `Entry` node in front and a synthetic `Exit` node after +/// all real sinks. +pub(crate) fn build_cfg<'a>(tree: &'a Tree, code: &'a [u8], lang: &str) -> (Cfg, NodeIndex) { + debug!(target: "cfg", "Building CFG for {:?}", tree.root_node()); + + let mut g: Cfg = Graph::with_capacity(128, 256); + let entry = g.add_node(NodeInfo { + kind: StmtKind::Entry, + span: (0, 0), + label: None, + defines: None, + uses: Vec::new(), + }); + let exit = g.add_node(NodeInfo { + kind: StmtKind::Exit, + span: (code.len(), code.len()), + label: None, + defines: None, + uses: Vec::new(), + }); + + // Build the body below the synthetic ENTRY. + let exits = build_sub(tree.root_node(), &[entry], &mut g, lang, code); + + // Wire every real exit to our synthetic EXIT node. 
+ for e in exits { + connect_all(&mut g, &[e], exit, EdgeKind::Seq); + } + + debug!(target: "cfg", "CFG DONE — nodes: {}, edges: {}", g.node_count(), g.edge_count()); + + if cfg!(debug_assertions) { + // List every node + for idx in g.node_indices() { + debug!(target: "cfg", " node {:>3}: {:?}", idx.index(), g[idx]); + } + // List every edge + for e in g.edge_references() { + debug!( + target: "cfg", + " edge {:>3} → {:<3} ({:?})", + e.source().index(), + e.target().index(), + e.weight() + ); + } + + // Reachability check + let mut reachable: HashSet = Default::default(); + let mut bfs = Bfs::new(&g, entry); + while let Some(nx) = bfs.next(&g) { + reachable.insert(nx); + } + debug!( + target: "cfg", + "reachable nodes: {}/{}", + reachable.len(), + g.node_count() + ); + if reachable.len() != g.node_count() { + let unreachable: Vec<_> = g + .node_indices() + .filter(|i| !reachable.contains(i)) + .collect(); + debug!(target: "cfg", "‼︎ unreachable nodes: {:?}", unreachable); + } + + // (Optional) Dominator tree sanity check + let doms: Dominators<_> = simple_fast(&g, entry); + debug!(target: "cfg", "dominator tree computed (len = {:?})", doms); + } + + (g, entry) +} + +/* ---------- TAINT-ANALYSIS PASSES ---------- */ +/// Recursively collect every identifier that occurs inside `n`. +fn collect_idents(n: Node, code: &[u8], out: &mut Vec) { + if n.kind() == "identifier" { + if let Some(txt) = text_of(n, code) { + out.push(txt); + } + } else { + let mut c = n.walk(); + for ch in n.children(&mut c) { + collect_idents(ch, code, out); + } + } +} + +/// Return `(defines, uses)` for the AST fragment `ast`. 
+fn def_use(ast: Node, code: &[u8]) -> (Option, Vec) { + match ast.kind() { + // `let = ;` + "let_declaration" => { + let mut defs = None; + let mut uses = Vec::new(); + + if let Some(pat) = ast.child_by_field_name("pattern") { + // first identifier inside the pattern = variable name + let mut tmp = Vec::::new(); + collect_idents(pat, code, &mut tmp); + defs = tmp.into_iter().next(); + } + if let Some(val) = ast.child_by_field_name("value") { + collect_idents(val, code, &mut uses); + } + (defs, uses) + } + + // Plain assignment `x = y + z` + "assignment_expression" => { + let mut defs = None; + let mut uses = Vec::new(); + if let Some(lhs) = ast.child_by_field_name("left") { + let mut tmp = Vec::::new(); + collect_idents(lhs, code, &mut tmp); + defs = tmp.pop(); + } + if let Some(rhs) = ast.child_by_field_name("right") { + collect_idents(rhs, code, &mut uses); + } + (defs, uses) + } + + // everything else – no definition, but may read vars + _ => { + let mut uses = Vec::new(); + collect_idents(ast, code, &mut uses); + (None, uses) + } + } +} + +fn set_hash(s: &HashSet) -> u64 { + let mut v: Vec<_> = s.iter().collect(); + v.sort(); // deterministic + let mut h = DefaultHasher::new(); + v.hash(&mut h); + h.finish() +} + +fn apply_taint(node: &NodeInfo, taint: &HashSet) -> HashSet { + let mut out = taint.clone(); + + match node.label { + // A new untrusted value enters the program + Some(DataLabel::Source(_)) => { + if let Some(d) = &node.defines { + out.insert(d.clone()); + } + } + // Anything written by a sanitizer becomes clean – whatever its + // arguments were is irrelevant here. 
+ Some(DataLabel::Sanitizer(_)) => { + if let Some(d) = &node.defines { + out.remove(d); + } + } + + // A function call *returning* tainted/clean data ---------------------- + // (`let v = source_*()` or `let v = sanitize_*(x)`) + _ if node.kind == StmtKind::Call => { + if let Some(d) = &node.defines { + match node.label { + Some(DataLabel::Source(_)) => { + out.insert(d.clone()); + } // gen + Some(DataLabel::Sanitizer(_)) => { + out.remove(d); + } // kill + _ => { /* NOTE(review): nothing runs "below" - match arms do not fall through, so such a call leaves `out` unchanged */ } + } + } + } + + // All other statements: classic gen/kill for assignments + _ => { + if let Some(d) = &node.defines { + let rhs_tainted = node.uses.iter().any(|u| out.contains(u)); + if rhs_tainted { + out.insert(d.clone()); + } else { + out.remove(d); + } + } + } + } + + out +} + +pub fn analyse_function(cfg: &Cfg, entry: NodeIndex) -> Vec> { + use std::collections::{HashMap, HashSet, VecDeque}; + + /// Queue item: current CFG node + taint set that holds here + #[derive(Clone)] + struct Item { + node: NodeIndex, + taint: HashSet, + } + + // (node, taint_hash) → predecessor key (for path rebuild) + type Key = (NodeIndex, u64); + let mut pred: HashMap = HashMap::new(); + + // Seen states so we do not revisit them infinitely + let mut seen: HashSet = HashSet::new(); + + // Resulting Source→Sink paths + let mut findings: Vec> = Vec::new(); + + let mut q = VecDeque::new(); + q.push_back(Item { + node: entry, + taint: HashSet::new(), + }); + seen.insert((entry, 0)); + + while let Some(Item { node, taint }) = q.pop_front() { + let updated = apply_taint(&cfg[node], &taint); // step effect + + /* ---------- SINK CHECK ---------- */ + if let Some(DataLabel::Sink(_)) = cfg[node].label { + if cfg[node].uses.iter().any(|u| updated.contains(u)) { + // reconstruct path back to *any* Source + let mut p: Vec = vec![node]; + let mut k = (node, set_hash(&taint)); // predecessor key + + while let Some(&(prev, _)) = pred.get(&k) { + p.push(prev); + if matches!(cfg[prev].label,
Some(DataLabel::Source(_))) { + break; + } + // climb further + let prev_hash = pred.get(&k).map(|(_, h)| *h).unwrap_or(0); + k = (prev, prev_hash); + } + p.reverse(); + findings.push(p); + } + } + + /* ---------- BFS successor step ---------- */ + for succ in cfg.neighbors(node) { + let key = (succ, set_hash(&updated)); + if !seen.contains(&key) { + seen.insert(key); + pred.insert(key, (node, set_hash(&taint))); + q.push_back(Item { + node: succ, + taint: updated.clone(), + }); + } + } + } + + findings +} + +#[test] +fn env_to_arg_is_flagged() { + use tree_sitter::Language; + let src = br#" + use std::env; use std::process::Command; + fn main() { + let x = env::var("DANGEROUS_ARG").unwrap(); + Command::new("sh").arg(x).status().unwrap(); + }"#; + + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) + .unwrap(); + let tree = parser.parse(src as &[u8], None).unwrap(); + + let (cfg, entry) = build_cfg(&tree, src, "rust"); + let findings = analyse_function(&cfg, entry); + + assert_eq!(findings.len(), 1); // exactly one unsanitised Source→Sink +} + +#[test] +fn taint_through_if_else() { + use tree_sitter::Language; + let src = br#" + use std::env; use std::process::Command; + fn main() { + let x = env::var("DANGEROUS").unwrap(); + let safe = html_escape::encode_safe(&x); + + if x.len() > 5 { + Command::new("sh").arg(&x).status().unwrap(); // UNSAFE + } else { + Command::new("sh").arg(&safe).status().unwrap(); // SAFE + } + }"#; + + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) + .unwrap(); + let tree = parser.parse(src as &[u8], None).unwrap(); + + let (cfg, entry) = build_cfg(&tree, src, "rust"); + let findings = analyse_function(&cfg, entry); + + // exactly one path (via the True branch) should be flagged + assert_eq!(findings.len(), 1); +} + +#[test] +fn taint_through_while_loop() { + use tree_sitter::Language; + let src = br#" + use 
std::{env, process::Command}; + fn main() { + let mut x = env::var("DANGEROUS").unwrap(); + while x.len() < 100 { // Loop header (Loop) + x.push_str("a"); + } + Command::new("sh").arg(x).status().unwrap(); // Should be flagged + }"#; + + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) + .unwrap(); + let tree = parser.parse(src as &[u8], None).unwrap(); + + let (cfg, entry) = build_cfg(&tree, src, "rust"); + let findings = analyse_function(&cfg, entry); + assert_eq!(findings.len(), 1); +} + +#[test] +fn taint_killed_by_sanitizer() { + use tree_sitter::Language; + let src = br#" + use std::{env, process::Command}; + fn main() { + let x = env::var("DANGEROUS").unwrap(); + let clean = html_escape::encode_safe(&x); // sanitizer node + Command::new("sh").arg(clean).status().unwrap(); // SAFE + }"#; + + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) + .unwrap(); + let tree = parser.parse(src as &[u8], None).unwrap(); + + let (cfg, entry) = build_cfg(&tree, src, "rust"); + let findings = analyse_function(&cfg, entry); + assert!(findings.is_empty()); +} + +#[test] +fn taint_breaks_out_of_loop() { + use tree_sitter::Language; + let src = br#" + use std::{env, process::Command}; + fn main() { + loop { + let x = env::var("DANGEROUS").unwrap(); + Command::new("sh").arg(&x).status().unwrap(); // vulnerable + break; + } + }"#; + + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) + .unwrap(); + let tree = parser.parse(src as &[u8], None).unwrap(); + + let (cfg, entry) = build_cfg(&tree, src, "rust"); + let findings = analyse_function(&cfg, entry); + assert_eq!(findings.len(), 1); +} + +#[test] +fn test_two_sources() { + use tree_sitter::Language; + let src = br#" + use std::{env, process::Command}; + fn main() { + let x = env::var("DANGEROUS").unwrap(); + let y = 
env::var("SAFE").unwrap(); + let clean = html_escape::encode_safe(&y); + Command::new("sh").arg(x).status().unwrap(); + Command::new("sh").arg(clean).status().unwrap(); + }"#; + + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) + .unwrap(); + let tree = parser.parse(src as &[u8], None).unwrap(); + + let (cfg, entry) = build_cfg(&tree, src, "rust"); + let findings = analyse_function(&cfg, entry); + assert_eq!(findings.len(), 1); +} + +#[test] +fn test_should_not_panic_on_empty_function() { + use tree_sitter::Language; + let src = br#" + use std::{env, process::Command}; + fn f() { + if cond() { + return; + } + do_something(); + }"#; + + let mut parser = tree_sitter::Parser::new(); + parser + .set_language(&Language::from(tree_sitter_rust::LANGUAGE)) + .unwrap(); + let tree = parser.parse(src as &[u8], None).unwrap(); + + let (cfg, entry) = build_cfg(&tree, src, "rust"); + let findings = analyse_function(&cfg, entry); + assert!(findings.is_empty()); +} diff --git a/src/cli.rs b/src/cli.rs index 9b03018b..19a1df0f 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -32,6 +32,15 @@ pub enum Commands { /// Show only high severity issues #[arg(long)] high_only: bool, + + #[arg(long)] + ast_only: bool, + + #[arg(long)] + cfg_only: bool, + + #[arg(long)] + all_targets: bool, }, /// Manage project indexes diff --git a/src/commands/clean.rs b/src/commands/clean.rs index 336de2f0..6932ae40 100644 --- a/src/commands/clean.rs +++ b/src/commands/clean.rs @@ -12,7 +12,7 @@ pub fn handle(project: Option, all: bool, config_dir: &std::path::Path) } println!("{}", style("✔ All indexes cleaned").green().bold()); } else if let Some(proj_name) = project { - let db_path = config_dir.join(format!("{}.sqlite", proj_name)); + let db_path = config_dir.join(format!("{proj_name}.sqlite")); if db_path.exists() { fs::remove_file(&db_path)?; println!( diff --git a/src/commands/mod.rs b/src/commands/mod.rs index eb5ef7a4..e3f8236e 100644 
--- a/src/commands/mod.rs +++ b/src/commands/mod.rs @@ -6,7 +6,7 @@ pub mod scan; use crate::cli::Commands; use crate::errors::NyxResult; use crate::patterns::Severity; -use crate::utils::config::Config; +use crate::utils::config::{AnalysisMode, Config}; use std::path::Path; pub fn handle_command( @@ -21,11 +21,26 @@ pub fn handle_command( rebuild_index, format, high_only, + ast_only, + cfg_only, + all_targets, } => { if high_only { config.scanner.min_severity = Severity::High }; + if ast_only { + config.scanner.mode = AnalysisMode::Ast + }; + + if cfg_only { + config.scanner.mode = AnalysisMode::Taint + }; + + if all_targets { + config.scanner.mode = AnalysisMode::Full + }; + scan::handle(&path, no_index, rebuild_index, format, database_dir, config) } Commands::Index { action } => index::handle(action, database_dir, config), diff --git a/src/commands/scan.rs b/src/commands/scan.rs index 00b46d90..098bf3d3 100644 --- a/src/commands/scan.rs +++ b/src/commands/scan.rs @@ -68,7 +68,7 @@ pub fn handle( println!("{}", style(path).blue().underlined()); for d in issues { println!( - " {:>4}:{:<4} [{}] {}", + " {:>4}:{:<4} [{:}] {:}", d.line, d.col, d.severity, @@ -145,6 +145,17 @@ pub fn scan_with_index_parallel( } else { idx.get_issues_from_file(&path).unwrap_or_default() }; + + match cfg.scanner.mode { + crate::utils::config::AnalysisMode::Ast => { + diags.retain(|d| !d.id.starts_with("taint")); + } + crate::utils::config::AnalysisMode::Taint => { + diags.retain(|d| d.id.starts_with("taint")); + } + crate::utils::config::AnalysisMode::Full => {} + } + if !diags.is_empty() { diag_map .entry(path.to_string_lossy().to_string()) diff --git a/src/database.rs b/src/database.rs index f4d5ab0e..c647669d 100644 --- a/src/database.rs +++ b/src/database.rs @@ -16,28 +16,35 @@ pub mod index { const SCHEMA: &str = r#" PRAGMA foreign_keys = ON; - CREATE TABLE IF NOT EXISTS files ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - project TEXT NOT NULL, - path TEXT NOT NULL, - hash BLOB NOT 
NULL, - mtime INTEGER NOT NULL, + CREATE TABLE IF NOT EXISTS files (id INTEGER PRIMARY KEY AUTOINCREMENT, + project TEXT NOT NULL, + path TEXT NOT NULL, + hash BLOB NOT NULL, + mtime INTEGER NOT NULL, scanned_at INTEGER NOT NULL, UNIQUE(project, path) ); - CREATE TABLE IF NOT EXISTS issues ( - file_id INTEGER NOT NULL + CREATE TABLE IF NOT EXISTS issues (file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE, - rule_id TEXT NOT NULL, - severity TEXT NOT NULL, - line INTEGER NOT NULL, - col INTEGER NOT NULL, - PRIMARY KEY (file_id, rule_id, line, col) - ); + rule_id TEXT NOT NULL, + severity TEXT NOT NULL, + line INTEGER NOT NULL, + col INTEGER NOT NULL, + PRIMARY KEY (file_id, rule_id, line, col)); + + CREATE TABLE IF NOT EXISTS function_summaries (hash TEXT PRIMARY KEY, + project TEXT NOT NULL, + name TEXT NOT NULL, + lang TEXT NOT NULL, + summary TEXT NOT NULL, + updated_at INTEGER NOT NULL); "#; + // TODO: ADD CLEANS FOR EACH TABLE BASED ON PROJECT WHICH RUNS ON CLEAN + // TODO: ADD DROP AND GIVE A CLI PARAMETER FOR DROP + /// A single issue row, ready for insertion. 
#[derive(Debug, Clone)] pub struct IssueRow<'a> { @@ -189,6 +196,50 @@ pub mod index { Ok(issue_iter.filter_map(Result::ok).collect()) } + // pub fn upsert_summary( + // &mut self, + // project: &str, + // path: &Path, + // hash: &str, + // s: &crate::summary::FuncSummary, + // ) -> NyxResult<()> { + // let conn = self.c(); + // let now = chrono::Utc::now().timestamp_millis(); // i64 + // + // conn.execute( + // "INSERT INTO function_summaries (hash, project, name, lang, summary, updated_at) + // VALUES (?1, ?2, ?3, ?4, ?5, ?6) + // ON CONFLICT(hash) DO UPDATE SET summary = excluded.summary, + // updated_at = excluded.updated_at", + // ( + // hash, + // project, + // &s.name, + // path.extension().and_then(|e| e.to_str()).unwrap_or_default(), + // serde_json::to_string(s).unwrap(), //TODO REPLACE UNWRAP + // now, + // ), + // )?; + // Ok(()) + // } + // + // pub fn load_all_summaries(&self, project: &str) -> NyxResult>> { + // let mut stmt = self + // .c() + // .prepare("SELECT summary FROM function_summaries WHERE project = ?1")?; + // + // let iter = stmt.query_map([project], |row| { + // let json: String = row.get(0)?; + // Ok(serde_json::from_str::(json.as_str()).unwrap()) // TODO: REPLACE UNWRAP + // })?; + // + // Ok(iter + // .collect::, _>>()? 
+ // .into_iter() + // .map(|s| unsafe { std::mem::transmute::<_, crate::summary::FuncSummary<'static>>(s) }) + // .collect()) + // } + /// gets files from the database pub fn get_files(&self, project: &str) -> NyxResult> { let mut stmt = self.c().prepare( @@ -214,6 +265,7 @@ pub mod index { DROP TABLE IF EXISTS issues; DROP TABLE IF EXISTS files; + DROP TABLE IF EXISTS function_summaries; PRAGMA foreign_keys = ON; VACUUM; diff --git a/src/labels/javascript.rs b/src/labels/javascript.rs new file mode 100644 index 00000000..b8b99c21 --- /dev/null +++ b/src/labels/javascript.rs @@ -0,0 +1,17 @@ +use crate::labels::{Cap, DataLabel, LabelRule}; + +// TODO: refactor this +pub static RULES: &[LabelRule] = &[ + LabelRule { + matchers: &["document.location", "window.location"], + label: DataLabel::Source(Cap::all()), + }, + LabelRule { + matchers: &["JSON.parse"], + label: DataLabel::Sanitizer(Cap::JSON_PARSE), + }, + LabelRule { + matchers: &["eval"], + label: DataLabel::Sink(Cap::SHELL_ESCAPE), + }, +]; diff --git a/src/labels/mod.rs b/src/labels/mod.rs new file mode 100644 index 00000000..48d81222 --- /dev/null +++ b/src/labels/mod.rs @@ -0,0 +1,121 @@ +mod javascript; +mod rust; + +use bitflags::bitflags; +use once_cell::sync::Lazy; +use phf::Map; +use std::collections::HashMap; + +/// A single rule: if the AST text equals (or ends with) one of the `matchers`, +/// the node gets `label`. +#[derive(Debug, Clone, Copy)] +pub struct LabelRule { + pub matchers: &'static [&'static str], + pub label: DataLabel, +} + +bitflags! 
{ + #[derive(Debug, Clone, Copy, PartialEq, Eq)] + pub struct Cap: u8 { + const ENV_VAR = 0b0000_0001; + const HTML_ESCAPE = 0b0000_0010; + const SHELL_ESCAPE = 0b0000_0100; + const URL_ENCODE = 0b0000_1000; + const JSON_PARSE = 0b0001_0000; + // ADD MORE + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Kind { + If, + InfiniteLoop, + While, + For, + LoopBody, + CallFn, + CallMethod, + CallMacro, + Break, + Continue, + Return, + Block, + SourceFile, + Function, + Assignment, + CallWrapper, + Trivia, + Other, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DataLabel { + Source(Cap), + Sanitizer(Cap), + Sink(Cap), +} + +static REGISTRY: Lazy> = Lazy::new(|| { + let mut m = HashMap::new(); + m.insert("rust", rust::RULES); + m.insert("rs", rust::RULES); + + m.insert("javascript", javascript::RULES); + m.insert("js", javascript::RULES); + + // add more languages in one line: + // m.insert("go", go::RULES); + + m +}); + +type FastMap = &'static Map<&'static str, Kind>; + +pub(crate) static CLASSIFIERS: Lazy> = Lazy::new(|| { + let mut m = HashMap::new(); + m.insert("rust", &rust::KINDS); + m.insert("rs", &rust::KINDS); + + // m.insert("javascript", &javascript::KINDS); + // m.insert("js", &javascript::KINDS); + + // todo: add more languages + m +}); + +#[inline(always)] +pub fn lookup(lang: &str, raw: &str) -> Kind { + CLASSIFIERS + .get(lang) + .and_then(|m| m.get(raw).copied()) + .unwrap_or(Kind::Other) +} + +/// Try to classify a piece of syntax text. +/// `lang` is the canonicalised language key (“rust”, “javascript”, …). 
+pub fn classify(lang: &str, text: &str) -> Option { + let key = lang.to_ascii_lowercase(); + let rules = REGISTRY.get(key.as_str())?; + let head = text.split(['(', '<']).next().unwrap_or(""); + + let text_lc = head.trim().to_ascii_lowercase(); + + for rule in *rules { + for raw in rule.matchers { + let m = raw.to_ascii_lowercase(); + + if m.ends_with('_') { + if text_lc.starts_with(&m) { + return Some(rule.label); + } + } else if text_lc.ends_with(&m) { + let start = text_lc.len() - m.len(); + let ok = start == 0 || matches!(text_lc.as_bytes()[start - 1], b'.' | b':'); + if ok { + return Some(rule.label); + } + } + } + } + None +} diff --git a/src/labels/rust.rs b/src/labels/rust.rs new file mode 100644 index 00000000..9a84dbad --- /dev/null +++ b/src/labels/rust.rs @@ -0,0 +1,72 @@ +use crate::labels::{Cap, DataLabel, Kind, LabelRule}; +use phf::{Map, phf_map}; + +pub static RULES: &[LabelRule] = &[ + // ─────────── Sources ─────────── + LabelRule { + matchers: &["std::env::var", "env::var"], + label: DataLabel::Source(Cap::all()), + }, + // ───────── Sanitizers ────────── + // `fn sanitize_*(&str) -> String` + LabelRule { + matchers: &["html_escape::encode_safe", "sanitize_", "sanitize_html"], + label: DataLabel::Sanitizer(Cap::HTML_ESCAPE), + }, + LabelRule { + matchers: &["shell_escape::unix::escape"], + label: DataLabel::Sanitizer(Cap::SHELL_ESCAPE), + }, + // ─────────── Sinks ───────────── + // All the key points where untrusted strings reach the OS shell. + LabelRule { + matchers: &[ + "command::new", + "std::process::command::new", + "command::arg", + "command::args", + "command::status", + "command::output", + ], + label: DataLabel::Sink(Cap::SHELL_ESCAPE), + }, +]; + +pub static KINDS: Map<&'static str, Kind> = phf_map! 
{ + // control-flow + "if_expression" => Kind::If, + "loop_expression" => Kind::InfiniteLoop, + "loop_statement" => Kind::LoopBody, + "while_statement" => Kind::While, + "for_statement" => Kind::For, + + "return_statement" => Kind::Return, + "break_expression" => Kind::Break, + "break_statement" => Kind::Break, + "continue_expression" => Kind::Continue, + "continue_statement" => Kind::Continue, + + // structure + "source_file" => Kind::SourceFile, + "block" => Kind::Block, + "function_item" => Kind::Function, + + // data-flow + "call_expression" => Kind::CallFn, + "method_call_expression" => Kind::CallMethod, + "macro_invocation" => Kind::CallMacro, + "let_declaration" => Kind::CallWrapper, + "expression_statement" => Kind::CallWrapper, + "assignment_expression" => Kind::Assignment, + + // trivia + "line_comment" => Kind::Trivia, + "block_comment" => Kind::Trivia, + ";" => Kind::Trivia, "," => Kind::Trivia, + "(" => Kind::Trivia, ")" => Kind::Trivia, + "{" => Kind::Trivia, "}" => Kind::Trivia, "\n" => Kind::Trivia, + "use_declaration" => Kind::Trivia, + "attribute_item" => Kind::Trivia, + "mod_item" => Kind::Trivia, + "type_item" => Kind::Trivia, +}; diff --git a/src/main.rs b/src/main.rs index d1871a55..d6afbd62 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,8 +1,10 @@ mod ast; +mod cfg; mod cli; mod commands; mod database; mod errors; +mod labels; mod patterns; mod utils; mod walk; @@ -59,6 +61,11 @@ fn main() -> NyxResult<()> { let mut config = Config::load(config_dir)?; + rayon::ThreadPoolBuilder::new() + .stack_size(config.performance.rayon_thread_stack_size) + .build_global() + .expect("set rayon stack size"); + commands::handle_command(cli.command, database_dir, &mut config)?; println!( diff --git a/src/patterns/mod.rs b/src/patterns/mod.rs index 0cc4173d..d90f0194 100644 --- a/src/patterns/mod.rs +++ b/src/patterns/mod.rs @@ -92,7 +92,7 @@ static REGISTRY: Lazy> = Lazy::new(|| m.insert("cpp", cpp::PATTERNS); m.insert("c++", cpp::PATTERNS); - // ---- Other 
languages in the folder ---- + // ---- Other patterns in the folder ---- m.insert("java", java::PATTERNS); m.insert("go", go::PATTERNS); m.insert("php", php::PATTERNS); @@ -101,14 +101,14 @@ static REGISTRY: Lazy> = Lazy::new(|| m.insert("ruby", ruby::PATTERNS); m.insert("rb", ruby::PATTERNS); - tracing::debug!("AST-pattern registry initialised ({} languages)", m.len()); + tracing::debug!("AST-pattern registry initialised ({} patterns)", m.len()); m }); /// Return all patterns for the requested language (case-insensitive). /// -/// Unknown languages yield an **empty** `Vec`. +/// Unknown patterns yield an **empty** `Vec`. pub fn load(lang: &str) -> Vec { let key = lang.to_ascii_lowercase(); REGISTRY.get(key.as_str()).copied().unwrap_or(&[]).to_vec() diff --git a/src/utils/config.rs b/src/utils/config.rs index 5d5339d1..32048e0e 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -8,9 +8,21 @@ use toml; static DEFAULT_CONFIG_TOML: &str = include_str!("../../default-nyx.conf"); +#[derive(Debug, Serialize, Deserialize, Clone, Copy, Default, PartialEq)] +#[serde(rename_all = "lowercase")] +pub enum AnalysisMode { + #[default] + Full, + Ast, + Taint, +} + #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(default)] pub struct ScannerConfig { + /// The analysis mode to use. + pub mode: AnalysisMode, + /// The minimum severity level to output pub min_severity: Severity, @@ -47,6 +59,7 @@ pub struct ScannerConfig { impl Default for ScannerConfig { fn default() -> Self { Self { + mode: AnalysisMode::Full, min_severity: Severity::Low, max_file_size_mb: None, excluded_extensions: vec![ @@ -151,6 +164,9 @@ pub struct PerformanceConfig { /// capacity = threads × this pub channel_multiplier: usize, + /// The stack size for Rayon threads, in bytes. 
+ pub rayon_thread_stack_size: usize, + /// Timeout on individual files // TODO: IMPLEMENT pub scan_timeout_secs: Option, @@ -167,6 +183,7 @@ impl Default for PerformanceConfig { worker_threads: None, batch_size: 100usize, channel_multiplier: 4usize, + rayon_thread_stack_size: 8 * 1024 * 1024, // 8 MiB scan_timeout_secs: None, memory_limit_mb: 512, } @@ -236,6 +253,7 @@ fn create_example_config(config_dir: &Path) -> NyxResult<()> { /// supply new exclusions and overriding everything else. fn merge_configs(mut default: Config, user: Config) -> Config { // --- ScannerConfig --- + default.scanner.mode = user.scanner.mode; default.scanner.min_severity = user.scanner.min_severity; default.scanner.max_file_size_mb = user.scanner.max_file_size_mb; default.scanner.read_global_ignore = user.scanner.read_global_ignore; @@ -277,6 +295,7 @@ fn merge_configs(mut default: Config, user: Config) -> Config { default.performance.worker_threads = user.performance.worker_threads; default.performance.batch_size = user.performance.batch_size; default.performance.channel_multiplier = user.performance.channel_multiplier; + default.performance.rayon_thread_stack_size = user.performance.rayon_thread_stack_size; default.performance.scan_timeout_secs = user.performance.scan_timeout_secs; default.performance.memory_limit_mb = user.performance.memory_limit_mb; diff --git a/src/utils/project.rs b/src/utils/project.rs index 269ee0e8..ca63887f 100644 --- a/src/utils/project.rs +++ b/src/utils/project.rs @@ -9,7 +9,7 @@ pub fn get_project_info(project_path: &Path, config_dir: &Path) -> NyxResult<(St .ok_or_else(|| NyxError::Other("Unable to determine project name".into()))?; let db_name = sanitize_project_name(project_name); - let db_path = config_dir.join(format!("{}.sqlite", db_name)); + let db_path = config_dir.join(format!("{db_name}.sqlite")); Ok((project_name.to_owned(), db_path)) } @@ -41,7 +41,7 @@ fn sanitize_project_name_is_idempotent_and_lossless_enough() { ]; for (input, expected) in
samples { - assert_eq!(sanitize_project_name(input), expected, "input: {}", input); + assert_eq!(sanitize_project_name(input), expected, "input: {input}"); assert_eq!(sanitize_project_name(expected), expected); } }