From 1f2bfe76c120f36466caaecc423e43c1bd861981 Mon Sep 17 00:00:00 2001 From: Eli Peter <54954007+elicpeter@users.noreply.github.com> Date: Sat, 2 May 2026 17:46:45 -0400 Subject: [PATCH] =?UTF-8?q?docs:=20Enhance=20module=20documentation=20acro?= =?UTF-8?q?ss=20various=20files=20for=20clarity=20a=E2=80=A6=20(#62)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs: Enhance module documentation across various files for clarity and completeness * fix: Remove unnecessary blank line in build.rs for cleaner code * docs: Update documentation to improve clarity and consistency in code comments --- .github/workflows/ci.yml | 16 ++ build.rs | 291 +--------------------- src/abstract_interp/path_domain.rs | 4 +- src/ast.rs | 27 +- src/auth_analysis/extract/common.rs | 4 +- src/auth_analysis/mod.rs | 58 ++++- src/auth_analysis/model.rs | 4 +- src/callgraph.rs | 20 +- src/cfg/mod.rs | 18 +- src/cfg_analysis/mod.rs | 48 +++- src/cli.rs | 10 +- src/commands/mod.rs | 8 + src/commands/scan.rs | 12 +- src/constraint/domain.rs | 2 +- src/constraint/lower.rs | 2 +- src/constraint/solver.rs | 2 +- src/convergence_telemetry.rs | 2 +- src/database.rs | 15 +- src/errors.rs | 9 + src/evidence.rs | 15 ++ src/interop.rs | 10 + src/labels/mod.rs | 56 ++++- src/lib.rs | 111 ++++++++- src/output.rs | 9 + src/patterns/mod.rs | 50 +++- src/pointer/analysis.rs | 6 +- src/ssa/heap.rs | 6 +- src/ssa/mod.rs | 18 ++ src/ssa/param_points_to.rs | 4 +- src/state/mod.rs | 53 +++- src/summary/mod.rs | 29 ++- src/summary/points_to.rs | 6 +- src/summary/ssa_summary.rs | 12 +- src/symbol/mod.rs | 12 + src/symex/heap.rs | 6 +- src/taint/backwards.rs | 2 +- src/taint/mod.rs | 75 +++++- src/taint/path_state.rs | 2 +- src/taint/ssa_transfer/mod.rs | 6 +- src/taint/ssa_transfer/state.rs | 6 +- src/taint/ssa_transfer/summary_extract.rs | 2 +- src/utils/config.rs | 16 ++ src/utils/mod.rs | 13 + src/walk.rs | 10 + 44 files changed, 721 insertions(+), 366 deletions(-) 
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 003a7edd..22117a0e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -153,6 +153,22 @@ jobs: exit 1 fi + rustdoc: + name: rustdoc + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + + - uses: actions-rust-lang/setup-rust-toolchain@v1 + with: + toolchain: stable + cache: true + + - name: Check rustdoc links + env: + RUSTDOCFLAGS: "-D warnings" + run: cargo doc --workspace --no-deps --all-features + rust-beta-build: name: rust-beta-build runs-on: ubuntu-latest diff --git a/build.rs b/build.rs index 37fadd66..34f4a9b1 100644 --- a/build.rs +++ b/build.rs @@ -1,9 +1,7 @@ -use std::path::{Path, PathBuf}; +use std::path::Path; use std::process::Command; fn main() { - render_docs_for_rustdoc(); - // Only relevant when the serve feature is active if std::env::var("CARGO_FEATURE_SERVE").is_err() { return; @@ -58,293 +56,6 @@ fn main() { } } -// --------------------------------------------------------------------------- -// Rustdoc / docs.rs: render docs/*.md into $OUT_DIR with relative .md links -// rewritten to absolute github.com/elicpeter/nyx URLs so they resolve when the -// markdown is embedded in rustdoc via #![doc = include_str!(...)]. -// -// Source of truth stays in docs/. Files that don't exist (published-crate -// builds where docs/ wasn't packaged) fall back to a one-line stub so rustdoc -// still compiles. -// --------------------------------------------------------------------------- - -const GH_DOCS_BASE: &str = "https://github.com/elicpeter/nyx/blob/master/docs"; - -struct DocSpec { - /// Path under docs/, e.g. "how-it-works.md" or "detectors/taint.md". - src: &'static str, - /// Output filename in $OUT_DIR. 
- out: &'static str, -} - -const DOC_SPECS: &[DocSpec] = &[ - DocSpec { - src: "how-it-works.md", - out: "lib_intro.md", - }, - DocSpec { - src: "detectors/taint.md", - out: "taint.md", - }, - DocSpec { - src: "detectors/cfg.md", - out: "cfg_analysis.md", - }, - DocSpec { - src: "detectors/state.md", - out: "state.md", - }, - DocSpec { - src: "detectors/patterns.md", - out: "patterns.md", - }, - DocSpec { - src: "auth.md", - out: "auth_analysis.md", - }, -]; - -fn render_docs_for_rustdoc() { - let Ok(out_dir) = std::env::var("OUT_DIR") else { - return; - }; - let out_dir = PathBuf::from(out_dir); - let docs_dir = Path::new("docs"); - - for spec in DOC_SPECS { - let src_path = docs_dir.join(spec.src); - println!("cargo:rerun-if-changed=docs/{}", spec.src); - let out_path = out_dir.join(spec.out); - let rendered = match std::fs::read_to_string(&src_path) { - Ok(raw) => rewrite_doc_links(&raw, spec.src), - Err(_) => format!( - "See [`{base}/{src}`]({base}/{src}).\n", - base = GH_DOCS_BASE, - src = spec.src, - ), - }; - if let Err(e) = std::fs::write(&out_path, rendered) { - println!( - "cargo:warning=failed to write rendered doc {}: {}", - out_path.display(), - e - ); - } - } -} - -/// Render markdown for embedding in rustdoc. -/// -/// 1. Rewrites relative `.md` links to absolute github.com URLs: -/// - inline links: `](path.md)` and `](path.md#anchor)` -/// - reference defs: `[id]: path.md` -/// 2. Labels unmarked fenced code blocks as `text` so rustdoc does not try -/// to compile them as Rust (and choke on Unicode like `→`). -/// 3. Annotates `rust` fences with `,ignore` so rustdoc doesn't try to -/// compile or run prose-level snippets as doctests. GitHub still -/// highlights them as Rust because it keys off the first token. -/// -/// Skips link rewriting inside code fences. Skips link rewriting for URLs -/// that are already absolute (have a scheme), pure anchors (`#section`), -/// or non-`.md` paths. 
-fn rewrite_doc_links(content: &str, source_rel: &str) -> String { - let source_dir = Path::new(source_rel) - .parent() - .map(|p| p.to_string_lossy().into_owned()) - .unwrap_or_default(); - - let mut out = String::with_capacity(content.len() + 256); - let mut in_fence = false; - - for line in content.split_inclusive('\n') { - let body = line.strip_suffix('\n').unwrap_or(line); - let trimmed = body.trim_start(); - if trimmed.starts_with("```") { - let lang = trimmed.trim_start_matches('`').trim(); - if in_fence { - in_fence = false; - out.push_str(line); - } else { - in_fence = true; - let indent_len = body.len() - trimmed.len(); - if lang.is_empty() { - out.push_str(&body[..indent_len]); - out.push_str("```text"); - if line.ends_with('\n') { - out.push('\n'); - } - } else if is_rust_fence_needing_ignore(lang) { - out.push_str(&body[..indent_len]); - out.push_str("```rust,ignore"); - if line.ends_with('\n') { - out.push('\n'); - } - } else { - out.push_str(line); - } - } - continue; - } - if in_fence { - out.push_str(line); - } else { - rewrite_links_in_line(body, &source_dir, &mut out); - if line.ends_with('\n') { - out.push('\n'); - } - } - } - - out -} - -fn rewrite_links_in_line(line: &str, source_dir: &str, out: &mut String) { - let bytes = line.as_bytes(); - let mut i = 0; - while i < bytes.len() { - // Inline link: `](URL)`, markdown URLs do not contain a raw `)`. - if i + 1 < bytes.len() && bytes[i] == b']' && bytes[i + 1] == b'(' { - out.push_str("]("); - i += 2; - let url_start = i; - while i < bytes.len() && bytes[i] != b')' { - i += 1; - } - let url = &line[url_start..i]; - out.push_str(&maybe_rewrite_url(url, source_dir)); - } - // Reference def: `]: URL`. 
- else if i + 2 < bytes.len() - && bytes[i] == b']' - && bytes[i + 1] == b':' - && bytes[i + 2] == b' ' - { - out.push_str("]: "); - i += 3; - let url_start = i; - while i < bytes.len() && bytes[i] != b' ' { - i += 1; - } - let url = &line[url_start..i]; - out.push_str(&maybe_rewrite_url(url, source_dir)); - } else { - // `]` (0x5D) is ASCII; UTF-8 continuation bytes are 0x80-0xBF - // and start bytes are 0xC0+, so byte-level scanning of `]` is - // safe. For non-ASCII bytes, copy the full codepoint at once. - let b = bytes[i]; - if b < 0x80 { - out.push(b as char); - i += 1; - } else { - let len = utf8_seq_len(b); - let end = (i + len).min(bytes.len()); - out.push_str(&line[i..end]); - i = end; - } - } - } -} - -/// True for `rust` / `rust,...` fences that don't already opt out of -/// doctest execution. We rewrite these to `rust,ignore` because the prose -/// snippets in docs/ are illustrative, not standalone-compilable. -fn is_rust_fence_needing_ignore(lang: &str) -> bool { - let mut parts = lang.split(',').map(|p| p.trim()); - let Some(first) = parts.next() else { - return false; - }; - if !first.eq_ignore_ascii_case("rust") { - return false; - } - for tag in parts { - let t = tag.to_ascii_lowercase(); - if t == "ignore" || t == "no_run" || t == "compile_fail" || t == "should_panic" { - return false; - } - } - true -} - -fn utf8_seq_len(lead: u8) -> usize { - // lead < 0xC0 covers ASCII and unexpected continuation bytes; treat both as - // single-byte to make progress. - if lead < 0xC0 { - 1 - } else if lead < 0xE0 { - 2 - } else if lead < 0xF0 { - 3 - } else { - 4 - } -} - -fn maybe_rewrite_url(url: &str, source_dir: &str) -> String { - if url.is_empty() { - return url.to_string(); - } - // Already absolute (scheme://, mailto:, ssh://, etc.), leave alone. - if has_scheme(url) { - return url.to_string(); - } - // Pure anchor, leave alone. - if url.starts_with('#') { - return url.to_string(); - } - // Split off optional anchor. 
- let (path, anchor) = match url.find('#') { - Some(p) => (&url[..p], &url[p..]), - None => (url, ""), - }; - // Only rewrite if the path looks like a markdown file. - if !path.ends_with(".md") { - return url.to_string(); - } - // Resolve relative to source_dir. - let combined = if source_dir.is_empty() { - path.to_string() - } else { - format!("{}/{}", source_dir, path) - }; - let normalised = normalise_path(&combined); - format!("{}/{}{}", GH_DOCS_BASE, normalised, anchor) -} - -fn has_scheme(url: &str) -> bool { - // RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) ":" - let mut chars = url.chars(); - let first = match chars.next() { - Some(c) => c, - None => return false, - }; - if !first.is_ascii_alphabetic() { - return false; - } - for c in chars { - if c == ':' { - return true; - } - if !(c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.')) { - return false; - } - } - false -} - -fn normalise_path(path: &str) -> String { - let mut stack: Vec<&str> = Vec::new(); - for seg in path.split('/') { - match seg { - "" | "." => {} - ".." => { - stack.pop(); - } - other => stack.push(other), - } - } - stack.join("/") -} - fn emit_placeholder_and_warn(dist_dir: &Path) { // Create minimal placeholder files so compilation succeeds std::fs::create_dir_all(dist_dir).ok(); diff --git a/src/abstract_interp/path_domain.rs b/src/abstract_interp/path_domain.rs index d46aa3c3..a50e76a8 100644 --- a/src/abstract_interp/path_domain.rs +++ b/src/abstract_interp/path_domain.rs @@ -214,7 +214,7 @@ impl PathFact { /// Accepts either of two structural invariants: /// /// * `dotdot = No && absolute = No` — the relative-and-`..`-free - /// shape recognised by [`is_path_safe`]. Cannot escape to an + /// shape recognised by `is_path_safe`. Cannot escape to an /// attacker-controlled absolute location. 
/// * `dotdot = No && prefix_lock.is_some()` — a canonicalised path /// (typically `File.expand_path` / `realpath` / `fs::canonicalize`) @@ -866,7 +866,7 @@ pub fn is_structural_variant_ctor_for_lang(lang: crate::symbol::Lang, callee: &s /// [`crate::ssa::type_facts::peel_identity_suffix`]. Other languages do /// not (yet) have an equivalent grammar-driven recogniser; the rejection /// arm in their fixtures returns either an empty string literal (handled -/// by [`SsaOp::Const`] seeding) or `None`/`null`/`nil` (handled by the +/// by `SsaOp::Const` seeding) or `None`/`null`/`nil` (handled by the /// non-data-return skip). pub fn is_zero_arg_allocator_for_lang(lang: crate::symbol::Lang, _callee: &str) -> bool { // Currently a no-op for non-Rust languages: rejection-arm constructors diff --git a/src/ast.rs b/src/ast.rs index 6ce80413..05969e57 100644 --- a/src/ast.rs +++ b/src/ast.rs @@ -1,3 +1,24 @@ +//! Tree-sitter parsing and two-pass analysis for all supported languages. +//! +//! The core type is `ParsedSource`, a thin wrapper around a parsed tree-sitter +//! tree that carries the source bytes and language. Parsing reuses a thread-local +//! [`tree_sitter::Parser`] so each worker thread keeps one live parser instance. +//! +//! ## Two-pass pipeline +//! +//! **Pass 1** (`extract_summaries_from_file`): builds the CFG, lowers to SSA, +//! and extracts a [`crate::summary::FuncSummary`] per function. Summaries +//! describe boundary behaviour: which arguments flow to sinks, which sources +//! the function reads, what taint it strips, and what it returns. +//! +//! **Pass 2** (`run_rules_on_file`): reanalyses each file with the merged +//! [`crate::summary::GlobalSummaries`] from pass 1. The taint engine runs a +//! forward dataflow worklist over SSA, resolving cross-file calls via summaries. +//! +//! Parse timeouts are tracked per-thread via [`take_last_parse_timeout_ms`] +//! so callers can surface the event as an informational diagnostic instead +//! 
of silently skipping the file. + #![allow(clippy::only_used_in_recursion, clippy::type_complexity)] use crate::auth_analysis; @@ -39,7 +60,7 @@ thread_local! { } /// Consume and return the most recent parse-timeout event on this thread -/// (set by [`ParsedSource::try_new`]). Used to lift the event into a +/// (set by `ParsedSource::try_new`). Used to lift the event into a /// synthetic [`Diag`] carrying an [`crate::engine_notes::EngineNote::ParseTimeout`]. pub fn take_last_parse_timeout_ms() -> Option { LAST_PARSE_TIMEOUT_MS.with(|c| c.take()) @@ -647,7 +668,7 @@ fn build_taint_diag( } /// Resolve a file extension to a language slug (e.g. `"rust"`, -/// `"javascript"`). Public façade over [`lang_for_path`] for callers +/// `"javascript"`). Public façade over `lang_for_path` for callers /// that only need the slug, used by the debug API to look up /// per-language rule enablement without re-parsing the file. pub fn lang_slug_for_path(path: &Path) -> Option<&'static str> { @@ -3985,7 +4006,7 @@ pub struct FusedResult { /// /// When `global_summaries` is `None`, the taint engine runs with local /// context only (equivalent to pass 1 + partial pass 2). A second call -/// to [`run_taint_only`] can refine findings with the full cross-file view +/// to `run_taint_only` can refine findings with the full cross-file view /// without re-parsing or re-building the CFG. pub fn analyse_file_fused( bytes: &[u8], diff --git a/src/auth_analysis/extract/common.rs b/src/auth_analysis/extract/common.rs index bb61f0dd..27ac3fbf 100644 --- a/src/auth_analysis/extract/common.rs +++ b/src/auth_analysis/extract/common.rs @@ -2793,7 +2793,7 @@ fn function_params(node: Node<'_>, bytes: &[u8]) -> Vec { params } -/// Variant of [`function_params`] that always includes id-like typed +/// Variant of `function_params` that always includes id-like typed /// Python params (`dag_id: str`, `dag_run_id: str`). 
Used by /// `attach_route_handler` to populate `unit.params` for RouteHandler /// units so middleware-injected auth checks (FastAPI @@ -2802,7 +2802,7 @@ fn function_params(node: Node<'_>, bytes: &[u8]) -> Vec { /// the id-shaped ones that are *the* primary user-controlled data on /// REST routes. /// -/// The id-like filter in [`collect_param_names`] exists to keep +/// The id-like filter in `collect_param_names` exists to keep /// internal helper signatures (`def f(release_id: int, project: /// Project)`) from passing `unit_has_user_input_evidence`'s param /// heuristic, which would over-fire `missing_ownership_check`. Route diff --git a/src/auth_analysis/mod.rs b/src/auth_analysis/mod.rs index f8dcff39..d4595762 100644 --- a/src/auth_analysis/mod.rs +++ b/src/auth_analysis/mod.rs @@ -1,4 +1,60 @@ -#![doc = include_str!(concat!(env!("OUT_DIR"), "/auth_analysis.md"))] +//! Missing authorization and ownership checks (Rust-primary). +//! +//! Detects request handlers that reach a privileged operation taking a scoped +//! identifier (`*_id`, row reference, scoped resource) without a preceding +//! ownership or membership check. +//! +//! Other languages have rule scaffolding (`py.auth.*`, `js.auth.*`, +//! `rb.auth.*`, `go.auth.*`, `java.auth.*`) but only Rust has benchmark +//! corpus coverage and validated precision. Treat non-Rust findings as preview. +//! +//! # Rule IDs +//! +//! | Rule ID | Variant | +//! |---------|---------| +//! | `rs.auth.missing_ownership_check` | Standalone structural analyser (default on) | +//! | `rs.auth.missing_ownership_check.taint` | SSA/taint variant via `Cap::UNAUTHORIZED_ID` (default off) | +//! +//! Enable the taint variant via `scanner.enable_auth_as_taint = true` in +//! `nyx.conf`. Run both together when enabled; if both fire for the same site, +//! treat them as the same finding. +//! +//! # What counts as authorization +//! +//! The analyser accepts any of: +//! 
- A call to a recognised authorization helper (`check_ownership`, +//! `has_permission`, `require_*_member`, etc.; configurable per project). +//! - An ownership-equality check on a row reference +//! (`if owner_id != user.id { return 403 }`). +//! - A self-actor reference from a typed extractor param (`Extension`, +//! `CurrentUser`, etc.) combined with `user.id` / `user.user_id` use. +//! - A typed policy-guard wrapper (`GuardedData, _>`); +//! configured via `policy_guard_names`. +//! - A SQL query joining through an ACL table or filtering by `user_id` +//! predicate (detected without a SQL parser via [`sql_semantics`]). +//! - A helper-summary lift: a called function whose body contains a +//! `require_*_member` call (fixed-point up to 4 iterations). +//! +//! # Sink classification +//! +//! | Class | Examples | Treatment | +//! |-------|---------|-----------| +//! | `InMemoryLocal` | `map.insert`, `vec.push` on local | Never a sink | +//! | `RealtimePublish` | `realtime.publish_to_group` | Sink unless channel scope is ownership-checked | +//! | `OutboundNetwork` | `http.post`, `reqwest::Client::post` | Sink unless sanitizer is on the path | +//! | `CacheCrossTenant` | `redis.set` with scoped keys | Sink unless tenant is checked | +//! | `DbMutation` | `db.insert`, `repo.save` with scoped IDs | Sink unless ownership is established | +//! | `DbCrossTenantRead` | `db.query` returning tenant-scoped rows | Sink unless ACL-join or tenant predicate is present | +//! +//! # Submodules +//! +//! - [`checks`]: ownership-check recognition, actor-context extraction, +//! row-field variable tracking +//! - [`config`]: per-language auth rule defaults and config merging +//! - [`extract`]: handler detection, scoped-ID extraction, summary lifting +//! - [`model`]: `AnalysisUnit`, `AuthCheck`, `SensitiveOperation`, `SinkClass` +//! - [`sql_semantics`]: ACL-join and `user_id`-predicate detection without a +//! 
SQL parser pub mod checks; pub mod config; diff --git a/src/auth_analysis/model.rs b/src/auth_analysis/model.rs index 77113055..35ae3812 100644 --- a/src/auth_analysis/model.rs +++ b/src/auth_analysis/model.rs @@ -253,7 +253,7 @@ pub struct AnalysisUnit { /// Function parameter names whose static type maps to a /// payload-incompatible scalar ([`crate::ssa::type_facts::TypeKind::Int`] /// or [`crate::ssa::type_facts::TypeKind::Bool`]). Populated - /// per-file by [`super::apply_typed_bounded_params`] using the + /// per-file by `apply_typed_bounded_params` using the /// SSA-derived `VarTypes` map. Consulted by /// `is_typed_bounded_subject` so parameters like Spring `Long /// userId`, Axum `Path`, or FastAPI `user_id: int` are not @@ -265,7 +265,7 @@ pub struct AnalysisUnit { /// declared type is a payload-incompatible scalar. Map key is the /// parameter name (e.g. `dto`), value is the list of field names /// (e.g. `["age", "count"]`). Populated by - /// [`super::apply_typed_bounded_params`] only when the parameter + /// `apply_typed_bounded_params` only when the parameter /// itself was recognised as a typed extractor, bare parameters /// with no framework gate never lift their fields. pub typed_bounded_dto_fields: HashMap>, diff --git a/src/callgraph.rs b/src/callgraph.rs index 4b3f8710..b2ffde69 100644 --- a/src/callgraph.rs +++ b/src/callgraph.rs @@ -1,3 +1,15 @@ +//! Whole-program call graph built from pass-1 function summaries. +//! +//! Nodes are [`FuncKey`]s (one per function definition across all files). +//! Edges represent call-site relationships resolved after pass 1 completes. +//! Unresolved and ambiguous callees are tracked separately so they can be +//! surfaced in diagnostics without blocking analysis. +//! +//! [`CallGraphAnalysis`] computes SCCs and topological order. The scanner +//! uses topo order in pass 2 so callees are analysed before their callers, +//! and iterates over SCC groups to a fixed point for mutually recursive +//! functions. 
+ use crate::interop::InteropEdge; use crate::rust_resolve::RustUseMap; use crate::summary::{CalleeQuery, CalleeResolution, GlobalSummaries}; @@ -55,7 +67,7 @@ pub struct CallGraph { pub struct CallGraphAnalysis { /// Strongly connected components. pub sccs: Vec>, - /// Maps each `NodeIndex` to its SCC index in [`sccs`]. + /// Maps each `NodeIndex` to its SCC index in `sccs`. #[allow(dead_code)] // used for future topo-ordered taint propagation pub node_to_scc: HashMap, /// SCC indices in **callee-first** (leaves-first) order. @@ -160,7 +172,7 @@ pub(crate) fn callee_container_hint(raw: &str) -> &str { /// Per-language `(container, method_name)` → candidate [`FuncKey`] index. /// /// Built once per call-graph construction over every merged -/// [`FuncSummary`]. Used by edge insertion to restrict an indirect method +/// [`crate::summary::FuncSummary`]. Used by edge insertion to restrict an indirect method /// call (`receiver.method(...)`) to only those targets whose defining /// container matches the receiver's static type. Without a container /// hint the index falls back to the bare-name list, matching today's @@ -272,7 +284,7 @@ impl ClassMethodIndex { /// /// Covers Java `extends`/`implements`, Rust `impl Trait for Type`, TS /// `extends`/`implements`, Python `class X(Base)`, plus PHP/Ruby/C++ -/// (see [`crate::cfg::hierarchy`]). Go's structural interfaces are +/// (see `crate::cfg::hierarchy`). Go's structural interfaces are /// intentionally omitted, name-only resolution is used instead. /// /// Container names are bare (no namespace), so cross-namespace aliases @@ -804,7 +816,7 @@ pub fn analyse(cg: &CallGraph) -> CallGraphAnalysis { /// such SCC has nodes in more than one file (`cross_file`). /// /// `has_mutual_recursion` triggers the SCC fixed-point loop in -/// [`crate::commands::scan::run_topo_batches`]. `cross_file` is a tighter +/// `run_topo_batches`. 
`cross_file` is a tighter /// signal used by joint fixed-point convergence: it implies the /// recursion involves at least one cross-file call edge, so the inline /// cache and per-iteration findings need joint convergence, not just diff --git a/src/cfg/mod.rs b/src/cfg/mod.rs index 59651c35..d5af03a6 100644 --- a/src/cfg/mod.rs +++ b/src/cfg/mod.rs @@ -1,3 +1,17 @@ +//! Intra-procedural control-flow graph construction. +//! +//! Walks tree-sitter ASTs for all ten supported languages and builds a +//! [`Cfg`] (a petgraph `DiGraph`) per function. +//! [`NodeInfo`] carries the statement kind, label classification, callee +//! name, taint and gate metadata. [`EdgeKind`] distinguishes normal flow, +//! true/false branches, and exception edges. +//! +//! `build_cfg` is the main entry point: given a parsed tree and language, +//! it produces a [`FileCfg`] (one [`Cfg`] per function in the file) along +//! with a [`FuncSummaries`] map for pass-1 summary extraction. +//! `export_summaries` converts in-graph [`LocalFuncSummary`] values to +//! the serializable [`crate::summary::FuncSummary`] form. + #![allow( clippy::collapsible_if, clippy::let_and_return, @@ -65,7 +79,7 @@ use params::{ is_configured_terminator, }; -/// Test-only re-export of [`extract_param_meta`] so the external +/// Test-only re-export of `extract_param_meta` so the external /// `tests/typed_extractors_audit.rs` harness can drive the per-param /// classifier directly without spinning up the full scan pipeline. /// Projects away the destructured-siblings third tuple slot so the @@ -675,7 +689,7 @@ pub struct FileCfg { /// per-file class / trait / interface hierarchy edges. /// Each entry is `(sub_container, super_container)` after /// language-specific normalisation. See - /// [`crate::cfg::hierarchy`] for the per-language extraction + /// `crate::cfg::hierarchy` for the per-language extraction /// rules and [`crate::callgraph::TypeHierarchyIndex`] for the /// downstream consumer. 
Empty for languages without an /// extractor (Go, C) and for files with no inheritance / impl diff --git a/src/cfg_analysis/mod.rs b/src/cfg_analysis/mod.rs index 54630d1f..5526f5fe 100644 --- a/src/cfg_analysis/mod.rs +++ b/src/cfg_analysis/mod.rs @@ -1,4 +1,50 @@ -#![doc = include_str!(concat!(env!("OUT_DIR"), "/cfg_analysis.md"))] +//! CFG structural analysis: dominator-based checks over intra-procedural CFGs. +//! +//! Checks structural properties that the taint engine cannot: whether sinks are +//! guarded by sanitizers or validators, whether web handlers reach privileged +//! sinks without an auth call, whether resources are released on all exit paths, +//! and whether error paths terminate before reaching dangerous code. +//! +//! A guard dominates a sink when the guard must execute before the sink on +//! every path from function entry. +//! +//! # Rule IDs +//! +//! | Rule ID | Severity | What it checks | +//! |---------|----------|----------------| +//! | `cfg-unguarded-sink` | High/Medium | Sink reachable from entry without a matching guard | +//! | `cfg-auth-gap` | High | Web handler reaches privileged sink with no auth call | +//! | `cfg-unreachable-sink` | Medium | Sink in dead code | +//! | `cfg-unreachable-sanitizer` | Low | Sanitizer in dead code (may have been silently disabled) | +//! | `cfg-unreachable-source` | Low | Source in dead code | +//! | `cfg-error-fallthrough` | High/Medium | Error path does not terminate before a dangerous call | +//! | `cfg-resource-leak` | Medium | Resource acquired but not released on all exit paths | +//! | `cfg-lock-not-released` | Medium | Lock acquired but not released on all exit paths | +//! +//! # Recognised guards +//! +//! `validate*`, `sanitize*`, `check_*`, `verify_*`, `assert_*`, +//! `shell_escape`, `html_escape`, `url_encode`, `which`. +//! +//! # Recognised auth names +//! +//! `is_authenticated`, `require_auth`, `check_permission`, `authorize`, +//! 
`authenticate`, `require_login`, `check_auth`, `verify_token`, +//! `validate_token` (cross-language), plus `isAuthenticated`, +//! `checkPermission`, `hasAuthority`, `hasRole` (Java) and +//! `middleware.auth`, `auth.required` (Go). +//! +//! Custom guards and auth functions can be added as `sanitizer` rules +//! with `cap = "all"` in `nyx.conf`. +//! +//! # Submodules +//! +//! - [`auth`]: auth-gap detection, handler classification +//! - [`dominators`]: dominator tree computation over CFG nodes +//! - [`error_handling`]: error-fallthrough detection +//! - [`guards`]: guard recognition and dominator queries +//! - [`resources`]: resource-leak and lock-not-released detection +//! - [`rules`]: finding construction and rule ID assignment pub mod auth; pub mod dominators; diff --git a/src/cli.rs b/src/cli.rs index b6a6ebb7..64f98b10 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -1,3 +1,11 @@ +//! Command-line interface definition via clap. +//! +//! Defines [`Cli`] (the top-level parser) and the [`Commands`] enum of +//! subcommands. Helpers on [`Commands`] answer routing questions the binary +//! needs without pattern-matching on specific arms: [`Commands::effective_format`], +//! [`Commands::is_structured_output`], [`Commands::is_serve`], and +//! [`Commands::is_informational`]. + use clap::{Parser, Subcommand, ValueEnum}; use serde::{Deserialize, Serialize}; @@ -250,7 +258,7 @@ pub enum Commands { #[arg(long, help_heading = "Output")] no_rank: bool, - /// Show inline-suppressed findings (dimmed, tagged [SUPPRESSED]) + /// Show inline-suppressed findings (dimmed, tagged \[SUPPRESSED\]) #[arg(long, help_heading = "Output")] show_suppressed: bool, diff --git a/src/commands/mod.rs b/src/commands/mod.rs index 18d50749..307deee0 100644 --- a/src/commands/mod.rs +++ b/src/commands/mod.rs @@ -1,3 +1,11 @@ +//! Subcommand handlers and top-level dispatch. +//! +//! [`handle_command`] is the single entry point from `main`. It installs +//! 
analysis engine options from the resolved config, then routes to the +//! appropriate subcommand module (scan, clean, config, index, list, serve). +//! CLI flags that override config values are applied per-arm before the +//! handler runs. + pub mod clean; pub mod config; pub mod index; diff --git a/src/commands/scan.rs b/src/commands/scan.rs index 470f8f31..30d5811f 100644 --- a/src/commands/scan.rs +++ b/src/commands/scan.rs @@ -117,10 +117,20 @@ fn fail_if_persist_errors(stage: &str, errors: Arc>>) -> NyxRe #[derive(Debug, Clone, serde::Serialize, serde::Deserialize)] pub struct Diag { + /// Project-relative path of the file containing the finding. pub path: String, + /// 1-based line number of the sink location. pub line: usize, + /// 0-based column offset of the sink location. pub col: usize, + /// Finding severity (Critical / High / Medium / Low / Info). pub severity: Severity, + /// Rule identifier, e.g. `taint-unsanitised-flow`, `cfg-auth-gap`, + /// `rs.auth.missing_ownership_check`. Taint findings append a + /// source-location suffix (`"taint-unsanitised-flow (source 12:3)"`) + /// so sibling paths with the same sink have distinct IDs for + /// deduplication; [`crate::evidence::Evidence::sink_caps`] disambiguates + /// findings at the same `(path, line, col)` that reach different sinks. pub id: String, /// High-level finding category (Security, Reliability, Quality). pub category: FindingCategory, @@ -871,7 +881,7 @@ static LAST_TOPO_NONRECURSIVE_REFINEMENTS: AtomicUsize = AtomicUsize::new(0); /// Returns the cumulative count of non-recursive batch refinements /// (summary + ssa-summary + body + auth inserts) persisted to -/// `global_summaries` during the most recent [`run_topo_batches`] call. +/// `global_summaries` during the most recent `run_topo_batches` call. /// Reset to zero at the start of each invocation. 
pub fn last_topo_nonrecursive_refinements() -> usize { LAST_TOPO_NONRECURSIVE_REFINEMENTS.load(Ordering::Relaxed) diff --git a/src/constraint/domain.rs b/src/constraint/domain.rs index 7fb6a937..d3451a45 100644 --- a/src/constraint/domain.rs +++ b/src/constraint/domain.rs @@ -322,7 +322,7 @@ impl BoolState { pub struct ValueFact { /// Exact known constant (Eq constraint). `None` = unconstrained. pub exact: Option, - /// Excluded constant values (Neq constraints). Bounded by [`MAX_NEQ`]. + /// Excluded constant values (Neq constraints). Bounded by `MAX_NEQ`. pub excluded: SmallVec<[ConstValue; 4]>, /// Inclusive lower bound (`None` = −∞). pub lo: Option, diff --git a/src/constraint/lower.rs b/src/constraint/lower.rs index d2cc0de3..d257bbc2 100644 --- a/src/constraint/lower.rs +++ b/src/constraint/lower.rs @@ -204,7 +204,7 @@ pub fn lower_condition( /// Called during SSA lowering when the full [`SsaBody`] is not yet available. /// Resolves variables via `var_stacks[name].last()` (the current reaching /// definition) instead of scanning `value_defs`. Does not use `const_values` -/// (unavailable at lowering time); constants are seeded into [`PathEnv`] +/// (unavailable at lowering time); constants are seeded into [`crate::constraint::PathEnv`] /// separately via `seed_from_optimization`. pub fn lower_condition_with_stacks( cond_info: &NodeInfo, diff --git a/src/constraint/solver.rs b/src/constraint/solver.rs index c8573a19..5cbcf6e8 100644 --- a/src/constraint/solver.rs +++ b/src/constraint/solver.rs @@ -200,7 +200,7 @@ fn apply_value_const(env: &mut PathEnv, v: crate::ssa::ir::SsaValue, op: CompOp, /// Resolution order: /// 1. Cross-language primitive aliases (case-insensitive) /// 2. Java/Ruby/Go class and framework names (case-sensitive) -/// 3. Java type hierarchy fallback (case-sensitive, via [`TypeHierarchy`]) +/// 3. 
Java type hierarchy fallback (case-sensitive, via [`crate::ssa::type_facts::TypeHierarchy`]) pub fn parse_type_name(name: &str) -> Option { use crate::ssa::type_facts::TypeHierarchy; diff --git a/src/convergence_telemetry.rs b/src/convergence_telemetry.rs index 6a6d46a7..b89d4ef9 100644 --- a/src/convergence_telemetry.rs +++ b/src/convergence_telemetry.rs @@ -29,7 +29,7 @@ pub enum ConvergenceEvent { /// Per-batch record for the SCC fix-point loop. /// /// Populated once per batch entry in -/// [`crate::commands::scan::run_topo_batches`] that hits the +/// `run_topo_batches` that hits the /// `has_mutual_recursion` branch. #[derive(Clone, Debug, Serialize, Deserialize)] pub struct SccBatchRecord { diff --git a/src/database.rs b/src/database.rs index 13253d97..fd7885c2 100644 --- a/src/database.rs +++ b/src/database.rs @@ -1,3 +1,14 @@ +//! SQLite connection pool and schema for the incremental index. +//! +//! The index stores file content hashes, per-file scan results, and function +//! summaries so subsequent scans can skip files whose content has not changed. +//! The pool is backed by [`r2d2`] with WAL journaling, `synchronous=NORMAL`, +//! and memory-mapped I/O tuned for large codebases. +//! +//! Tables: `files`, `issues`, `function_summaries`, `ssa_function_summaries`. +//! SSA-specific persistence lives in [`crate::summary::ssa_summary`]; routines +//! here cover function summaries and file-level hash bookkeeping. + pub mod index { #![allow(clippy::too_many_arguments, clippy::type_complexity)] @@ -615,7 +626,7 @@ pub mod index { }) } - /// Like [`should_scan`] but accepts a pre-computed hash to avoid + /// Like `should_scan` but accepts a pre-computed hash to avoid /// redundant file reads. 
pub fn should_scan_with_hash(&self, path: &Path, hash: &[u8]) -> NyxResult { let row: Option> = self @@ -673,7 +684,7 @@ pub mod index { /// (`file_id, rule_id, line, col`) to defend against upstream bugs /// that produce same-keyed diagnostics with differing severity or /// cosmetic fields. The first-seen row wins; upstream - /// [`crate::ast::ParsedSource::finalize_diags`] sorts so that high + /// `ParsedSource::finalize_diags` sorts so that high /// severity comes first, and this fallback preserves that ordering. pub fn replace_issues<'a>( &mut self, diff --git a/src/errors.rs b/src/errors.rs index e672080d..01fdedb9 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -1,3 +1,12 @@ +//! Error types used throughout the scanner. +//! +//! [`NyxError`] wraps I/O, TOML parse, SQLite, tree-sitter, and connection-pool +//! errors into a single enum. [`NyxResult`] is the standard return type alias. +//! +//! [`ConfigError`] and [`ConfigErrorKind`] carry structured config-validation +//! diagnostics (section, field, message, kind) so callers can format them +//! consistently without ad-hoc string matching. + use serde::Serialize; use serde::de::StdError; use std::fmt; diff --git a/src/evidence.rs b/src/evidence.rs index e7208e7c..d46785c3 100644 --- a/src/evidence.rs +++ b/src/evidence.rs @@ -60,10 +60,15 @@ impl FromStr for Confidence { #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum FlowStepKind { + /// A source read: user input, environment variable, network data, etc. Source, + /// A local assignment propagating taint from one variable to another. Assignment, + /// A function call through which taint flows (via argument or return value). Call, + /// An SSA phi node merging tainted values from multiple predecessors. Phi, + /// The dangerous sink where tainted data is consumed. Sink, } @@ -82,19 +87,29 @@ impl fmt::Display for FlowStepKind { /// A single step in a taint flow path (display-ready). 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct FlowStep { + /// 1-based position of this step in the flow (source = 1, sink = N). pub step: u32, pub kind: FlowStepKind, + /// Project-relative file path where this step occurs. pub file: String, + /// 1-based line number of the operation. pub line: u32, + /// 0-based column offset of the operation. pub col: u32, + /// Source code snippet at this location, if available. #[serde(default, skip_serializing_if = "Option::is_none")] pub snippet: Option, + /// SSA variable name carrying taint at this step. #[serde(default, skip_serializing_if = "Option::is_none")] pub variable: Option, + /// For [`FlowStepKind::Call`] steps, the name of the function called. #[serde(default, skip_serializing_if = "Option::is_none")] pub callee: Option, + /// Name of the enclosing function at this step. #[serde(default, skip_serializing_if = "Option::is_none")] pub function: Option, + /// True when this step crosses a file boundary, resolved via a cross-file + /// summary rather than direct SSA flow. #[serde(default, skip_serializing_if = "std::ops::Not::not")] pub is_cross_file: bool, } diff --git a/src/interop.rs b/src/interop.rs index ab2023ea..03ec863f 100644 --- a/src/interop.rs +++ b/src/interop.rs @@ -1,3 +1,13 @@ +//! Explicit cross-language call-graph bridge edges. +//! +//! Without an [`InteropEdge`], the call graph resolver never attempts +//! cross-language resolution. This prevents false positives from functions +//! in different languages that happen to share a name. +//! +//! An [`InteropEdge`] maps a [`CallSiteKey`] (caller language, file, function, +//! callee symbol, call ordinal) to a [`FuncKey`] in another language. Ordinal +//! `0` acts as a wildcard matching any call of that name from the given caller. + use crate::symbol::{FuncKey, Lang}; /// Identifies a specific call site within a caller function. 
diff --git a/src/labels/mod.rs b/src/labels/mod.rs index 9f5a378f..d94829c1 100644 --- a/src/labels/mod.rs +++ b/src/labels/mod.rs @@ -1,3 +1,16 @@ +//! Per-language source, sanitizer, and sink rule registries. +//! +//! The central type is [`DataLabel`], which pairs a [`Cap`] bitflag set with +//! a role (Source, Sanitizer, Sink). [`LabelRule`] maps AST text patterns to +//! labels. [`classify`] and [`classify_all`] look up a callee name against +//! the active language's rule table; [`classify_gated_sink`] handles +//! argument-role-aware sinks where one argument controls whether the call is +//! dangerous at all. +//! +//! Rules for each language live in per-language submodules (`rust`, `java`, +//! `go`, `python`, `php`, `ruby`, `javascript`, `typescript`, `c`, `cpp`). +//! The [`Cap`] bitflag type is defined here and shared with the taint engine. + mod c; mod cpp; mod go; @@ -125,19 +138,58 @@ pub struct SinkGate { } bitflags! { + /// Security capability bits for sources, sanitizers, and sinks. + /// + /// Each bit represents a security-relevant property. The meaning depends on + /// which role the [`Cap`] value is attached to: + /// + /// - **Source**: which attack classes this tainted value can potentially + /// trigger. Sources usually carry [`Cap::all()`] so they match any sink. + /// [`ENV_VAR`](Cap::ENV_VAR) is an exception — it marks origin rather + /// than reach. + /// - **Sanitizer**: which attack classes this function strips. A sanitizer + /// labelled with [`HTML_ESCAPE`](Cap::HTML_ESCAPE) clears the XSS-relevant + /// bits from tainted values that flow through it. + /// - **Sink**: which capability bits must be present on the incoming tainted + /// value for a finding to fire. A SQL sink requires [`SQL_QUERY`](Cap::SQL_QUERY). + /// + /// In practice: a finding fires when a tainted value reaches a sink and + /// `(value_caps & sink_caps) != 0`. 
#[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct Cap: u16 { + /// Taint that originated from an environment variable read. + /// Used as a source-origin marker for env-injection rules. const ENV_VAR = 0b0000_0000_0000_0001; // bit 0 + /// Sanitizer: the value has passed through HTML entity escaping. + /// Strips XSS risk from values that reach HTML output sinks. const HTML_ESCAPE = 0b0000_0000_0000_0010; // bit 1 + /// Sanitizer: the value has been shell-argument escaped. + /// Strips command-injection risk before shell sinks. const SHELL_ESCAPE = 0b0000_0000_0000_0100; // bit 2 + /// Sanitizer: the value has been percent-encoded for use in a URL. const URL_ENCODE = 0b0000_0000_0000_1000; // bit 3 + /// Sanitizer: the value was parsed through a structured JSON decoder + /// (as opposed to `eval`-based or regex parsing). const JSON_PARSE = 0b0000_0000_0001_0000; // bit 4 + /// Sink: file system read or write operation (path traversal, arbitrary + /// file read/write). const FILE_IO = 0b0000_0000_0010_0000; // bit 5 + /// Sink: format string injection (e.g. `printf`-family, `String.format`). const FMT_STRING = 0b0000_0000_0100_0000; // bit 6 + /// Sink: SQL query construction. Fires for string-concatenated queries + /// and parameterized-query builders where the query text itself is tainted. const SQL_QUERY = 0b0000_0000_1000_0000; // bit 7 + /// Sink: unsafe object deserialization (Java `ObjectInputStream`, + /// Python `pickle`, Ruby `Marshal`, PHP `unserialize`, etc.). const DESERIALIZE = 0b0000_0001_0000_0000; // bit 8 + /// Sink: server-side request forgery. Fires when attacker-controlled + /// data reaches the destination URL of an outbound HTTP request. const SSRF = 0b0000_0010_0000_0000; // bit 9 + /// Sink: code or command execution (shell injection, `eval`, `exec`, + /// dynamic `require`/`import`, template injection). 
const CODE_EXEC = 0b0000_0100_0000_0000; // bit 10 + /// Sink: cryptographic operation with a tainted algorithm name or seed + /// (weak-crypto / predictable-randomness patterns). const CRYPTO = 0b0000_1000_0000_0000; // bit 11 /// Request-bound, caller-supplied identifier that has not yet been /// validated against an ownership/membership check. Used as the @@ -747,7 +799,7 @@ fn phase_c_auth_rules_for_lang(lang_slug: &str) -> Vec { } } -/// Public re-export used by [`crate::ast::ParsedFile::from_source`] to +/// Public re-export used by `ParsedFile::from_source` to /// augment per-file rule sets when imports reveal frameworks that the /// manifest-level detector missed. pub fn framework_rules_for_lang_pub( @@ -1207,7 +1259,7 @@ pub fn classify_gated_sink( out } -/// Public wrapper for [`normalize_chained_call`] so callers outside the module +/// Public wrapper for `normalize_chained_call` so callers outside the module /// can share the same normalization used by the label classifier. pub fn normalize_chained_call_for_classify(text: &str) -> String { normalize_chained_call(text) diff --git a/src/lib.rs b/src/lib.rs index 39c486a1..93815af7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,14 +1,92 @@ -//! Multi-language static vulnerability scanner. Tree-sitter parsing, petgraph -//! CFGs, SSA-based dataflow, and cross-file taint analysis with a -//! capability-based sanitizer system. Supports Rust, C, C++, Java, Go, PHP, -//! Python, Ruby, TypeScript, and JavaScript. +//! Multi-language static vulnerability scanner. //! -//! The handbook below is embedded verbatim from -//! [`docs/how-it-works.md`](https://github.com/elicpeter/nyx/blob/master/docs/how-it-works.md). +//! Tree-sitter parsing, petgraph CFGs, SSA-based dataflow, and cross-file +//! taint analysis with a capability-based sanitizer system. Supports Rust, +//! C, C++, Java, Go, PHP, Python, Ruby, TypeScript, and JavaScript. +//! +//! This crate is both the `nyx` binary and a library for programmatic +//! 
scanning. Most internal modules are public for testing and downstream +//! tooling, but the stable contract is [`scan_no_index`] plus the types +//! it returns. +//! +//! For a description of how the analysis pipeline works, see the +//! [how-it-works handbook](https://github.com/elicpeter/nyx/blob/master/docs/how-it-works.md). //! Per-detector documentation lives on the [`taint`], [`cfg_analysis`], -//! [`state`], [`patterns`], and [`auth_analysis`] modules. The primary -//! library entry point for tests and embedders is [`scan_no_index`]. -#![doc = include_str!(concat!(env!("OUT_DIR"), "/lib_intro.md"))] +//! [`state`], [`patterns`], and [`auth_analysis`] module pages. +//! +//! # Entry points +//! +//! [`scan_no_index`] runs a full two-pass scan over a directory tree and +//! returns a flat list of [`commands::scan::Diag`] values. It does not +//! touch a SQLite index; every file is analysed from disk on each call. +//! +//! ```no_run +//! use nyx_scanner::{scan_no_index, utils::Config}; +//! use std::path::Path; +//! +//! let config = Config::default(); +//! let findings = scan_no_index(Path::new("/path/to/project"), &config).unwrap(); +//! for diag in &findings { +//! println!("{} at {}:{}", diag.id, diag.path, diag.line); +//! } +//! ``` +//! +//! For incremental rescanning backed by a SQLite index, use +//! [`commands::scan::scan_with_index_parallel`] directly. +//! +//! # Key types +//! +//! | Type | Purpose | +//! |------|---------| +//! | [`utils::config::Config`] | Top-level scanner config (load from `nyx.conf` or construct in code) | +//! | [`commands::scan::Diag`] | A single finding: location, severity, rule ID, structured evidence | +//! | [`evidence::Evidence`] | Source/sink spans, flow steps, sanitizer annotations, engine notes | +//! | [`evidence::Confidence`] | Low / Medium / High confidence tag | +//! | [`labels::Cap`] | Bitflag capability set describing what a taint flow can reach | +//! | [`symbol::Lang`] | Supported language enum | +//! 
| [`symbol::FuncKey`] | Canonical cross-file function identity | +//! +//! # Reading findings +//! +//! Each [`commands::scan::Diag`] carries: +//! +//! - `path`, `line`, `col` — source location of the sink +//! - `id` — rule identifier (e.g. `taint-unsanitised-flow`, `cfg-auth-gap`) +//! - `severity` — Critical / High / Medium / Low / Info +//! - `confidence` — Low / Medium / High; capped at Medium when an engine +//! budget was hit +//! - `rank_score` — deterministic attack-surface score for truncation ordering +//! - `evidence` — optional [`evidence::Evidence`] with source/sink spans, +//! flow steps, and [`engine_notes::EngineNote`] values describing precision loss +//! +//! Engine notes communicate when a bound was hit. A finding carrying +//! `EngineNote::OriginsTruncated` or `EngineNote::SccBudgetExhausted` is +//! still real, but the engine had less information than it would have had +//! without the cap. +//! +//! # Module map +//! +//! | Module | Role | +//! |--------|------| +//! | [`ast`] | Tree-sitter parsing and two-pass analysis dispatch | +//! | [`mod@cfg`] | CFG construction from ASTs | +//! | [`ssa`] | SSA lowering and optimization passes | +//! | [`taint`] | Forward SSA taint analysis | +//! | [`cfg_analysis`] | Structural CFG checks (auth gaps, resource leaks, error paths) | +//! | [`state`] | Resource lifecycle and state-machine analysis | +//! | [`patterns`] | Pattern-based AST checks | +//! | [`auth_analysis`] | Missing authorization / ownership checks | +//! | [`callgraph`] | Whole-program call graph and SCC analysis | +//! | [`summary`] | Per-function summaries for cross-file resolution | +//! | [`labels`] | Source, sanitizer, and sink rule registries per language | +//! | [`symex`] | Symbolic execution for witness generation and path feasibility | +//! | [`abstract_interp`] | Interval and string bounds propagation for sink suppression | +//! | [`constraint`] | Path constraint solving and infeasible-path pruning | +//! 
| [`evidence`] | Finding provenance and confidence types | +//! | [`suppress`] | Inline `nyx:ignore` directive handling | +//! | [`output`] | JSON and SARIF serialization | +//! | [`database`] | SQLite index pool and schema | +//! | [`walk`] | Filesystem traversal with batched delivery | pub mod abstract_interp; pub mod ast; @@ -48,8 +126,19 @@ use errors::NyxResult; use std::path::Path; use utils::config::Config; -/// Run a two-pass scan without index (filesystem only). -/// This is the primary entry point for integration tests. +/// Run a two-pass scan over `root` without an incremental index. +/// +/// Every file under `root` is analysed from disk on each call; no SQLite +/// state is read or written. The walker respects `.gitignore` files when +/// `cfg.scanner.read_vcsignore` is true (the default), skips hidden files +/// and symlinks unless the config enables them, and excludes the directories +/// and extensions listed in `cfg.scanner.excluded_*`. +/// +/// Returns one [`commands::scan::Diag`] per finding. The list is unsorted; +/// call [`rank::rank_diags`] if you need findings ordered by exploitability. +/// +/// For indexed / incremental rescanning use +/// [`commands::scan::scan_with_index_parallel`] instead. pub fn scan_no_index(root: &Path, cfg: &Config) -> NyxResult> { commands::scan::scan_filesystem(root, cfg, false) } diff --git a/src/output.rs b/src/output.rs index 73b5d5d2..24711f3e 100644 --- a/src/output.rs +++ b/src/output.rs @@ -1,3 +1,12 @@ +//! Finding serialization and output routing. +//! +//! Serializes [`crate::commands::scan::Diag`] values to console, JSON, or +//! SARIF based on the requested format. `PATTERN_DESCRIPTIONS` is a +//! lazily-built map from pattern ID to human-readable description, populated +//! from all language registries on first access. `sarif_base_id` normalizes +//! source-location-suffixed finding IDs (like `"taint-unsanitised-flow (source 12:3)"`) +//! to the canonical SARIF rule ID form. 
+ use crate::commands::scan::Diag; use crate::patterns::{self, Severity}; use once_cell::sync::Lazy; diff --git a/src/patterns/mod.rs b/src/patterns/mod.rs index dc30f839..5777a419 100644 --- a/src/patterns/mod.rs +++ b/src/patterns/mod.rs @@ -1,4 +1,52 @@ -#![doc = include_str!(concat!(env!("OUT_DIR"), "/patterns.md"))] +//! AST pattern matching: tree-sitter queries over dangerous structural shapes. +//! +//! Patterns match constructs based on syntax alone, with no dataflow or CFG. +//! A match means the construct is present; it is not proof that it is +//! reachable or exploitable. Patterns run in every analysis mode and are the +//! only active detector in `--mode ast`. +//! +//! # Rule ID format +//! +//! ```text +//! <lang>.<category>.<specific> +//! ``` +//! +//! Examples: `js.code_exec.eval`, `py.deser.pickle_loads`, `c.memory.gets`, +//! `java.sqli.execute_concat`. +//! +//! # Tiers +//! +//! - **Tier A**: structural presence alone is high-signal. `gets`, `eval`, +//! `pickle.loads`, `mem::transmute`. No guard needed. +//! - **Tier B**: pattern includes a tree-sitter heuristic guard. +//! `java.sqli.execute_concat` fires only when `executeQuery` receives a +//! `binary_expression` (concatenation), not a literal or parameterized call. +//! +//! # Categories +//! +//! | Category | Examples | +//! |----------|---------| +//! | `CommandExec` | `system`, `os.system`, `Runtime.exec`, backticks | +//! | `CodeExec` | `eval`, `Function`, PHP `assert("string")`, `class_eval` | +//! | `Deserialization` | `pickle.loads`, `yaml.load`, `Marshal.load`, `readObject` | +//! | `SqlInjection` | `executeQuery` with concatenated argument (Tier B) | +//! | `PathTraversal` | PHP `include $var` | +//! | `Xss` | `innerHTML`, `document.write`, `insertAdjacentHTML` | +//! | `Crypto` | `md5`, `sha1`, `Math.random` for security use | +//! | `Secrets` | Hardcoded API keys (Go, JS, TS) | +//! | `InsecureTransport` | `InsecureSkipVerify`, `fetch("http://...")` | +//!
| `Reflection` | `Class.forName`, `Method.invoke`, `constantize` | +//! | `MemorySafety` | `transmute`, `unsafe`, `gets`, `strcpy`, `sprintf` | +//! | `Prototype` | `__proto__` assignment, `Object.prototype.*` | +//! | `Config` | CORS dynamic origin, `rejectUnauthorized: false` | +//! | `CodeQuality` | `unwrap`, `panic!`, `as any` | +//! +//! # Pattern loading +//! +//! Each language submodule exports a `patterns()` function returning +//! `&'static [Pattern]`. [`load`] dispatches to the correct submodule by +//! language slug. [`Pattern`] carries the rule ID, severity, confidence, +//! category, and the tree-sitter query string. pub mod c; pub mod cpp; diff --git a/src/pointer/analysis.rs b/src/pointer/analysis.rs index c0a6a743..3ce98c3c 100644 --- a/src/pointer/analysis.rs +++ b/src/pointer/analysis.rs @@ -43,7 +43,7 @@ fn is_container_read_callee(callee: &str) -> bool { ) } -/// Container-write callees, mirror of [`is_container_read_callee`]. +/// Container-write callees, mirror of `is_container_read_callee`. pub fn is_container_write_callee(callee: &str) -> bool { let bare = match callee.rsplit_once('.') { Some((_, m)) => m, @@ -66,7 +66,7 @@ pub fn is_container_write_callee(callee: &str) -> bool { ) } -/// Public re-export of [`is_container_read_callee`] for the taint engine. +/// Public re-export of `is_container_read_callee` for the taint engine. pub fn is_container_read_callee_pub(callee: &str) -> bool { is_container_read_callee(callee) } @@ -92,7 +92,7 @@ pub fn is_container_read_callee_pub(callee: &str) -> bool { /// /// Receiver (`SelfParam`) reads/writes are recorded under the /// [`u32::MAX`] sentinel parameter index, mirroring the convention in -/// [`crate::summary::ssa_summary::SsaFuncSummary::receiver_to_*`]. +/// `SsaFuncSummary::receiver_to_*` fields. 
/// /// The container-element sentinel field [`FieldId::ELEM`] is recorded /// under the special name `""` so callers can recognise the diff --git a/src/ssa/heap.rs b/src/ssa/heap.rs index 51e5bc53..9f022945 100644 --- a/src/ssa/heap.rs +++ b/src/ssa/heap.rs @@ -10,7 +10,7 @@ //! - PointsToSet is bounded to `analysis.engine.max_pointsto` entries //! (default 32, widening on overflow, see [`effective_max_pointsto`]). //! Overflow drops emit an [`crate::engine_notes::EngineNote::PointsToTruncated`] -//! note and increment [`POINTSTO_TRUNCATION_COUNT`] so operators can +//! note and increment `POINTSTO_TRUNCATION_COUNT` so operators can //! tell when the cap is firing on their corpus. //! - HeapState tracks per-(heap-object, slot) taint (monotone lattice) //! - HeapSlot::Index(u64) for constant-index container access (proven by const propagation) @@ -168,7 +168,7 @@ impl PointsToSet { /// /// Truncates to [`effective_max_pointsto`]; any heap-object member /// that would be admitted after the cap is reached is dropped and - /// counted via [`record_pointsto_truncation`]. Truncation is + /// counted via `record_pointsto_truncation`. Truncation is /// deterministic: the merge proceeds in sorted order, so survivors /// are always the smallest `HeapObjectId`s across the two inputs. pub fn union(&self, other: &Self) -> Self { @@ -230,7 +230,7 @@ impl PointsToSet { /// /// When the set is already at [`effective_max_pointsto`], the new id /// is dropped and the drop is counted via - /// [`record_pointsto_truncation`]. + /// `record_pointsto_truncation`. pub fn insert(&mut self, id: HeapObjectId) { match self.ids.binary_search(&id) { Ok(_) => {} // already present diff --git a/src/ssa/mod.rs b/src/ssa/mod.rs index 9e2f693e..b56c60cc 100644 --- a/src/ssa/mod.rs +++ b/src/ssa/mod.rs @@ -1,3 +1,21 @@ +//! SSA IR, lowering, and optimization passes. +//! +//! The pipeline converts a CFG into a pruned SSA body consumed by the taint +//! analysis engine. 
[`lower_to_ssa`] inserts phi nodes via Cytron's algorithm +//! and renames variables along the dominator tree. [`optimize_ssa`] runs +//! constant propagation, branch pruning, copy propagation, DCE, and type +//! fact analysis in sequence. +//! +//! Key submodules: +//! - [`ir`]: core types (`SsaValue`, `SsaOp`, `SsaInst`, `SsaBlock`, `SsaBody`) +//! - [`lower`]: CFG-to-SSA lowering with Cytron phi insertion and dominator-tree rename +//! - [`const_prop`]: sparse conditional constant propagation with branch pruning +//! - [`copy_prop`]: copy and alias propagation +//! - [`dce`]: dead definition elimination +//! - [`type_facts`]: per-value type inference (`TypeKind`, `TypeFactResult`) +//! - [`heap`]: abstract heap for container element abstractions +//! - [`alias`]: base-variable alias groups from copy propagation + #[allow(dead_code)] // IR types, fields used by Display impl, tests, and downstream analyses pub mod alias; pub mod const_prop; diff --git a/src/ssa/param_points_to.rs b/src/ssa/param_points_to.rs index ecaeef0e..180d96d8 100644 --- a/src/ssa/param_points_to.rs +++ b/src/ssa/param_points_to.rs @@ -25,7 +25,7 @@ //! //! The analysis is **flow-insensitive** and **bounded**: it does not //! reason about path feasibility, and it stops adding edges once the -//! summary's [`MAX_ALIAS_EDGES`] cap is reached, the overflow flag is +//! summary's `MAX_ALIAS_EDGES` cap is reached, the overflow flag is //! the conservative fallback that callers honour. use std::collections::{HashMap, HashSet}; @@ -239,7 +239,7 @@ fn returns_fresh_allocation( /// `formal_param_count` bounds the parameter indices written to the /// summary: scoped lowering synthesises `Param` ops for module-level /// captures at indices beyond the formal arity, and those must not leak -/// into the summary (they would trip [`crate::summary::ssa_summary_fits_arity`]). +/// into the summary (they would trip `ssa_summary_fits_arity`). 
pub fn analyse_param_points_to( ssa: &SsaBody, param_info: &[(usize, String, SsaValue)], diff --git a/src/state/mod.rs b/src/state/mod.rs index 101611c2..2bd93177 100644 --- a/src/state/mod.rs +++ b/src/state/mod.rs @@ -1,4 +1,55 @@ -#![doc = include_str!(concat!(env!("OUT_DIR"), "/state.md"))] +//! State-model analysis: resource lifecycle and authentication state tracking. +//! +//! Runs a per-function state machine over the CFG to detect use-after-close, +//! double-close, resource leaks, and unauthenticated access to privileged +//! operations. +//! +//! Enabled by default. Disable via `scanner.enable_state_analysis = false`. +//! Runs in `--mode full` and `--mode taint`; skipped in AST-only mode. +//! +//! # Rule IDs +//! +//! | Rule ID | Severity | What it detects | +//! |---------|----------|-----------------| +//! | `state-use-after-close` | High | Operation on a resource after it was closed | +//! | `state-double-close` | Medium | Resource closed twice | +//! | `state-resource-leak` | Medium | Resource opened and never closed on any path | +//! | `state-resource-leak-possible` | Low | Resource closed on some paths but not others | +//! | `state-unauthed-access` | High | Web handler reaches privileged sink without an auth call | +//! +//! # Managed-resource suppression +//! +//! Language-specific cleanup patterns suppress leak findings automatically: +//! +//! | Pattern | Languages | +//! |---------|-----------| +//! | RAII / Drop | Rust (all leak findings suppressed except `alloc`/`dealloc`) | +//! | Smart pointers (`make_unique`, `make_shared`) | C++ | +//! | `defer f.Close()` | Go | +//! | `with open(f) as f:` | Python | +//! | try-with-resources | Java | +//! +//! # Tracked acquire/release pairs +//! +//! C/C++: `fopen`/`fclose`, `open`/`close`, `socket`/`close`, +//! `malloc`/`free`, `pthread_mutex_lock`/`pthread_mutex_unlock`, +//! `new`/`delete`. +//! +//! Rust: `File::open`/`close`, `TcpStream::connect`/`shutdown`, +//! 
mutex `lock`/`read`/`write`/`drop`. +//! +//! Java: stream/connection/socket constructors / `close`, `getConnection`/`close`. +//! +//! Go, Python, JavaScript, Ruby, PHP follow language-idiomatic equivalents. +//! +//! # Submodules +//! +//! - [`domain`]: state lattice (`ResourceState`, `AuthState`, `StateCell`) +//! - [`engine`]: generic forward transfer engine (`Transfer` trait, `run_forward`) +//! - [`facts`]: per-node state fact extraction +//! - [`lattice`]: lattice join/meet for state values +//! - [`symbol`]: resource symbol normalisation +//! - [`transfer`]: `DefaultTransfer` — the concrete resource-lifecycle transfer function pub mod domain; pub mod engine; diff --git a/src/summary/mod.rs b/src/summary/mod.rs index 9fb123e8..00082a12 100644 --- a/src/summary/mod.rs +++ b/src/summary/mod.rs @@ -1,3 +1,20 @@ +//! Per-function summaries for cross-file taint analysis. +//! +//! [`FuncSummary`] describes a function's boundary behaviour: which parameters +//! flow to sinks, which sources it reads, whether it propagates taint from +//! arguments to its return value, and what capabilities it strips. Summaries +//! are serialized to SQLite in pass 1 and merged into [`GlobalSummaries`] +//! before pass 2 begins. +//! +//! [`crate::summary::ssa_summary::SsaFuncSummary`] is a richer summary +//! derived from the SSA taint engine and takes precedence over [`FuncSummary`] +//! during call resolution. `GlobalSummaries::ssa_by_key` stores SSA summaries +//! keyed by [`FuncKey`]; `GlobalSummaries::by_name` holds the fallback +//! name-keyed map for cases where an exact key is not found. +//! +//! Same-name collisions across files are merged conservatively: capabilities +//! are unioned and booleans are OR-ed so no true positive is silently dropped. + pub mod points_to; pub mod ssa_summary; @@ -669,7 +686,7 @@ impl GlobalSummaries { /// drop one of the two summaries entirely. /// /// We therefore inspect the existing entry first. 
If the new summary - /// is not [`summaries_compatible`] with it, we mint a synthetic + /// is not `summaries_compatible` with it, we mint a synthetic /// disambig (top bit set to stay disjoint from byte-offset disambigs) /// and retry the insert under the fresh key so *both* functions are /// preserved. @@ -1065,7 +1082,7 @@ impl GlobalSummaries { /// Snapshot the SSA summaries for convergence detection. /// - /// Used alongside [`snapshot_caps`] in the SCC fixed-point loop so that + /// Used alongside [`Self::snapshot_caps`] in the SCC fixed-point loop so that /// SSA-only refinements (e.g. a `StripBits` transform appearing after a /// cross-file sanitizer is resolved) are not invisible to convergence. pub fn snapshot_ssa(&self) -> &HashMap { @@ -1090,7 +1107,7 @@ impl GlobalSummaries { /// 2. Otherwise, for each wildcard prefix in scope, try /// `(wildcard_prefix, name)` in the module index. If across all /// wildcards exactly one arity-filtered candidate appears → resolved. - /// 3. Otherwise fall through to [`resolve_callee_key_with_container`] + /// 3. Otherwise fall through to [`Self::resolve_callee_key_with_container`] /// with no `container_hint`, meaning only the existing namespace / /// arity disambiguation applies. /// @@ -1168,9 +1185,9 @@ impl GlobalSummaries { /// Resolve a bare (already-normalized) callee name to a [`FuncKey`]. /// - /// Thin wrapper around [`resolve_callee`] that constructs a minimal + /// Thin wrapper around [`Self::resolve_callee`] that constructs a minimal /// [`CalleeQuery`] with no qualified hints. Kept for call sites that - /// only hold a string callee and an arity; prefer [`resolve_callee`] + /// only hold a string callee and an arity; prefer [`Self::resolve_callee`] /// whenever receiver / qualifier / container information is available. pub fn resolve_callee_key( &self, @@ -1197,7 +1214,7 @@ impl GlobalSummaries { /// unchanged. 
`container_hint` is interpreted as a syntactic /// container qualifier (not an authoritative receiver type), so a /// miss is allowed to fall through to leaf-name lookup. New - /// callers should route through [`resolve_callee`] and classify + /// callers should route through [`Self::resolve_callee`] and classify /// their hint as `receiver_type` vs `namespace_qualifier` vs /// `receiver_var` so the resolver can apply the correct policy. pub fn resolve_callee_key_with_container( diff --git a/src/summary/points_to.rs b/src/summary/points_to.rs index 4bf24223..04c56798 100644 --- a/src/summary/points_to.rs +++ b/src/summary/points_to.rs @@ -22,7 +22,7 @@ //! Mutation is observable to the caller through its argument for `j`. //! * `Source(Param(i)) → Target(Return)`, the return value aliases //! parameter `i`'s heap identity. Adds heap-level precision on top of -//! the coarser [`TaintTransform::Identity`] view already carried in +//! the coarser [`crate::summary::ssa_summary::TaintTransform::Identity`] view already carried in //! [`crate::summary::ssa_summary::SsaFuncSummary::param_to_return`]. //! //! `MustAlias` is intentionally omitted, the ROI on @@ -105,7 +105,7 @@ pub const MAX_ALIAS_EDGES: usize = 8; #[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct PointsToSummary { /// Bounded edge list, deduped by `(source, target, kind)`. The - /// [`serde(default)`] attribute lets summaries pre-dating points-to + /// `#[serde(default)]` attribute lets summaries pre-dating points-to /// tracking deserialise cleanly (no edges). #[serde(default, skip_serializing_if = "SmallVec::is_empty")] pub edges: SmallVec<[AliasEdge; 4]>, @@ -193,7 +193,7 @@ impl PointsToSummary { } /// Parameter indices referenced by any edge in this summary. 
Used by - /// [`crate::summary::ssa_summary_fits_arity`] to confirm the summary + /// `ssa_summary_fits_arity` to confirm the summary /// does not reference a parameter beyond the key's declared arity /// (which would indicate a synthetic-param mis-attribution in /// extraction). diff --git a/src/summary/ssa_summary.rs b/src/summary/ssa_summary.rs index fa6b6cfc..007d87a6 100644 --- a/src/summary/ssa_summary.rs +++ b/src/summary/ssa_summary.rs @@ -165,7 +165,7 @@ pub struct SsaFuncSummary { /// [`crate::cfg::CallMeta::gate_filters`] carries more than one entry /// (e.g. `fetch` is both an `SSRF` gate on the URL arg and a /// `DATA_EXFIL` gate on the body arg), the multi-gate dispatch in - /// [`super::super::collect_block_events`] cap-narrows the event's + /// `collect_block_events` cap-narrows the event's /// `sink_caps` to the specific gate's `label_caps`. Each /// `(param_idx, label_caps)` entry records that this function's /// parameter `param_idx` flowed into a gated sink whose narrowed @@ -195,7 +195,7 @@ pub struct SsaFuncSummary { /// (e.g., function returns the same container it received as input). /// /// Populated by - /// [`crate::taint::ssa_transfer::summary_extract::extract_container_flow_summary`] + /// `extract_container_flow_summary` /// and applied at cross-file call sites to propagate the caller's /// points-to set for that argument onto the call's return SSA value. #[serde(default)] @@ -205,7 +205,7 @@ pub struct SsaFuncSummary { /// (e.g., `fn storeInto(value, arr) { arr.push(value); }` → `[(0, 1)]`). /// /// Populated by - /// [`crate::taint::ssa_transfer::summary_extract::extract_container_flow_summary`] + /// `extract_container_flow_summary` /// and applied at cross-file call sites by writing the caller's taint on /// the `src_param` argument into the heap objects pointed to by the /// `container_param` argument. @@ -254,7 +254,7 @@ pub struct SsaFuncSummary { /// Per-parameter return-path decomposition. 
/// /// When non-empty, supplies finer-grained per-path data than - /// [`Self::param_to_return`]. Each parameter maps to up to + /// `param_to_return`. Each parameter maps to up to /// [`MAX_RETURN_PATHS`] [`ReturnPathTransform`] entries, one per /// distinct path-predicate gate. Callers consult their own predicate /// state at the call site and apply only entries whose predicate is @@ -262,7 +262,7 @@ pub struct SsaFuncSummary { /// set into the effective call-site transform. /// /// Empty when the callee has a single return path, the aggregate - /// [`param_to_return`] is already precise, or when extraction + /// `param_to_return` is already precise, or when extraction /// could not derive per-return state (e.g. early-exit probes). #[serde(default, skip_serializing_if = "Vec::is_empty")] pub param_return_paths: Vec<(usize, SmallVec<[ReturnPathTransform; 2]>)>, @@ -338,7 +338,7 @@ pub struct SsaFuncSummary { /// control would not reach the post-call instruction. /// /// Populated by - /// [`crate::taint::ssa_transfer::summary_extract::extract_ssa_func_summary`] + /// `extract_ssa_func_summary` /// when a per-parameter probe shows the parameter's `var_name` in /// `validated_must` at every return block of the helper. Empty /// (the default) for helpers that do not validate any parameter. diff --git a/src/symbol/mod.rs b/src/symbol/mod.rs index b85b810d..2c447527 100644 --- a/src/symbol/mod.rs +++ b/src/symbol/mod.rs @@ -1,3 +1,15 @@ +//! Core language and function identity types. +//! +//! [`Lang`] is the 10-language enum (Rust, C, C++, Java, Go, PHP, Python, +//! Ruby, TypeScript, JavaScript). [`FuncKey`] is the canonical cross-file +//! function identity: name, arity, language, container (class/struct/module), +//! and an optional disambiguator for overloaded functions. +//! +//! [`FuncKey`] is the node type in the call graph and the lookup key in +//! [`crate::summary::GlobalSummaries`]. [`FuncKind`] distinguishes constructors, +//! 
methods, closures, and free functions so callers can apply language-specific +//! resolution heuristics. + use serde::{Deserialize, Serialize}; use std::fmt; diff --git a/src/symex/heap.rs b/src/symex/heap.rs index e95b95cf..f77e1dc0 100644 --- a/src/symex/heap.rs +++ b/src/symex/heap.rs @@ -102,7 +102,7 @@ pub struct FieldAccessRecord { /// Bounded symbolic heap tracking field-level symbolic values and taint. /// /// Cloned at fork points during multi-path exploration. Bounded -/// by [`MAX_HEAP_ENTRIES`] total entries and [`MAX_FIELDS_PER_OBJECT`] per +/// by `MAX_HEAP_ENTRIES` total entries and `MAX_FIELDS_PER_OBJECT` per /// object to prevent blowup on object-heavy code. #[derive(Clone, Debug)] pub struct SymbolicHeap { @@ -126,8 +126,8 @@ impl SymbolicHeap { /// Store a symbolic value into a heap field. /// - /// Bounded: silently drops the store if [`MAX_HEAP_ENTRIES`] or - /// [`MAX_FIELDS_PER_OBJECT`] would be exceeded. `Index(*)` entries are + /// Bounded: silently drops the store if `MAX_HEAP_ENTRIES` or + /// `MAX_FIELDS_PER_OBJECT` would be exceeded. `Index(*)` entries are /// bounded by [`MAX_TRACKED_INDICES`] per object; overflow collapses all /// indexed entries into `Elements`. pub fn store(&mut self, key: HeapKey, value: SymbolicValue, tainted: bool) { diff --git a/src/taint/backwards.rs b/src/taint/backwards.rs index 8294c0c1..5e2400de 100644 --- a/src/taint/backwards.rs +++ b/src/taint/backwards.rs @@ -149,7 +149,7 @@ pub struct BackwardsCtx<'a> { /// Language tag for source-kind heuristics (e.g. `os.getenv` hints). pub lang: Lang, /// Whole-program summaries: used to discover cross-file bodies and - /// [`SsaFuncSummary`] metadata at call instructions. + /// [`crate::summary::ssa_summary::SsaFuncSummary`] metadata at call instructions. pub global_summaries: Option<&'a GlobalSummaries>, /// Pre-lowered intra-file callee bodies keyed by [`FuncKey`]. Shared /// with the forward path so we do not lower functions twice. 
diff --git a/src/taint/mod.rs b/src/taint/mod.rs index 8e66afe2..ab63cf12 100644 --- a/src/taint/mod.rs +++ b/src/taint/mod.rs @@ -1,5 +1,78 @@ +//! Forward SSA taint analysis: the primary vulnerability detection engine. +//! +//! Tracks untrusted data from **sources** (where it enters the program) through +//! assignments and calls to **sinks** (where it is used dangerously). A finding +//! fires when the flow reaches a sink without passing a matching **sanitizer**. +//! +//! The engine is a monotone forward dataflow over a finite lattice with +//! guaranteed termination. It is flow-sensitive within a function and +//! interprocedural across files via persisted [`crate::summary::FuncSummary`] +//! and [`crate::summary::ssa_summary::SsaFuncSummary`] values. +//! +//! # Rule ID +//! +//! ```text +//! taint-unsanitised-flow (source <line>:<col>) +//! taint-data-exfiltration (source <line>:<col>) +//! ``` +//! +//! The source location is part of the ID so sibling paths to the same sink +//! get distinct IDs. Suppressions can target either the base ID or the full +//! string. +//! +//! # Capabilities +//! +//! Sources, sanitizers, and sinks are linked by [`crate::labels::Cap`] bits. +//! A sanitizer only clears the cap it declares; a sink only fires when the +//! remaining taint still carries its required cap. +//! +//! | Cap | Typical source | Typical sanitizer | Typical sink | +//! |-----|----------------|-------------------|--------------| +//! | `env_var` | `env::var`, `getenv`, `process.env` | | | +//! | `html_escape` | | `html.escape`, `DOMPurify.sanitize` | `innerHTML`, `document.write` | +//! | `shell_escape` | | `shlex.quote`, `shell_escape::escape` | `system`, `Command::new` | +//! | `url_encode` | | `encodeURIComponent` | HTTP client URL arg | +//! | `file_io` | | `realpath`, `filepath.Clean` | `open`, `fs::read_to_string` | +//! | `sql_query` | | parameterized query binders | `cursor.execute`, `db.query` | +//! | `deserialize` | | | `pickle.loads`, `Marshal.load` | +//!
| `ssrf` | | URL-prefix locks | `fetch` URL arg, outbound HTTP | +//! | `code_exec` | | | `eval`, `exec`, `system` | +//! | `crypto` | | | weak-algorithm constructors | +//! | `data_exfil` | cookies, headers, env, db rows (Sensitive tier) | | `fetch` body/json/headers | +//! +//! Sources typically carry `Cap::all()` so they match any sink. +//! +//! # Source sensitivity +//! +//! Each source carries a [`crate::labels::SourceKind`] and a derived tier: +//! +//! - `Plain` — direct attacker input (`UserInput`): request bodies, query +//! strings, argv, stdin. +//! - `Sensitive` — operator-bound state: cookies, headers, env, files, DB rows, +//! caught exceptions. +//! +//! `Cap::DATA_EXFIL` only fires on `Sensitive`-tier sources. Plain user input +//! flowing into an outbound request body is suppressed — the canonical false +//! positive for API gateways that proxy `req.body`. +//! +//! # Confidence signals +//! +//! Higher confidence: source and sink both present in evidence, `source_kind: +//! user_input`, `path_validated: false`, symbolic witness produced. +//! +//! Lower confidence: path-validated taint, source is a database read or +//! internal file, engine note `ForwardBailed` / `PathWidened`. +//! +//! # Submodules +//! +//! - [`domain`]: taint lattice types (`VarTaint`, `TaintOrigin`, `SmallBitSet`, +//! `PredicateSummary`) +//! - [`ssa_transfer`]: SSA taint transfer functions and the forward worklist +//! (`SsaTaintState`, `SsaTaintTransfer`, `run_ssa_taint`) +//! - [`path_state`]: predicate classification for branch-sensitive propagation +//! 
- [`backwards`]: demand-driven backwards walk from sinks (off by default) + #![allow(clippy::collapsible_if, clippy::too_many_arguments)] -#![doc = include_str!(concat!(env!("OUT_DIR"), "/taint.md"))] pub mod backwards; pub mod domain; diff --git a/src/taint/path_state.rs b/src/taint/path_state.rs index 37f62260..87ead833 100644 --- a/src/taint/path_state.rs +++ b/src/taint/path_state.rs @@ -32,7 +32,7 @@ pub enum PredicateKind { ShellMetaValidated, /// Bounded-length rejection: `x.len() > N` / `x.length < N` with N >= 2. /// - /// Commonly paired with [`ShellMetaValidated`] in OR-chain rejection + /// Commonly paired with `ShellMetaValidated` in OR-chain rejection /// idioms (`if x.len() > MAX || x.contains(";") { reject }`). Counts as /// a dominator guard for `cfg-unguarded-sink` purposes, but intentionally /// does **not** mark variables as validated, the rejection direction is diff --git a/src/taint/ssa_transfer/mod.rs b/src/taint/ssa_transfer/mod.rs index a4b3d7e9..969687c9 100644 --- a/src/taint/ssa_transfer/mod.rs +++ b/src/taint/ssa_transfer/mod.rs @@ -71,14 +71,14 @@ pub struct SsaTaintTransfer<'a> { /// The [`BodyId`] of the body currently being analysed. Used as the /// owning scope when writing seed entries that leave this body /// (e.g. [`extract_ssa_exit_state`]) and as the identity recorded on - /// engine notes. Defaults to [`BodyId(0)`] (top-level) for inline + /// engine notes. Defaults to `BodyId(0)` (top-level) for inline /// probes and unit tests that analyse a single synthetic body. pub owner_body_id: BodyId, /// The [`BodyId`] of this body's lexical parent, if any. Drives the /// `Param`-op reader's lookup into [`Self::global_seed`]: we read /// from the parent's scope first (the seed entries produced by /// [`extract_ssa_exit_state`] on the parent body), then fall back to - /// [`BodyId(0)`] to pick up JS/TS two-level re-keyed entries (see + /// `BodyId(0)` to pick up JS/TS two-level re-keyed entries (see /// [`filter_seed_to_toplevel`]). 
`None` for the top-level body and /// for probes with no surrounding scope. pub parent_body_id: Option<BodyId>, @@ -176,7 +176,7 @@ pub struct SsaTaintTransfer<'a> { /// to detect handler-style flows that have no registered caller. pub auto_seed_handler_params: bool, /// Cross-file callee bodies sourced from - /// [`GlobalSummaries::bodies_iter`]. Populated in pass 2 to enable + /// [`GlobalSummaries`]. Populated in pass 2 to enable /// context-sensitive inline re-analysis across file boundaries the /// same way `callee_bodies` enables it intra-file. `None` preserves /// non-cross-file behaviour for unit tests and non-cross-file diff --git a/src/taint/ssa_transfer/state.rs b/src/taint/ssa_transfer/state.rs index 1d0f5350..0670ad10 100644 --- a/src/taint/ssa_transfer/state.rs +++ b/src/taint/ssa_transfer/state.rs @@ -366,7 +366,7 @@ pub struct SsaTaintState { /// = false`). pub abstract_state: Option, /// per-heap-field taint cells, keyed by - /// `(parent_loc, field)`. Sorted by [`FieldTaintKey`] for O(n) + /// `(parent_loc, field)`. Sorted by `FieldTaintKey` for O(n) /// merge-join. Populated only when the body's /// [`crate::pointer::PointsToFacts`] is available /// (`NYX_POINTER_ANALYSIS=1`); empty otherwise so the lattice join @@ -375,7 +375,7 @@ pub struct SsaTaintState { /// them. Cross-call propagation lands during lowering via the /// field-granularity `PointsToSummary`. /// - /// Cell shape: [`FieldCell`] carries `taint` plus + /// Cell shape: `FieldCell` carries `taint` plus /// `validated_must` / `validated_may` flags so validation flows /// through abstract field / element identity. pub field_taint: SmallVec<[(FieldTaintKey, FieldCell); 4]>, @@ -405,7 +405,7 @@ impl SsaTaintState { /// read the field cell at `key`. Returns `None` /// when no cell has been recorded (caller should treat as - /// untainted). O(log n) on the sorted [`field_taint`] list. + /// untainted). O(log n) on the sorted `field_taint` list.
pub fn get_field(&self, key: FieldTaintKey) -> Option<&FieldCell> { self.field_taint .binary_search_by_key(&key, |(k, _)| *k) diff --git a/src/taint/ssa_transfer/summary_extract.rs b/src/taint/ssa_transfer/summary_extract.rs index 2a995d57..592b3821 100644 --- a/src/taint/ssa_transfer/summary_extract.rs +++ b/src/taint/ssa_transfer/summary_extract.rs @@ -33,7 +33,7 @@ const MAX_PROBE_PARAMS: usize = 8; /// Extract a precise per-parameter `SsaFuncSummary` from an already-lowered SSA body. /// -/// For each parameter (up to [`MAX_PROBE_PARAMS`]), runs a taint probe by seeding +/// For each parameter (up to `MAX_PROBE_PARAMS`), runs a taint probe by seeding /// that parameter with `Cap::all()` via `global_seed` and observing what caps /// survive to return positions and which sinks fire. A final probe with no params /// tainted detects intrinsic source caps. diff --git a/src/utils/config.rs b/src/utils/config.rs index 1709a140..e712144b 100644 --- a/src/utils/config.rs +++ b/src/utils/config.rs @@ -713,6 +713,22 @@ fn builtin_profile(name: &str) -> Option { }) } +/// Top-level scanner configuration. +/// +/// Loaded from `nyx.conf` (TOML) via [`Config::load`], or constructed in +/// code for embedded use. [`Config::default`] gives conservative defaults: +/// no symlink following, no hidden files, gitignore respected, 10 s parse +/// timeout, all analysis passes on. 
+/// +/// Config sections mirror `nyx.conf` sections: +/// - [`scanner`](Config::scanner): what files to scan, which analysis passes +/// to enable, severity floor +/// - [`output`](Config::output): format, ranking, LOW-finding budgets +/// - [`analysis`](Config::analysis): per-language rules, engine-pass toggles +/// - [`performance`](Config::performance): thread count, depth limit, batch +/// size +/// - [`database`](Config::database): incremental index settings +/// - [`detectors`](Config::detectors): per-detector sensitivity knobs #[derive(Debug, Serialize, Deserialize, Clone)] #[serde(default)] #[derive(Default)] diff --git a/src/utils/mod.rs b/src/utils/mod.rs index d1c7396a..0fe53e91 100644 --- a/src/utils/mod.rs +++ b/src/utils/mod.rs @@ -1,3 +1,16 @@ +//! Shared utilities and configuration. +//! +//! Re-exports [`Config`], [`AnalysisOptions`], and [`DetectorOptions`] from +//! their submodules. [`Config`] is loaded from `nyx.conf` and passed through +//! the top-level call stack. [`AnalysisOptions`] is installed once per process +//! via an `OnceLock` and read back via [`analysis_options::current`] from deep +//! inside the analysis pipeline without threading it through every call frame. +//! +//! Other submodules: `path` (root-relative path utilities and traversal guards), +//! `project` (framework detection, project metadata), `query_cache` (cached +//! tree-sitter query compilation), `snippet` (source snippet extraction for +//! finding locations). + pub mod analysis_options; pub mod config; pub mod detector_options; diff --git a/src/walk.rs b/src/walk.rs index 209692ad..abcbde7b 100644 --- a/src/walk.rs +++ b/src/walk.rs @@ -1,3 +1,13 @@ +//! Filesystem walker with batched path delivery. +//! +//! Builds an [`ignore`]-crate [`WalkBuilder`] from the config (respecting +//! `.gitignore`, excluded directories, and excluded extensions), then delivers +//! discovered paths to the analysis pipeline in batches over a crossbeam channel. +//! 
Batching amortizes channel overhead for large trees. +//! +//! All paths are checked via [`crate::utils::path::path_stays_within_root`] +//! before entering a batch, preventing traversal outside the scan root. + use crate::utils::Config; use crate::utils::path::path_stays_within_root; use crossbeam_channel::{Receiver, Sender, bounded};