docs: Enhance module documentation across various files for clarity a… (#62)

* docs: Enhance module documentation across various files for clarity and completeness

* fix: Remove unnecessary blank line in build.rs for cleaner code

* docs: Update documentation to improve clarity and consistency in code comments
This commit is contained in:
Eli Peter 2026-05-02 17:46:45 -04:00 committed by GitHub
parent 40995e45e7
commit 1f2bfe76c1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
44 changed files with 721 additions and 366 deletions

View file

@ -153,6 +153,22 @@ jobs:
exit 1
fi
rustdoc:
name: rustdoc
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v6
- uses: actions-rust-lang/setup-rust-toolchain@v1
with:
toolchain: stable
cache: true
- name: Check rustdoc links
env:
RUSTDOCFLAGS: "-D warnings"
run: cargo doc --workspace --no-deps --all-features
rust-beta-build:
name: rust-beta-build
runs-on: ubuntu-latest

291
build.rs
View file

@ -1,9 +1,7 @@
use std::path::{Path, PathBuf};
use std::path::Path;
use std::process::Command;
fn main() {
render_docs_for_rustdoc();
// Only relevant when the serve feature is active
if std::env::var("CARGO_FEATURE_SERVE").is_err() {
return;
@ -58,293 +56,6 @@ fn main() {
}
}
// ---------------------------------------------------------------------------
// Rustdoc / docs.rs: render docs/*.md into $OUT_DIR with relative .md links
// rewritten to absolute github.com/elicpeter/nyx URLs so they resolve when the
// markdown is embedded in rustdoc via #![doc = include_str!(...)].
//
// Source of truth stays in docs/. Files that don't exist (published-crate
// builds where docs/ wasn't packaged) fall back to a one-line stub so rustdoc
// still compiles.
// ---------------------------------------------------------------------------
/// Absolute URL prefix used when turning docs-relative markdown paths into
/// links: rewritten `.md` links and the missing-file fallback stub are both
/// built as `GH_DOCS_BASE` + `/` + a path relative to `docs/`.
const GH_DOCS_BASE: &str = "https://github.com/elicpeter/nyx/blob/master/docs";
/// One markdown document to render for rustdoc: pairs a source file under
/// `docs/` with the filename its rendered copy is written to in `$OUT_DIR`.
struct DocSpec {
/// Path under docs/, e.g. "how-it-works.md" or "detectors/taint.md".
/// Also used to resolve relative links found inside the document.
src: &'static str,
/// Output filename in $OUT_DIR.
out: &'static str,
}
/// Every document rendered by `render_docs_for_rustdoc`. The `out` names are
/// flat (no subdirectories) and are the names modules embed via
/// `include_str!(concat!(env!("OUT_DIR"), "/<out>"))`, e.g. `auth_analysis.md`.
const DOC_SPECS: &[DocSpec] = &[
DocSpec {
src: "how-it-works.md",
out: "lib_intro.md",
},
DocSpec {
src: "detectors/taint.md",
out: "taint.md",
},
DocSpec {
src: "detectors/cfg.md",
out: "cfg_analysis.md",
},
DocSpec {
src: "detectors/state.md",
out: "state.md",
},
DocSpec {
src: "detectors/patterns.md",
out: "patterns.md",
},
DocSpec {
src: "auth.md",
out: "auth_analysis.md",
},
];
/// Writes a rustdoc-ready copy of every `DOC_SPECS` entry into `$OUT_DIR`.
///
/// For each entry a `cargo:rerun-if-changed` line is printed for the source
/// file, the markdown is passed through `rewrite_doc_links`, and the result
/// is written under the entry's `out` name. When the source cannot be read, a
/// one-line stub linking to the document on GitHub is written instead.
/// A failed write is reported as a `cargo:warning` rather than aborting.
/// Does nothing when `OUT_DIR` is not set.
fn render_docs_for_rustdoc() {
    let out_root = match std::env::var("OUT_DIR") {
        Ok(dir) => PathBuf::from(dir),
        Err(_) => return,
    };
    let docs_root = Path::new("docs");

    for spec in DOC_SPECS {
        let source = docs_root.join(spec.src);
        println!("cargo:rerun-if-changed=docs/{}", spec.src);
        let target = out_root.join(spec.out);

        // Unreadable source: fall back to a stub pointing at the hosted copy.
        let markdown = std::fs::read_to_string(&source)
            .map(|raw| rewrite_doc_links(&raw, spec.src))
            .unwrap_or_else(|_| {
                format!(
                    "See [`{base}/{src}`]({base}/{src}).\n",
                    base = GH_DOCS_BASE,
                    src = spec.src,
                )
            });

        if let Err(err) = std::fs::write(&target, markdown) {
            println!(
                "cargo:warning=failed to write rendered doc {}: {}",
                target.display(),
                err
            );
        }
    }
}
/// Render markdown for embedding in rustdoc.
///
/// 1. Rewrites relative `.md` links to absolute github.com URLs:
///    - inline links: `](path.md)` and `](path.md#anchor)`
///    - reference defs: `[id]: path.md`
/// 2. Labels unmarked fenced code blocks as `text` so rustdoc does not try
///    to compile them as Rust (and choke on Unicode like `→`).
/// 3. Annotates `rust` fences with `,ignore` so rustdoc doesn't try to
///    compile or run prose-level snippets as doctests. GitHub still
///    highlights them as Rust because it keys off the first token.
///
/// Fences are tracked by backtick-run length: a fence opened with N
/// backticks (N >= 3) is closed only by a line consisting of at least N
/// backticks and nothing else, so a longer fence can wrap a shorter one.
/// When an opening fence is relabelled, its original backtick run is kept.
///
/// Lines inside a fence are copied verbatim (no link rewriting). Link
/// rewriting outside fences is delegated to `rewrite_links_in_line`.
///
/// `source_rel` is the document's path relative to `docs/`; its parent
/// directory is the base for resolving relative links.
fn rewrite_doc_links(content: &str, source_rel: &str) -> String {
    let source_dir = Path::new(source_rel)
        .parent()
        .map(|p| p.to_string_lossy().into_owned())
        .unwrap_or_default();
    let mut out = String::with_capacity(content.len() + 256);
    // Backtick count of the fence we are currently inside, if any.
    let mut open_fence: Option<usize> = None;

    for line in content.split_inclusive('\n') {
        let body = line.strip_suffix('\n').unwrap_or(line);
        let trimmed = body.trim_start();
        // Length of the leading backtick run; '`' is ASCII so this is also a
        // valid byte offset into `trimmed`.
        let ticks = trimmed.len() - trimmed.trim_start_matches('`').len();
        let info = trimmed[ticks..].trim();

        if let Some(open_len) = open_fence {
            // A closing fence is at least as long as the opener and carries
            // no info string; anything else is fence content.
            if ticks >= open_len && info.is_empty() {
                open_fence = None;
            }
            out.push_str(line);
            continue;
        }

        if ticks >= 3 {
            open_fence = Some(ticks);
            let relabel = if info.is_empty() {
                Some("text")
            } else if is_rust_fence_needing_ignore(info) {
                Some("rust,ignore")
            } else {
                None
            };
            match relabel {
                Some(label) => {
                    // Keep the indent and the original backtick run so the
                    // existing closing fence still matches.
                    let indent_len = body.len() - trimmed.len();
                    out.push_str(&body[..indent_len + ticks]);
                    out.push_str(label);
                    if line.ends_with('\n') {
                        out.push('\n');
                    }
                }
                None => out.push_str(line),
            }
            continue;
        }

        rewrite_links_in_line(body, &source_dir, &mut out);
        if line.ends_with('\n') {
            out.push('\n');
        }
    }
    out
}
/// Copies `line` into `out`, passing link targets through `maybe_rewrite_url`.
///
/// Two link shapes are recognised wherever they occur in the line:
/// - inline links `](URL)`: the target runs up to the next `)`;
/// - reference definitions `]: URL`: the target runs up to the next space.
///
/// Everything else is copied through unchanged, one codepoint at a time.
/// `line` must not include the trailing newline; `source_dir` is the linking
/// document's directory relative to `docs/`.
fn rewrite_links_in_line(line: &str, source_dir: &str, out: &mut String) {
    let mut rest = line;
    while !rest.is_empty() {
        if let Some(after) = rest.strip_prefix("](") {
            // Inline link. The closing `)` (if any) stays in `rest` and is
            // copied by the next iteration.
            out.push_str("](");
            let url_len = after.find(')').unwrap_or(after.len());
            let (url, tail) = after.split_at(url_len);
            out.push_str(&maybe_rewrite_url(url, source_dir));
            rest = tail;
        } else if let Some(after) = rest.strip_prefix("]: ") {
            // Reference definition. Anything after the target (e.g. a title)
            // is copied by later iterations.
            out.push_str("]: ");
            let url_len = after.find(' ').unwrap_or(after.len());
            let (url, tail) = after.split_at(url_len);
            out.push_str(&maybe_rewrite_url(url, source_dir));
            rest = tail;
        } else {
            // Copy one whole codepoint so multi-byte characters are never
            // split; `rest` always starts on a char boundary.
            let width = utf8_seq_len(rest.as_bytes()[0]).min(rest.len());
            let (head, tail) = rest.split_at(width);
            out.push_str(head);
            rest = tail;
        }
    }
}
/// Decides whether a fence info string should be rewritten to `rust,ignore`.
///
/// `lang` is the text after the opening backticks, split on commas with each
/// tag trimmed. Returns `true` only when the first tag is `rust`
/// (ASCII case-insensitive) and none of the remaining tags is `ignore`,
/// `no_run`, `compile_fail` or `should_panic` (also case-insensitive); fences
/// that already carry one of those tags are left as written.
fn is_rust_fence_needing_ignore(lang: &str) -> bool {
    const EXPLICIT_TAGS: [&str; 4] = ["ignore", "no_run", "compile_fail", "should_panic"];

    let mut tags = lang.split(',').map(str::trim);
    match tags.next() {
        Some(first) if first.eq_ignore_ascii_case("rust") => {
            !tags.any(|tag| EXPLICIT_TAGS.iter().any(|known| tag.eq_ignore_ascii_case(known)))
        }
        _ => false,
    }
}
/// Returns the byte length of the UTF-8 sequence introduced by `lead`.
///
/// ASCII bytes and stray continuation bytes (anything below `0xC0`) report a
/// length of 1 so a scanning caller always makes progress.
fn utf8_seq_len(lead: u8) -> usize {
    match lead {
        0x00..=0xBF => 1,
        0xC0..=0xDF => 2,
        0xE0..=0xEF => 3,
        _ => 4,
    }
}
/// Turns a relative markdown link target into an absolute GitHub URL.
///
/// `url` is the raw link target; `source_dir` is the linking document's
/// directory relative to `docs/` (empty for documents directly in `docs/`).
///
/// Returned unchanged:
/// - empty targets and pure anchors (`#section`);
/// - targets that already have a scheme (`https:`, `mailto:`, ...) or are
///   protocol-relative (`//host/...`);
/// - targets whose path part does not end in `.md`.
///
/// Otherwise the path is resolved to a repository-root-relative path and any
/// `#anchor` is re-attached:
/// - a leading `/` is taken as repository-root-relative, matching how GitHub
///   renders such links;
/// - anything else is resolved against `docs/<source_dir>`.
///
/// Targets that land inside `docs/` are emitted under `GH_DOCS_BASE`. Targets
/// that climb out of `docs/` (e.g. `../README.md`) are emitted under the
/// repository root rather than being wrongly pinned inside `docs/`.
fn maybe_rewrite_url(url: &str, source_dir: &str) -> String {
    if url.is_empty() || has_scheme(url) || url.starts_with("//") || url.starts_with('#') {
        return url.to_string();
    }

    // Split off optional anchor.
    let (path, anchor) = match url.find('#') {
        Some(p) => (&url[..p], &url[p..]),
        None => (url, ""),
    };

    // Only rewrite if the path looks like a markdown file.
    if !path.ends_with(".md") {
        return url.to_string();
    }

    // Resolve to a path relative to the repository root. `normalise_path`
    // skips empty segments, so an empty `source_dir` needs no special case.
    let rooted = match path.strip_prefix('/') {
        Some(from_root) => normalise_path(from_root),
        None => normalise_path(&format!("docs/{}/{}", source_dir, path)),
    };

    match rooted.strip_prefix("docs/") {
        Some(in_docs) => format!("{}/{}{}", GH_DOCS_BASE, in_docs, anchor),
        None => {
            // The link escaped docs/: point at the repository root instead.
            let repo_base = GH_DOCS_BASE.strip_suffix("/docs").unwrap_or(GH_DOCS_BASE);
            format!("{}/{}{}", repo_base, rooted, anchor)
        }
    }
}
/// Reports whether `url` begins with a URI scheme followed by `:`.
///
/// Follows RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ).
/// Only the text before the first `:` is examined; a URL without a colon, or
/// with any other character before the first colon, has no scheme.
fn has_scheme(url: &str) -> bool {
    let Some((candidate, _)) = url.split_once(':') else {
        return false;
    };
    let mut chars = candidate.chars();
    let starts_with_letter = matches!(chars.next(), Some(c) if c.is_ascii_alphabetic());
    starts_with_letter && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
}
/// Collapses a `/`-separated path lexically.
///
/// Empty segments and `.` are dropped, and each `..` removes the segment
/// before it. A `..` with nothing left to remove is discarded, so the result
/// never starts with `..` or `/`.
fn normalise_path(path: &str) -> String {
    let kept = path
        .split('/')
        .fold(Vec::<&str>::new(), |mut kept, segment| {
            match segment {
                "" | "." => {}
                ".." => {
                    kept.pop();
                }
                name => kept.push(name),
            }
            kept
        });
    kept.join("/")
}
fn emit_placeholder_and_warn(dist_dir: &Path) {
// Create minimal placeholder files so compilation succeeds
std::fs::create_dir_all(dist_dir).ok();

View file

@ -214,7 +214,7 @@ impl PathFact {
/// Accepts either of two structural invariants:
///
/// * `dotdot = No && absolute = No` — the relative-and-`..`-free
/// shape recognised by [`is_path_safe`]. Cannot escape to an
/// shape recognised by `is_path_safe`. Cannot escape to an
/// attacker-controlled absolute location.
/// * `dotdot = No && prefix_lock.is_some()` — a canonicalised path
/// (typically `File.expand_path` / `realpath` / `fs::canonicalize`)
@ -866,7 +866,7 @@ pub fn is_structural_variant_ctor_for_lang(lang: crate::symbol::Lang, callee: &s
/// [`crate::ssa::type_facts::peel_identity_suffix`]. Other languages do
/// not (yet) have an equivalent grammar-driven recogniser; the rejection
/// arm in their fixtures returns either an empty string literal (handled
/// by [`SsaOp::Const`] seeding) or `None`/`null`/`nil` (handled by the
/// by `SsaOp::Const` seeding) or `None`/`null`/`nil` (handled by the
/// non-data-return skip).
pub fn is_zero_arg_allocator_for_lang(lang: crate::symbol::Lang, _callee: &str) -> bool {
// Currently a no-op for non-Rust languages: rejection-arm constructors

View file

@ -1,3 +1,24 @@
//! Tree-sitter parsing and two-pass analysis for all supported languages.
//!
//! The core type is `ParsedSource`, a thin wrapper around a parsed tree-sitter
//! tree that carries the source bytes and language. Parsing reuses a thread-local
//! [`tree_sitter::Parser`] so each worker thread keeps one live parser instance.
//!
//! ## Two-pass pipeline
//!
//! **Pass 1** (`extract_summaries_from_file`): builds the CFG, lowers to SSA,
//! and extracts a [`crate::summary::FuncSummary`] per function. Summaries
//! describe boundary behaviour: which arguments flow to sinks, which sources
//! the function reads, what taint it strips, and what it returns.
//!
//! **Pass 2** (`run_rules_on_file`): reanalyses each file with the merged
//! [`crate::summary::GlobalSummaries`] from pass 1. The taint engine runs a
//! forward dataflow worklist over SSA, resolving cross-file calls via summaries.
//!
//! Parse timeouts are tracked per-thread via [`take_last_parse_timeout_ms`]
//! so callers can surface the event as an informational diagnostic instead
//! of silently skipping the file.
#![allow(clippy::only_used_in_recursion, clippy::type_complexity)]
use crate::auth_analysis;
@ -39,7 +60,7 @@ thread_local! {
}
/// Consume and return the most recent parse-timeout event on this thread
/// (set by [`ParsedSource::try_new`]). Used to lift the event into a
/// (set by `ParsedSource::try_new`). Used to lift the event into a
/// synthetic [`Diag`] carrying an [`crate::engine_notes::EngineNote::ParseTimeout`].
pub fn take_last_parse_timeout_ms() -> Option<u64> {
LAST_PARSE_TIMEOUT_MS.with(|c| c.take())
@ -647,7 +668,7 @@ fn build_taint_diag(
}
/// Resolve a file extension to a language slug (e.g. `"rust"`,
/// `"javascript"`). Public façade over [`lang_for_path`] for callers
/// `"javascript"`). Public façade over `lang_for_path` for callers
/// that only need the slug, used by the debug API to look up
/// per-language rule enablement without re-parsing the file.
pub fn lang_slug_for_path(path: &Path) -> Option<&'static str> {
@ -3985,7 +4006,7 @@ pub struct FusedResult {
///
/// When `global_summaries` is `None`, the taint engine runs with local
/// context only (equivalent to pass 1 + partial pass 2). A second call
/// to [`run_taint_only`] can refine findings with the full cross-file view
/// to `run_taint_only` can refine findings with the full cross-file view
/// without re-parsing or re-building the CFG.
pub fn analyse_file_fused(
bytes: &[u8],

View file

@ -2793,7 +2793,7 @@ fn function_params(node: Node<'_>, bytes: &[u8]) -> Vec<String> {
params
}
/// Variant of [`function_params`] that always includes id-like typed
/// Variant of `function_params` that always includes id-like typed
/// Python params (`dag_id: str`, `dag_run_id: str`). Used by
/// `attach_route_handler` to populate `unit.params` for RouteHandler
/// units so middleware-injected auth checks (FastAPI
@ -2802,7 +2802,7 @@ fn function_params(node: Node<'_>, bytes: &[u8]) -> Vec<String> {
/// the id-shaped ones that are *the* primary user-controlled data on
/// REST routes.
///
/// The id-like filter in [`collect_param_names`] exists to keep
/// The id-like filter in `collect_param_names` exists to keep
/// internal helper signatures (`def f(release_id: int, project:
/// Project)`) from passing `unit_has_user_input_evidence`'s param
/// heuristic, which would over-fire `missing_ownership_check`. Route

View file

@ -1,4 +1,60 @@
#![doc = include_str!(concat!(env!("OUT_DIR"), "/auth_analysis.md"))]
//! Missing authorization and ownership checks (Rust-primary).
//!
//! Detects request handlers that reach a privileged operation taking a scoped
//! identifier (`*_id`, row reference, scoped resource) without a preceding
//! ownership or membership check.
//!
//! Other languages have rule scaffolding (`py.auth.*`, `js.auth.*`,
//! `rb.auth.*`, `go.auth.*`, `java.auth.*`) but only Rust has benchmark
//! corpus coverage and validated precision. Treat non-Rust findings as preview.
//!
//! # Rule IDs
//!
//! | Rule ID | Variant |
//! |---------|---------|
//! | `rs.auth.missing_ownership_check` | Standalone structural analyser (default on) |
//! | `rs.auth.missing_ownership_check.taint` | SSA/taint variant via `Cap::UNAUTHORIZED_ID` (default off) |
//!
//! Enable the taint variant via `scanner.enable_auth_as_taint = true` in
//! `nyx.conf`. Run both together when enabled; if both fire for the same site,
//! treat them as the same finding.
//!
//! # What counts as authorization
//!
//! The analyser accepts any of:
//! - A call to a recognised authorization helper (`check_ownership`,
//! `has_permission`, `require_*_member`, etc.; configurable per project).
//! - An ownership-equality check on a row reference
//! (`if owner_id != user.id { return 403 }`).
//! - A self-actor reference from a typed extractor param (`Extension<Session>`,
//! `CurrentUser`, etc.) combined with `user.id` / `user.user_id` use.
//! - A typed policy-guard wrapper (`GuardedData<ActionPolicy<X>, _>`);
//! configured via `policy_guard_names`.
//! - A SQL query joining through an ACL table or filtering by `user_id`
//! predicate (detected without a SQL parser via [`sql_semantics`]).
//! - A helper-summary lift: a called function whose body contains a
//! `require_*_member` call (fixed-point up to 4 iterations).
//!
//! # Sink classification
//!
//! | Class | Examples | Treatment |
//! |-------|---------|-----------|
//! | `InMemoryLocal` | `map.insert`, `vec.push` on local | Never a sink |
//! | `RealtimePublish` | `realtime.publish_to_group` | Sink unless channel scope is ownership-checked |
//! | `OutboundNetwork` | `http.post`, `reqwest::Client::post` | Sink unless sanitizer is on the path |
//! | `CacheCrossTenant` | `redis.set` with scoped keys | Sink unless tenant is checked |
//! | `DbMutation` | `db.insert`, `repo.save` with scoped IDs | Sink unless ownership is established |
//! | `DbCrossTenantRead` | `db.query` returning tenant-scoped rows | Sink unless ACL-join or tenant predicate is present |
//!
//! # Submodules
//!
//! - [`checks`]: ownership-check recognition, actor-context extraction,
//! row-field variable tracking
//! - [`config`]: per-language auth rule defaults and config merging
//! - [`extract`]: handler detection, scoped-ID extraction, summary lifting
//! - [`model`]: `AnalysisUnit`, `AuthCheck`, `SensitiveOperation`, `SinkClass`
//! - [`sql_semantics`]: ACL-join and `user_id`-predicate detection without a
//! SQL parser
pub mod checks;
pub mod config;

View file

@ -253,7 +253,7 @@ pub struct AnalysisUnit {
/// Function parameter names whose static type maps to a
/// payload-incompatible scalar ([`crate::ssa::type_facts::TypeKind::Int`]
/// or [`crate::ssa::type_facts::TypeKind::Bool`]). Populated
/// per-file by [`super::apply_typed_bounded_params`] using the
/// per-file by `apply_typed_bounded_params` using the
/// SSA-derived `VarTypes` map. Consulted by
/// `is_typed_bounded_subject` so parameters like Spring `Long
/// userId`, Axum `Path<i64>`, or FastAPI `user_id: int` are not
@ -265,7 +265,7 @@ pub struct AnalysisUnit {
/// declared type is a payload-incompatible scalar. Map key is the
/// parameter name (e.g. `dto`), value is the list of field names
/// (e.g. `["age", "count"]`). Populated by
/// [`super::apply_typed_bounded_params`] only when the parameter
/// `apply_typed_bounded_params` only when the parameter
/// itself was recognised as a typed extractor; bare parameters
/// with no framework gate never lift their fields.
pub typed_bounded_dto_fields: HashMap<String, Vec<String>>,

View file

@ -1,3 +1,15 @@
//! Whole-program call graph built from pass-1 function summaries.
//!
//! Nodes are [`FuncKey`]s (one per function definition across all files).
//! Edges represent call-site relationships resolved after pass 1 completes.
//! Unresolved and ambiguous callees are tracked separately so they can be
//! surfaced in diagnostics without blocking analysis.
//!
//! [`CallGraphAnalysis`] computes SCCs and topological order. The scanner
//! uses topo order in pass 2 so callees are analysed before their callers,
//! and iterates over SCC groups to a fixed point for mutually recursive
//! functions.
use crate::interop::InteropEdge;
use crate::rust_resolve::RustUseMap;
use crate::summary::{CalleeQuery, CalleeResolution, GlobalSummaries};
@ -55,7 +67,7 @@ pub struct CallGraph {
pub struct CallGraphAnalysis {
/// Strongly connected components.
pub sccs: Vec<Vec<NodeIndex>>,
/// Maps each `NodeIndex` to its SCC index in [`sccs`].
/// Maps each `NodeIndex` to its SCC index in `sccs`.
#[allow(dead_code)] // used for future topo-ordered taint propagation
pub node_to_scc: HashMap<NodeIndex, usize>,
/// SCC indices in **callee-first** (leaves-first) order.
@ -160,7 +172,7 @@ pub(crate) fn callee_container_hint(raw: &str) -> &str {
/// Per-language `(container, method_name)` → candidate [`FuncKey`] index.
///
/// Built once per call-graph construction over every merged
/// [`FuncSummary`]. Used by edge insertion to restrict an indirect method
/// [`crate::summary::FuncSummary`]. Used by edge insertion to restrict an indirect method
/// call (`receiver.method(...)`) to only those targets whose defining
/// container matches the receiver's static type. Without a container
/// hint the index falls back to the bare-name list, matching today's
@ -272,7 +284,7 @@ impl ClassMethodIndex {
///
/// Covers Java `extends`/`implements`, Rust `impl Trait for Type`, TS
/// `extends`/`implements`, Python `class X(Base)`, plus PHP/Ruby/C++
/// (see [`crate::cfg::hierarchy`]). Go's structural interfaces are
/// (see `crate::cfg::hierarchy`). Go's structural interfaces are
/// intentionally omitted; name-only resolution is used instead.
///
/// Container names are bare (no namespace), so cross-namespace aliases
@ -804,7 +816,7 @@ pub fn analyse(cg: &CallGraph) -> CallGraphAnalysis {
/// such SCC has nodes in more than one file (`cross_file`).
///
/// `has_mutual_recursion` triggers the SCC fixed-point loop in
/// [`crate::commands::scan::run_topo_batches`]. `cross_file` is a tighter
/// `run_topo_batches`. `cross_file` is a tighter
/// signal used by joint fixed-point convergence: it implies the
/// recursion involves at least one cross-file call edge, so the inline
/// cache and per-iteration findings need joint convergence, not just

View file

@ -1,3 +1,17 @@
//! Intra-procedural control-flow graph construction.
//!
//! Walks tree-sitter ASTs for all ten supported languages and builds a
//! [`Cfg`] (a petgraph `DiGraph<NodeInfo, EdgeKind>`) per function.
//! [`NodeInfo`] carries the statement kind, label classification, callee
//! name, taint and gate metadata. [`EdgeKind`] distinguishes normal flow,
//! true/false branches, and exception edges.
//!
//! `build_cfg` is the main entry point: given a parsed tree and language,
//! it produces a [`FileCfg`] (one [`Cfg`] per function in the file) along
//! with a [`FuncSummaries`] map for pass-1 summary extraction.
//! `export_summaries` converts in-graph [`LocalFuncSummary`] values to
//! the serializable [`crate::summary::FuncSummary`] form.
#![allow(
clippy::collapsible_if,
clippy::let_and_return,
@ -65,7 +79,7 @@ use params::{
is_configured_terminator,
};
/// Test-only re-export of [`extract_param_meta`] so the external
/// Test-only re-export of `extract_param_meta` so the external
/// `tests/typed_extractors_audit.rs` harness can drive the per-param
/// classifier directly without spinning up the full scan pipeline.
/// Projects away the destructured-siblings third tuple slot so the
@ -675,7 +689,7 @@ pub struct FileCfg {
/// per-file class / trait / interface hierarchy edges.
/// Each entry is `(sub_container, super_container)` after
/// language-specific normalisation. See
/// [`crate::cfg::hierarchy`] for the per-language extraction
/// `crate::cfg::hierarchy` for the per-language extraction
/// rules and [`crate::callgraph::TypeHierarchyIndex`] for the
/// downstream consumer. Empty for languages without an
/// extractor (Go, C) and for files with no inheritance / impl

View file

@ -1,4 +1,50 @@
#![doc = include_str!(concat!(env!("OUT_DIR"), "/cfg_analysis.md"))]
//! CFG structural analysis: dominator-based checks over intra-procedural CFGs.
//!
//! Checks structural properties that the taint engine cannot: whether sinks are
//! guarded by sanitizers or validators, whether web handlers reach privileged
//! sinks without an auth call, whether resources are released on all exit paths,
//! and whether error paths terminate before reaching dangerous code.
//!
//! A guard dominates a sink when the guard must execute before the sink on
//! every path from function entry.
//!
//! # Rule IDs
//!
//! | Rule ID | Severity | What it checks |
//! |---------|----------|----------------|
//! | `cfg-unguarded-sink` | High/Medium | Sink reachable from entry without a matching guard |
//! | `cfg-auth-gap` | High | Web handler reaches privileged sink with no auth call |
//! | `cfg-unreachable-sink` | Medium | Sink in dead code |
//! | `cfg-unreachable-sanitizer` | Low | Sanitizer in dead code (may have been silently disabled) |
//! | `cfg-unreachable-source` | Low | Source in dead code |
//! | `cfg-error-fallthrough` | High/Medium | Error path does not terminate before a dangerous call |
//! | `cfg-resource-leak` | Medium | Resource acquired but not released on all exit paths |
//! | `cfg-lock-not-released` | Medium | Lock acquired but not released on all exit paths |
//!
//! # Recognised guards
//!
//! `validate*`, `sanitize*`, `check_*`, `verify_*`, `assert_*`,
//! `shell_escape`, `html_escape`, `url_encode`, `which`.
//!
//! # Recognised auth names
//!
//! `is_authenticated`, `require_auth`, `check_permission`, `authorize`,
//! `authenticate`, `require_login`, `check_auth`, `verify_token`,
//! `validate_token` (cross-language), plus `isAuthenticated`,
//! `checkPermission`, `hasAuthority`, `hasRole` (Java) and
//! `middleware.auth`, `auth.required` (Go).
//!
//! Custom guards and auth functions can be added as `sanitizer` rules
//! with `cap = "all"` in `nyx.conf`.
//!
//! # Submodules
//!
//! - [`auth`]: auth-gap detection, handler classification
//! - [`dominators`]: dominator tree computation over CFG nodes
//! - [`error_handling`]: error-fallthrough detection
//! - [`guards`]: guard recognition and dominator queries
//! - [`resources`]: resource-leak and lock-not-released detection
//! - [`rules`]: finding construction and rule ID assignment
pub mod auth;
pub mod dominators;

View file

@ -1,3 +1,11 @@
//! Command-line interface definition via clap.
//!
//! Defines [`Cli`] (the top-level parser) and the [`Commands`] enum of
//! subcommands. Helpers on [`Commands`] answer routing questions the binary
//! needs without pattern-matching on specific arms: [`Commands::effective_format`],
//! [`Commands::is_structured_output`], [`Commands::is_serve`], and
//! [`Commands::is_informational`].
use clap::{Parser, Subcommand, ValueEnum};
use serde::{Deserialize, Serialize};
@ -250,7 +258,7 @@ pub enum Commands {
#[arg(long, help_heading = "Output")]
no_rank: bool,
/// Show inline-suppressed findings (dimmed, tagged [SUPPRESSED])
/// Show inline-suppressed findings (dimmed, tagged \[SUPPRESSED\])
#[arg(long, help_heading = "Output")]
show_suppressed: bool,

View file

@ -1,3 +1,11 @@
//! Subcommand handlers and top-level dispatch.
//!
//! [`handle_command`] is the single entry point from `main`. It installs
//! analysis engine options from the resolved config, then routes to the
//! appropriate subcommand module (scan, clean, config, index, list, serve).
//! CLI flags that override config values are applied per-arm before the
//! handler runs.
pub mod clean;
pub mod config;
pub mod index;

View file

@ -117,10 +117,20 @@ fn fail_if_persist_errors(stage: &str, errors: Arc<Mutex<Vec<String>>>) -> NyxRe
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct Diag {
/// Project-relative path of the file containing the finding.
pub path: String,
/// 1-based line number of the sink location.
pub line: usize,
/// 0-based column offset of the sink location.
pub col: usize,
/// Finding severity (Critical / High / Medium / Low / Info).
pub severity: Severity,
/// Rule identifier, e.g. `taint-unsanitised-flow`, `cfg-auth-gap`,
/// `rs.auth.missing_ownership_check`. Taint findings append a
/// source-location suffix (`"taint-unsanitised-flow (source 12:3)"`)
/// so sibling paths with the same sink have distinct IDs for
/// deduplication; [`crate::evidence::Evidence::sink_caps`] disambiguates
/// findings at the same `(path, line, col)` that reach different sinks.
pub id: String,
/// High-level finding category (Security, Reliability, Quality).
pub category: FindingCategory,
@ -871,7 +881,7 @@ static LAST_TOPO_NONRECURSIVE_REFINEMENTS: AtomicUsize = AtomicUsize::new(0);
/// Returns the cumulative count of non-recursive batch refinements
/// (summary + ssa-summary + body + auth inserts) persisted to
/// `global_summaries` during the most recent [`run_topo_batches`] call.
/// `global_summaries` during the most recent `run_topo_batches` call.
/// Reset to zero at the start of each invocation.
pub fn last_topo_nonrecursive_refinements() -> usize {
LAST_TOPO_NONRECURSIVE_REFINEMENTS.load(Ordering::Relaxed)

View file

@ -322,7 +322,7 @@ impl BoolState {
pub struct ValueFact {
/// Exact known constant (Eq constraint). `None` = unconstrained.
pub exact: Option<ConstValue>,
/// Excluded constant values (Neq constraints). Bounded by [`MAX_NEQ`].
/// Excluded constant values (Neq constraints). Bounded by `MAX_NEQ`.
pub excluded: SmallVec<[ConstValue; 4]>,
/// Inclusive lower bound (`None` = −∞).
pub lo: Option<i64>,

View file

@ -204,7 +204,7 @@ pub fn lower_condition(
/// Called during SSA lowering when the full [`SsaBody`] is not yet available.
/// Resolves variables via `var_stacks[name].last()` (the current reaching
/// definition) instead of scanning `value_defs`. Does not use `const_values`
/// (unavailable at lowering time); constants are seeded into [`PathEnv`]
/// (unavailable at lowering time); constants are seeded into [`crate::constraint::PathEnv`]
/// separately via `seed_from_optimization`.
pub fn lower_condition_with_stacks(
cond_info: &NodeInfo,

View file

@ -200,7 +200,7 @@ fn apply_value_const(env: &mut PathEnv, v: crate::ssa::ir::SsaValue, op: CompOp,
/// Resolution order:
/// 1. Cross-language primitive aliases (case-insensitive)
/// 2. Java/Ruby/Go class and framework names (case-sensitive)
/// 3. Java type hierarchy fallback (case-sensitive, via [`TypeHierarchy`])
/// 3. Java type hierarchy fallback (case-sensitive, via [`crate::ssa::type_facts::TypeHierarchy`])
pub fn parse_type_name(name: &str) -> Option<TypeKind> {
use crate::ssa::type_facts::TypeHierarchy;

View file

@ -29,7 +29,7 @@ pub enum ConvergenceEvent {
/// Per-batch record for the SCC fix-point loop.
///
/// Populated once per batch entry in
/// [`crate::commands::scan::run_topo_batches`] that hits the
/// `run_topo_batches` that hits the
/// `has_mutual_recursion` branch.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct SccBatchRecord {

View file

@ -1,3 +1,14 @@
//! SQLite connection pool and schema for the incremental index.
//!
//! The index stores file content hashes, per-file scan results, and function
//! summaries so subsequent scans can skip files whose content has not changed.
//! The pool is backed by [`r2d2`] with WAL journaling, `synchronous=NORMAL`,
//! and memory-mapped I/O tuned for large codebases.
//!
//! Tables: `files`, `issues`, `function_summaries`, `ssa_function_summaries`.
//! SSA-specific persistence lives in [`crate::summary::ssa_summary`]; routines
//! here cover function summaries and file-level hash bookkeeping.
pub mod index {
#![allow(clippy::too_many_arguments, clippy::type_complexity)]
@ -615,7 +626,7 @@ pub mod index {
})
}
/// Like [`should_scan`] but accepts a pre-computed hash to avoid
/// Like `should_scan` but accepts a pre-computed hash to avoid
/// redundant file reads.
pub fn should_scan_with_hash(&self, path: &Path, hash: &[u8]) -> NyxResult<bool> {
let row: Option<Vec<u8>> = self
@ -673,7 +684,7 @@ pub mod index {
/// (`file_id, rule_id, line, col`) to defend against upstream bugs
/// that produce same-keyed diagnostics with differing severity or
/// cosmetic fields. The first-seen row wins; upstream
/// [`crate::ast::ParsedSource::finalize_diags`] sorts so that high
/// `ParsedSource::finalize_diags` sorts so that high
/// severity comes first, and this fallback preserves that ordering.
pub fn replace_issues<'a>(
&mut self,

View file

@ -1,3 +1,12 @@
//! Error types used throughout the scanner.
//!
//! [`NyxError`] wraps I/O, TOML parse, SQLite, tree-sitter, and connection-pool
//! errors into a single enum. [`NyxResult<T>`] is the standard return type alias.
//!
//! [`ConfigError`] and [`ConfigErrorKind`] carry structured config-validation
//! diagnostics (section, field, message, kind) so callers can format them
//! consistently without ad-hoc string matching.
use serde::Serialize;
use serde::de::StdError;
use std::fmt;

View file

@ -60,10 +60,15 @@ impl FromStr for Confidence {
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FlowStepKind {
/// A source read: user input, environment variable, network data, etc.
Source,
/// A local assignment propagating taint from one variable to another.
Assignment,
/// A function call through which taint flows (via argument or return value).
Call,
/// An SSA phi node merging tainted values from multiple predecessors.
Phi,
/// The dangerous sink where tainted data is consumed.
Sink,
}
@ -82,19 +87,29 @@ impl fmt::Display for FlowStepKind {
/// A single step in a taint flow path (display-ready).
///
/// Optional fields are omitted from serialized output when they are `None`
/// and default to `None` when absent on deserialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FlowStep {
/// 1-based position of this step in the flow (source = 1, sink = N).
pub step: u32,
/// What kind of operation this step represents (see [`FlowStepKind`]).
pub kind: FlowStepKind,
/// Project-relative file path where this step occurs.
pub file: String,
/// 1-based line number of the operation.
pub line: u32,
/// 0-based column offset of the operation.
pub col: u32,
/// Source code snippet at this location, if available.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub snippet: Option<String>,
/// SSA variable name carrying taint at this step.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub variable: Option<String>,
/// For [`FlowStepKind::Call`] steps, the name of the function called.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub callee: Option<String>,
/// Name of the enclosing function at this step.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub function: Option<String>,
/// True when this step crosses a file boundary, resolved via a cross-file
/// summary rather than direct SSA flow.
// `Not::not` applied to `&bool` yields `true` for `false`, so the field is
// written only when it is `true`; a missing field deserializes as `false`.
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub is_cross_file: bool,
}

View file

@ -1,3 +1,13 @@
//! Explicit cross-language call-graph bridge edges.
//!
//! Without an [`InteropEdge`], the call graph resolver never attempts
//! cross-language resolution. This prevents false positives from functions
//! in different languages that happen to share a name.
//!
//! An [`InteropEdge`] maps a [`CallSiteKey`] (caller language, file, function,
//! callee symbol, call ordinal) to a [`FuncKey`] in another language. Ordinal
//! `0` acts as a wildcard matching any call of that name from the given caller.
use crate::symbol::{FuncKey, Lang};
/// Identifies a specific call site within a caller function.

View file

@ -1,3 +1,16 @@
//! Per-language source, sanitizer, and sink rule registries.
//!
//! The central type is [`DataLabel`], which pairs a [`Cap`] bitflag set with
//! a role (Source, Sanitizer, Sink). [`LabelRule`] maps AST text patterns to
//! labels. [`classify`] and [`classify_all`] look up a callee name against
//! the active language's rule table; [`classify_gated_sink`] handles
//! argument-role-aware sinks where one argument controls whether the call is
//! dangerous at all.
//!
//! Rules for each language live in per-language submodules (`rust`, `java`,
//! `go`, `python`, `php`, `ruby`, `javascript`, `typescript`, `c`, `cpp`).
//! The [`Cap`] bitflag type is defined here and shared with the taint engine.
mod c;
mod cpp;
mod go;
@ -125,19 +138,58 @@ pub struct SinkGate {
}
bitflags! {
/// Security capability bits for sources, sanitizers, and sinks.
///
/// Each bit represents a security-relevant property. The meaning depends on
/// which role the [`Cap`] value is attached to:
///
/// - **Source**: which attack classes this tainted value can potentially
/// trigger. Sources usually carry [`Cap::all()`] so they match any sink.
/// [`ENV_VAR`](Cap::ENV_VAR) is an exception — it marks origin rather
/// than reach.
/// - **Sanitizer**: which attack classes this function strips. A sanitizer
/// labelled with [`HTML_ESCAPE`](Cap::HTML_ESCAPE) clears the XSS-relevant
/// bits from tainted values that flow through it.
/// - **Sink**: which capability bits the incoming tainted value is checked
/// against; at least one must be present for a finding to fire. A SQL sink
/// requires [`SQL_QUERY`](Cap::SQL_QUERY).
///
/// In practice: a finding fires when a tainted value reaches a sink and
/// `(value_caps & sink_caps) != 0`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Cap: u16 {
/// Taint that originated from an environment variable read.
/// Used as a source-origin marker for env-injection rules.
const ENV_VAR = 0b0000_0000_0000_0001; // bit 0
/// Sanitizer: the value has passed through HTML entity escaping.
/// Strips XSS risk from values that reach HTML output sinks.
const HTML_ESCAPE = 0b0000_0000_0000_0010; // bit 1
/// Sanitizer: the value has been shell-argument escaped.
/// Strips command-injection risk before shell sinks.
const SHELL_ESCAPE = 0b0000_0000_0000_0100; // bit 2
/// Sanitizer: the value has been percent-encoded for use in a URL.
const URL_ENCODE = 0b0000_0000_0000_1000; // bit 3
/// Sanitizer: the value was parsed through a structured JSON decoder
/// (as opposed to `eval`-based or regex parsing).
const JSON_PARSE = 0b0000_0000_0001_0000; // bit 4
/// Sink: file system read or write operation (path traversal, arbitrary
/// file read/write).
const FILE_IO = 0b0000_0000_0010_0000; // bit 5
/// Sink: format string injection (e.g. `printf`-family, `String.format`).
const FMT_STRING = 0b0000_0000_0100_0000; // bit 6
/// Sink: SQL query construction. Fires for string-concatenated queries
/// and parameterized-query builders where the query text itself is tainted.
const SQL_QUERY = 0b0000_0000_1000_0000; // bit 7
/// Sink: unsafe object deserialization (Java `ObjectInputStream`,
/// Python `pickle`, Ruby `Marshal`, PHP `unserialize`, etc.).
const DESERIALIZE = 0b0000_0001_0000_0000; // bit 8
/// Sink: server-side request forgery. Fires when attacker-controlled
/// data reaches the destination URL of an outbound HTTP request.
const SSRF = 0b0000_0010_0000_0000; // bit 9
/// Sink: code or command execution (shell injection, `eval`, `exec`,
/// dynamic `require`/`import`, template injection).
const CODE_EXEC = 0b0000_0100_0000_0000; // bit 10
/// Sink: cryptographic operation with a tainted algorithm name or seed
/// (weak-crypto / predictable-randomness patterns).
const CRYPTO = 0b0000_1000_0000_0000; // bit 11
/// Request-bound, caller-supplied identifier that has not yet been
/// validated against an ownership/membership check. Used as the
@ -747,7 +799,7 @@ fn phase_c_auth_rules_for_lang(lang_slug: &str) -> Vec<RuntimeLabelRule> {
}
}
/// Public re-export used by [`crate::ast::ParsedFile::from_source`] to
/// Public re-export used by `ParsedFile::from_source` to
/// augment per-file rule sets when imports reveal frameworks that the
/// manifest-level detector missed.
pub fn framework_rules_for_lang_pub(
@ -1207,7 +1259,7 @@ pub fn classify_gated_sink(
out
}
/// Public wrapper for [`normalize_chained_call`] so callers outside the module
/// Public wrapper for `normalize_chained_call` so callers outside the module
/// can share the same normalization used by the label classifier.
///
/// Forwards `text` unchanged to `normalize_chained_call` and returns its
/// result as an owned `String`.
pub fn normalize_chained_call_for_classify(text: &str) -> String {
normalize_chained_call(text)

View file

@ -1,14 +1,92 @@
//! Multi-language static vulnerability scanner. Tree-sitter parsing, petgraph
//! CFGs, SSA-based dataflow, and cross-file taint analysis with a
//! capability-based sanitizer system. Supports Rust, C, C++, Java, Go, PHP,
//! Python, Ruby, TypeScript, and JavaScript.
//! Multi-language static vulnerability scanner.
//!
//! The handbook below is embedded verbatim from
//! [`docs/how-it-works.md`](https://github.com/elicpeter/nyx/blob/master/docs/how-it-works.md).
//! Tree-sitter parsing, petgraph CFGs, SSA-based dataflow, and cross-file
//! taint analysis with a capability-based sanitizer system. Supports Rust,
//! C, C++, Java, Go, PHP, Python, Ruby, TypeScript, and JavaScript.
//!
//! This crate is both the `nyx` binary and a library for programmatic
//! scanning. Most internal modules are public for testing and downstream
//! tooling, but the stable contract is [`scan_no_index`] plus the types
//! it returns.
//!
//! For a description of how the analysis pipeline works, see the
//! [how-it-works handbook](https://github.com/elicpeter/nyx/blob/master/docs/how-it-works.md).
//! Per-detector documentation lives on the [`taint`], [`cfg_analysis`],
//! [`state`], [`patterns`], and [`auth_analysis`] modules. The primary
//! library entry point for tests and embedders is [`scan_no_index`].
#![doc = include_str!(concat!(env!("OUT_DIR"), "/lib_intro.md"))]
//! [`state`], [`patterns`], and [`auth_analysis`] module pages.
//!
//! # Entry points
//!
//! [`scan_no_index`] runs a full two-pass scan over a directory tree and
//! returns a flat list of [`commands::scan::Diag`] values. It does not
//! touch a SQLite index; every file is analysed from disk on each call.
//!
//! ```no_run
//! use nyx_scanner::{scan_no_index, utils::Config};
//! use std::path::Path;
//!
//! let config = Config::default();
//! let findings = scan_no_index(Path::new("/path/to/project"), &config).unwrap();
//! for diag in &findings {
//! println!("{} at {}:{}", diag.id, diag.path, diag.line);
//! }
//! ```
//!
//! For incremental rescanning backed by a SQLite index, use
//! [`commands::scan::scan_with_index_parallel`] directly.
//!
//! # Key types
//!
//! | Type | Purpose |
//! |------|---------|
//! | [`utils::config::Config`] | Top-level scanner config (load from `nyx.conf` or construct in code) |
//! | [`commands::scan::Diag`] | A single finding: location, severity, rule ID, structured evidence |
//! | [`evidence::Evidence`] | Source/sink spans, flow steps, sanitizer annotations, engine notes |
//! | [`evidence::Confidence`] | Low / Medium / High confidence tag |
//! | [`labels::Cap`] | Bitflag capability set describing what a taint flow can reach |
//! | [`symbol::Lang`] | Supported language enum |
//! | [`symbol::FuncKey`] | Canonical cross-file function identity |
//!
//! # Reading findings
//!
//! Each [`commands::scan::Diag`] carries:
//!
//! - `path`, `line`, `col` — source location of the sink
//! - `id` — rule identifier (e.g. `taint-unsanitised-flow`, `cfg-auth-gap`)
//! - `severity` — Critical / High / Medium / Low / Info
//! - `confidence` — Low / Medium / High; capped at Medium when an engine
//! budget was hit
//! - `rank_score` — deterministic attack-surface score for truncation ordering
//! - `evidence` — optional [`evidence::Evidence`] with source/sink spans,
//! flow steps, and [`engine_notes::EngineNote`] values describing precision loss
//!
//! Engine notes communicate when a bound was hit. A finding carrying
//! `EngineNote::OriginsTruncated` or `EngineNote::SccBudgetExhausted` is
//! still real, but the engine had less information than it would have had
//! without the cap.
//!
//! # Module map
//!
//! | Module | Role |
//! |--------|------|
//! | [`ast`] | Tree-sitter parsing and two-pass analysis dispatch |
//! | [`mod@cfg`] | CFG construction from ASTs |
//! | [`ssa`] | SSA lowering and optimization passes |
//! | [`taint`] | Forward SSA taint analysis |
//! | [`cfg_analysis`] | Structural CFG checks (auth gaps, resource leaks, error paths) |
//! | [`state`] | Resource lifecycle and state-machine analysis |
//! | [`patterns`] | Pattern-based AST checks |
//! | [`auth_analysis`] | Missing authorization / ownership checks |
//! | [`callgraph`] | Whole-program call graph and SCC analysis |
//! | [`summary`] | Per-function summaries for cross-file resolution |
//! | [`labels`] | Source, sanitizer, and sink rule registries per language |
//! | [`symex`] | Symbolic execution for witness generation and path feasibility |
//! | [`abstract_interp`] | Interval and string bounds propagation for sink suppression |
//! | [`constraint`] | Path constraint solving and infeasible-path pruning |
//! | [`evidence`] | Finding provenance and confidence types |
//! | [`suppress`] | Inline `nyx:ignore` directive handling |
//! | [`output`] | JSON and SARIF serialization |
//! | [`database`] | SQLite index pool and schema |
//! | [`walk`] | Filesystem traversal with batched delivery |
pub mod abstract_interp;
pub mod ast;
@ -48,8 +126,19 @@ use errors::NyxResult;
use std::path::Path;
use utils::config::Config;
/// Run a two-pass scan without index (filesystem only).
/// This is the primary entry point for integration tests.
/// Run a two-pass scan over `root` without an incremental index.
///
/// Every file under `root` is analysed from disk on each call; no SQLite
/// state is read or written. The walker respects `.gitignore` files when
/// `cfg.scanner.read_vcsignore` is true (the default), skips hidden files
/// and symlinks unless the config enables them, and excludes the directories
/// and extensions listed in `cfg.scanner.excluded_*`.
///
/// Returns one [`commands::scan::Diag`] per finding. The list is unsorted;
/// call [`rank::rank_diags`] if you need findings ordered by exploitability.
///
/// For indexed / incremental rescanning use
/// [`commands::scan::scan_with_index_parallel`] instead.
///
/// # Errors
///
/// Propagates any error returned by `commands::scan::scan_filesystem`.
pub fn scan_no_index(root: &Path, cfg: &Config) -> NyxResult<Vec<commands::scan::Diag>> {
commands::scan::scan_filesystem(root, cfg, false)
}

View file

@ -1,3 +1,12 @@
//! Finding serialization and output routing.
//!
//! Serializes [`crate::commands::scan::Diag`] values to console, JSON, or
//! SARIF based on the requested format. `PATTERN_DESCRIPTIONS` is a
//! lazily-built map from pattern ID to human-readable description, populated
//! from all language registries on first access. `sarif_base_id` normalizes
//! source-location-suffixed finding IDs (like `"taint-unsanitised-flow (source 12:3)"`)
//! to the canonical SARIF rule ID form.
use crate::commands::scan::Diag;
use crate::patterns::{self, Severity};
use once_cell::sync::Lazy;

View file

@ -1,4 +1,52 @@
#![doc = include_str!(concat!(env!("OUT_DIR"), "/patterns.md"))]
//! AST pattern matching: tree-sitter queries over dangerous structural shapes.
//!
//! Patterns match constructs based on syntax alone, with no dataflow or CFG.
//! A match means the construct is present; it is not proof that it is
//! reachable or exploitable. Patterns run in every analysis mode and are the
//! only active detector in `--mode ast`.
//!
//! # Rule ID format
//!
//! ```text
//! <lang>.<category>.<name>
//! ```
//!
//! Examples: `js.code_exec.eval`, `py.deser.pickle_loads`, `c.memory.gets`,
//! `java.sqli.execute_concat`.
//!
//! # Tiers
//!
//! - **Tier A**: structural presence alone is high-signal, so no guard is
//! needed. Examples: `gets`, `eval`, `pickle.loads`, `mem::transmute`.
//! - **Tier B**: pattern includes a tree-sitter heuristic guard.
//! `java.sqli.execute_concat` fires only when `executeQuery` receives a
//! `binary_expression` (concatenation), not a literal or parameterized call.
//!
//! # Categories
//!
//! | Category | Examples |
//! |----------|---------|
//! | `CommandExec` | `system`, `os.system`, `Runtime.exec`, backticks |
//! | `CodeExec` | `eval`, `Function`, PHP `assert("string")`, `class_eval` |
//! | `Deserialization` | `pickle.loads`, `yaml.load`, `Marshal.load`, `readObject` |
//! | `SqlInjection` | `executeQuery` with concatenated argument (Tier B) |
//! | `PathTraversal` | PHP `include $var` |
//! | `Xss` | `innerHTML`, `document.write`, `insertAdjacentHTML` |
//! | `Crypto` | `md5`, `sha1`, `Math.random` for security use |
//! | `Secrets` | Hardcoded API keys (Go, JS, TS) |
//! | `InsecureTransport` | `InsecureSkipVerify`, `fetch("http://...")` |
//! | `Reflection` | `Class.forName`, `Method.invoke`, `constantize` |
//! | `MemorySafety` | `transmute`, `unsafe`, `gets`, `strcpy`, `sprintf` |
//! | `Prototype` | `__proto__` assignment, `Object.prototype.*` |
//! | `Config` | CORS dynamic origin, `rejectUnauthorized: false` |
//! | `CodeQuality` | `unwrap`, `panic!`, `as any` |
//!
//! # Pattern loading
//!
//! Each language submodule exports a `patterns()` function returning
//! `&'static [Pattern]`. [`load`] dispatches to the correct submodule by
//! language slug. [`Pattern`] carries the rule ID, severity, confidence,
//! category, and the tree-sitter query string.
pub mod c;
pub mod cpp;

View file

@ -43,7 +43,7 @@ fn is_container_read_callee(callee: &str) -> bool {
)
}
/// Container-write callees, mirror of [`is_container_read_callee`].
/// Container-write callees, mirror of `is_container_read_callee`.
pub fn is_container_write_callee(callee: &str) -> bool {
let bare = match callee.rsplit_once('.') {
Some((_, m)) => m,
@ -66,7 +66,7 @@ pub fn is_container_write_callee(callee: &str) -> bool {
)
}
/// Public re-export of [`is_container_read_callee`] for the taint engine.
/// Public re-export of `is_container_read_callee` for the taint engine.
///
/// Returns exactly what `is_container_read_callee` returns for `callee`.
pub fn is_container_read_callee_pub(callee: &str) -> bool {
is_container_read_callee(callee)
}
@ -92,7 +92,7 @@ pub fn is_container_read_callee_pub(callee: &str) -> bool {
///
/// Receiver (`SelfParam`) reads/writes are recorded under the
/// [`u32::MAX`] sentinel parameter index, mirroring the convention in
/// [`crate::summary::ssa_summary::SsaFuncSummary::receiver_to_*`].
/// the `SsaFuncSummary::receiver_to_*` fields.
///
/// The container-element sentinel field [`FieldId::ELEM`] is recorded
/// under the special name `"<elem>"` so callers can recognise the

View file

@ -10,7 +10,7 @@
//! - PointsToSet is bounded to `analysis.engine.max_pointsto` entries
//! (default 32, widening on overflow, see [`effective_max_pointsto`]).
//! Overflow drops emit an [`crate::engine_notes::EngineNote::PointsToTruncated`]
//! note and increment [`POINTSTO_TRUNCATION_COUNT`] so operators can
//! note and increment `POINTSTO_TRUNCATION_COUNT` so operators can
//! tell when the cap is firing on their corpus.
//! - HeapState tracks per-(heap-object, slot) taint (monotone lattice)
//! - HeapSlot::Index(u64) for constant-index container access (proven by const propagation)
@ -168,7 +168,7 @@ impl PointsToSet {
///
/// Truncates to [`effective_max_pointsto`]; any heap-object member
/// that would be admitted after the cap is reached is dropped and
/// counted via [`record_pointsto_truncation`]. Truncation is
/// counted via `record_pointsto_truncation`. Truncation is
/// deterministic: the merge proceeds in sorted order, so survivors
/// are always the smallest `HeapObjectId`s across the two inputs.
pub fn union(&self, other: &Self) -> Self {
@ -230,7 +230,7 @@ impl PointsToSet {
///
/// When the set is already at [`effective_max_pointsto`], the new id
/// is dropped and the drop is counted via
/// [`record_pointsto_truncation`].
/// `record_pointsto_truncation`.
pub fn insert(&mut self, id: HeapObjectId) {
match self.ids.binary_search(&id) {
Ok(_) => {} // already present

View file

@ -1,3 +1,21 @@
//! SSA IR, lowering, and optimization passes.
//!
//! The pipeline converts a CFG into a pruned SSA body consumed by the taint
//! analysis engine. [`lower_to_ssa`] inserts phi nodes via Cytron's algorithm
//! and renames variables along the dominator tree. [`optimize_ssa`] runs
//! constant propagation, branch pruning, copy propagation, DCE, and type
//! fact analysis in sequence.
//!
//! Key submodules:
//! - [`ir`]: core types (`SsaValue`, `SsaOp`, `SsaInst`, `SsaBlock`, `SsaBody`)
//! - [`lower`]: CFG-to-SSA lowering with Cytron phi insertion and dominator-tree rename
//! - [`const_prop`]: sparse conditional constant propagation with branch pruning
//! - [`copy_prop`]: copy and alias propagation
//! - [`dce`]: dead definition elimination
//! - [`type_facts`]: per-value type inference (`TypeKind`, `TypeFactResult`)
//! - [`heap`]: abstract heap for container element abstractions
//! - [`alias`]: base-variable alias groups from copy propagation
#[allow(dead_code)] // IR types, fields used by Display impl, tests, and downstream analyses
pub mod alias;
pub mod const_prop;

View file

@ -25,7 +25,7 @@
//!
//! The analysis is **flow-insensitive** and **bounded**: it does not
//! reason about path feasibility, and it stops adding edges once the
//! summary's [`MAX_ALIAS_EDGES`] cap is reached, the overflow flag is
//! summary's `MAX_ALIAS_EDGES` cap is reached; the overflow flag is
//! the conservative fallback that callers honour.
use std::collections::{HashMap, HashSet};
@ -239,7 +239,7 @@ fn returns_fresh_allocation(
/// `formal_param_count` bounds the parameter indices written to the
/// summary: scoped lowering synthesises `Param` ops for module-level
/// captures at indices beyond the formal arity, and those must not leak
/// into the summary (they would trip [`crate::summary::ssa_summary_fits_arity`]).
/// into the summary (they would trip `ssa_summary_fits_arity`).
pub fn analyse_param_points_to(
ssa: &SsaBody,
param_info: &[(usize, String, SsaValue)],

View file

@ -1,4 +1,55 @@
#![doc = include_str!(concat!(env!("OUT_DIR"), "/state.md"))]
//! State-model analysis: resource lifecycle and authentication state tracking.
//!
//! Runs a per-function state machine over the CFG to detect use-after-close,
//! double-close, resource leaks, and unauthenticated access to privileged
//! operations.
//!
//! Enabled by default. Disable via `scanner.enable_state_analysis = false`.
//! Runs in `--mode full` and `--mode taint`; skipped in AST-only mode.
//!
//! # Rule IDs
//!
//! | Rule ID | Severity | What it detects |
//! |---------|----------|-----------------|
//! | `state-use-after-close` | High | Operation on a resource after it was closed |
//! | `state-double-close` | Medium | Resource closed twice |
//! | `state-resource-leak` | Medium | Resource opened and never closed on any path |
//! | `state-resource-leak-possible` | Low | Resource closed on some paths but not others |
//! | `state-unauthed-access` | High | Web handler reaches privileged sink without an auth call |
//!
//! # Managed-resource suppression
//!
//! Language-specific cleanup patterns suppress leak findings automatically:
//!
//! | Pattern | Languages |
//! |---------|-----------|
//! | RAII / Drop | Rust (all leak findings suppressed except `alloc`/`dealloc`) |
//! | Smart pointers (`make_unique`, `make_shared`) | C++ |
//! | `defer f.Close()` | Go |
//! | `with open(f) as f:` | Python |
//! | try-with-resources | Java |
//!
//! # Tracked acquire/release pairs
//!
//! C/C++: `fopen`/`fclose`, `open`/`close`, `socket`/`close`,
//! `malloc`/`free`, `pthread_mutex_lock`/`pthread_mutex_unlock`,
//! `new`/`delete`.
//!
//! Rust: `File::open`/`close`, `TcpStream::connect`/`shutdown`,
//! mutex `lock`/`read`/`write`/`drop`.
//!
//! Java: stream, connection, and socket constructors paired with `close`; `getConnection`/`close`.
//!
//! Go, Python, JavaScript, Ruby, PHP follow language-idiomatic equivalents.
//!
//! # Submodules
//!
//! - [`domain`]: state lattice (`ResourceState`, `AuthState`, `StateCell`)
//! - [`engine`]: generic forward transfer engine (`Transfer` trait, `run_forward`)
//! - [`facts`]: per-node state fact extraction
//! - [`lattice`]: lattice join/meet for state values
//! - [`symbol`]: resource symbol normalisation
//! - [`transfer`]: `DefaultTransfer` — the concrete resource-lifecycle transfer function
pub mod domain;
pub mod engine;

View file

@ -1,3 +1,20 @@
//! Per-function summaries for cross-file taint analysis.
//!
//! [`FuncSummary`] describes a function's boundary behaviour: which parameters
//! flow to sinks, which sources it reads, whether it propagates taint from
//! arguments to its return value, and what capabilities it strips. Summaries
//! are serialized to SQLite in pass 1 and merged into [`GlobalSummaries`]
//! before pass 2 begins.
//!
//! [`crate::summary::ssa_summary::SsaFuncSummary`] is a richer summary
//! derived from the SSA taint engine and takes precedence over [`FuncSummary`]
//! during call resolution. `GlobalSummaries::ssa_by_key` stores SSA summaries
//! keyed by [`FuncKey`]; `GlobalSummaries::by_name` holds the fallback
//! name-keyed map for cases where an exact key is not found.
//!
//! Same-name collisions across files are merged conservatively: capabilities
//! are unioned and booleans are OR-ed so no true positive is silently dropped.
pub mod points_to;
pub mod ssa_summary;
@ -669,7 +686,7 @@ impl GlobalSummaries {
/// drop one of the two summaries entirely.
///
/// We therefore inspect the existing entry first. If the new summary
/// is not [`summaries_compatible`] with it, we mint a synthetic
/// is not `summaries_compatible` with it, we mint a synthetic
/// disambig (top bit set to stay disjoint from byte-offset disambigs)
/// and retry the insert under the fresh key so *both* functions are
/// preserved.
@ -1065,7 +1082,7 @@ impl GlobalSummaries {
/// Snapshot the SSA summaries for convergence detection.
///
/// Used alongside [`snapshot_caps`] in the SCC fixed-point loop so that
/// Used alongside [`Self::snapshot_caps`] in the SCC fixed-point loop so that
/// SSA-only refinements (e.g. a `StripBits` transform appearing after a
/// cross-file sanitizer is resolved) are not invisible to convergence.
pub fn snapshot_ssa(&self) -> &HashMap<FuncKey, SsaFuncSummary> {
@ -1090,7 +1107,7 @@ impl GlobalSummaries {
/// 2. Otherwise, for each wildcard prefix in scope, try
/// `(wildcard_prefix, name)` in the module index. If across all
/// wildcards exactly one arity-filtered candidate appears → resolved.
/// 3. Otherwise fall through to [`resolve_callee_key_with_container`]
/// 3. Otherwise fall through to [`Self::resolve_callee_key_with_container`]
/// with no `container_hint`, meaning only the existing namespace /
/// arity disambiguation applies.
///
@ -1168,9 +1185,9 @@ impl GlobalSummaries {
/// Resolve a bare (already-normalized) callee name to a [`FuncKey`].
///
/// Thin wrapper around [`resolve_callee`] that constructs a minimal
/// Thin wrapper around [`Self::resolve_callee`] that constructs a minimal
/// [`CalleeQuery`] with no qualified hints. Kept for call sites that
/// only hold a string callee and an arity; prefer [`resolve_callee`]
/// only hold a string callee and an arity; prefer [`Self::resolve_callee`]
/// whenever receiver / qualifier / container information is available.
pub fn resolve_callee_key(
&self,
@ -1197,7 +1214,7 @@ impl GlobalSummaries {
/// unchanged. `container_hint` is interpreted as a syntactic
/// container qualifier (not an authoritative receiver type), so a
/// miss is allowed to fall through to leaf-name lookup. New
/// callers should route through [`resolve_callee`] and classify
/// callers should route through [`Self::resolve_callee`] and classify
/// their hint as `receiver_type` vs `namespace_qualifier` vs
/// `receiver_var` so the resolver can apply the correct policy.
pub fn resolve_callee_key_with_container(

View file

@ -22,7 +22,7 @@
//! Mutation is observable to the caller through its argument for `j`.
//! * `Source(Param(i)) → Target(Return)`, the return value aliases
//! parameter `i`'s heap identity. Adds heap-level precision on top of
//! the coarser [`TaintTransform::Identity`] view already carried in
//! the coarser [`crate::summary::ssa_summary::TaintTransform::Identity`] view already carried in
//! [`crate::summary::ssa_summary::SsaFuncSummary::param_to_return`].
//!
//! `MustAlias` is intentionally omitted, the ROI on
@ -105,7 +105,7 @@ pub const MAX_ALIAS_EDGES: usize = 8;
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct PointsToSummary {
/// Bounded edge list, deduped by `(source, target, kind)`. The
/// [`serde(default)`] attribute lets summaries pre-dating points-to
/// `#[serde(default)]` attribute lets summaries pre-dating points-to
/// tracking deserialise cleanly (no edges).
#[serde(default, skip_serializing_if = "SmallVec::is_empty")]
pub edges: SmallVec<[AliasEdge; 4]>,
@ -193,7 +193,7 @@ impl PointsToSummary {
}
/// Parameter indices referenced by any edge in this summary. Used by
/// [`crate::summary::ssa_summary_fits_arity`] to confirm the summary
/// `ssa_summary_fits_arity` to confirm the summary
/// does not reference a parameter beyond the key's declared arity
/// (which would indicate a synthetic-param mis-attribution in
/// extraction).

View file

@ -165,7 +165,7 @@ pub struct SsaFuncSummary {
/// [`crate::cfg::CallMeta::gate_filters`] carries more than one entry
/// (e.g. `fetch` is both an `SSRF` gate on the URL arg and a
/// `DATA_EXFIL` gate on the body arg), the multi-gate dispatch in
/// [`super::super::collect_block_events`] cap-narrows the event's
/// `collect_block_events` cap-narrows the event's
/// `sink_caps` to the specific gate's `label_caps`. Each
/// `(param_idx, label_caps)` entry records that this function's
/// parameter `param_idx` flowed into a gated sink whose narrowed
@ -195,7 +195,7 @@ pub struct SsaFuncSummary {
/// (e.g., function returns the same container it received as input).
///
/// Populated by
/// [`crate::taint::ssa_transfer::summary_extract::extract_container_flow_summary`]
/// `extract_container_flow_summary`
/// and applied at cross-file call sites to propagate the caller's
/// points-to set for that argument onto the call's return SSA value.
#[serde(default)]
@ -205,7 +205,7 @@ pub struct SsaFuncSummary {
/// (e.g., `fn storeInto(value, arr) { arr.push(value); }` → `[(0, 1)]`).
///
/// Populated by
/// [`crate::taint::ssa_transfer::summary_extract::extract_container_flow_summary`]
/// `extract_container_flow_summary`
/// and applied at cross-file call sites by writing the caller's taint on
/// the `src_param` argument into the heap objects pointed to by the
/// `container_param` argument.
@ -254,7 +254,7 @@ pub struct SsaFuncSummary {
/// Per-parameter return-path decomposition.
///
/// When non-empty, supplies finer-grained per-path data than
/// [`Self::param_to_return`]. Each parameter maps to up to
/// `param_to_return`. Each parameter maps to up to
/// [`MAX_RETURN_PATHS`] [`ReturnPathTransform`] entries, one per
/// distinct path-predicate gate. Callers consult their own predicate
/// state at the call site and apply only entries whose predicate is
@ -262,7 +262,7 @@ pub struct SsaFuncSummary {
/// set into the effective call-site transform.
///
/// Empty when the callee has a single return path, the aggregate
/// [`param_to_return`] is already precise, or when extraction
/// `param_to_return` is already precise, or when extraction
/// could not derive per-return state (e.g. early-exit probes).
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub param_return_paths: Vec<(usize, SmallVec<[ReturnPathTransform; 2]>)>,
@ -338,7 +338,7 @@ pub struct SsaFuncSummary {
/// control would not reach the post-call instruction.
///
/// Populated by
/// [`crate::taint::ssa_transfer::summary_extract::extract_ssa_func_summary`]
/// `extract_ssa_func_summary`
/// when a per-parameter probe shows the parameter's `var_name` in
/// `validated_must` at every return block of the helper. Empty
/// (the default) for helpers that do not validate any parameter.

View file

@ -1,3 +1,15 @@
//! Core language and function identity types.
//!
//! [`Lang`] is the 10-language enum (Rust, C, C++, Java, Go, PHP, Python,
//! Ruby, TypeScript, JavaScript). [`FuncKey`] is the canonical cross-file
//! function identity: name, arity, language, container (class/struct/module),
//! and an optional disambiguator for overloaded functions.
//!
//! [`FuncKey`] is the node type in the call graph and the lookup key in
//! [`crate::summary::GlobalSummaries`]. [`FuncKind`] distinguishes constructors,
//! methods, closures, and free functions so callers can apply language-specific
//! resolution heuristics.
use serde::{Deserialize, Serialize};
use std::fmt;

View file

@ -102,7 +102,7 @@ pub struct FieldAccessRecord {
/// Bounded symbolic heap tracking field-level symbolic values and taint.
///
/// Cloned at fork points during multi-path exploration. Bounded
/// by [`MAX_HEAP_ENTRIES`] total entries and [`MAX_FIELDS_PER_OBJECT`] per
/// by `MAX_HEAP_ENTRIES` total entries and `MAX_FIELDS_PER_OBJECT` per
/// object to prevent blowup on object-heavy code.
#[derive(Clone, Debug)]
pub struct SymbolicHeap {
@ -126,8 +126,8 @@ impl SymbolicHeap {
/// Store a symbolic value into a heap field.
///
/// Bounded: silently drops the store if [`MAX_HEAP_ENTRIES`] or
/// [`MAX_FIELDS_PER_OBJECT`] would be exceeded. `Index(*)` entries are
/// Bounded: silently drops the store if `MAX_HEAP_ENTRIES` or
/// `MAX_FIELDS_PER_OBJECT` would be exceeded. `Index(*)` entries are
/// bounded by [`MAX_TRACKED_INDICES`] per object; overflow collapses all
/// indexed entries into `Elements`.
pub fn store(&mut self, key: HeapKey, value: SymbolicValue, tainted: bool) {

View file

@ -149,7 +149,7 @@ pub struct BackwardsCtx<'a> {
/// Language tag for source-kind heuristics (e.g. `os.getenv` hints).
pub lang: Lang,
/// Whole-program summaries: used to discover cross-file bodies and
/// [`SsaFuncSummary`] metadata at call instructions.
/// [`crate::summary::ssa_summary::SsaFuncSummary`] metadata at call instructions.
pub global_summaries: Option<&'a GlobalSummaries>,
/// Pre-lowered intra-file callee bodies keyed by [`FuncKey`]. Shared
/// with the forward path so we do not lower functions twice.

View file

@ -1,5 +1,78 @@
//! Forward SSA taint analysis: the primary vulnerability detection engine.
//!
//! Tracks untrusted data from **sources** (where it enters the program) through
//! assignments and calls to **sinks** (where it is used dangerously). A finding
//! fires when the flow reaches a sink without passing a matching **sanitizer**.
//!
//! The engine is a monotone forward dataflow over a finite lattice with
//! guaranteed termination. It is flow-sensitive within a function and
//! interprocedural across files via persisted [`crate::summary::FuncSummary`]
//! and [`crate::summary::ssa_summary::SsaFuncSummary`] values.
//!
//! # Rule ID
//!
//! ```text
//! taint-unsanitised-flow (source <line>:<col>)
//! taint-data-exfiltration (source <line>:<col>)
//! ```
//!
//! The source location is part of the ID so sibling paths to the same sink
//! get distinct IDs. Suppressions can target either the base ID or the full
//! string.
//!
//! # Capabilities
//!
//! Sources, sanitizers, and sinks are linked by [`crate::labels::Cap`] bits.
//! A sanitizer only clears the cap it declares; a sink only fires when the
//! remaining taint still carries its required cap.
//!
//! | Cap | Typical source | Typical sanitizer | Typical sink |
//! |-----|----------------|-------------------|--------------|
//! | `env_var` | `env::var`, `getenv`, `process.env` | | |
//! | `html_escape` | | `html.escape`, `DOMPurify.sanitize` | `innerHTML`, `document.write` |
//! | `shell_escape` | | `shlex.quote`, `shell_escape::escape` | `system`, `Command::new` |
//! | `url_encode` | | `encodeURIComponent` | HTTP client URL arg |
//! | `file_io` | | `realpath`, `filepath.Clean` | `open`, `fs::read_to_string` |
//! | `sql_query` | | parameterized query binders | `cursor.execute`, `db.query` |
//! | `deserialize` | | | `pickle.loads`, `Marshal.load` |
//! | `ssrf` | | URL-prefix locks | `fetch` URL arg, outbound HTTP |
//! | `code_exec` | | | `eval`, `exec`, `system` |
//! | `crypto` | | | weak-algorithm constructors |
//! | `data_exfil` | cookies, headers, env, db rows (Sensitive tier) | | `fetch` body/json/headers |
//!
//! Sources typically carry `Cap::all()` so they match any sink.
//!
//! # Source sensitivity
//!
//! Each source carries a [`crate::labels::SourceKind`] and a derived tier:
//!
//! - `Plain` — direct attacker input (`UserInput`): request bodies, query
//! strings, argv, stdin.
//! - `Sensitive` — operator-bound state: cookies, headers, env, files, DB rows,
//! caught exceptions.
//!
//! `Cap::DATA_EXFIL` only fires on `Sensitive`-tier sources. Plain user input
//! flowing into an outbound request body is suppressed, because reporting it is
//! the canonical false positive for API gateways that proxy `req.body`.
//!
//! # Confidence signals
//!
//! Higher confidence: source and sink both present in evidence,
//! `source_kind: user_input`, `path_validated: false`, symbolic witness produced.
//!
//! Lower confidence: path-validated taint, source is a database read or
//! internal file, engine note `ForwardBailed` / `PathWidened`.
//!
//! # Submodules
//!
//! - [`domain`]: taint lattice types (`VarTaint`, `TaintOrigin`, `SmallBitSet`,
//! `PredicateSummary`)
//! - [`ssa_transfer`]: SSA taint transfer functions and the forward worklist
//! (`SsaTaintState`, `SsaTaintTransfer`, `run_ssa_taint`)
//! - [`path_state`]: predicate classification for branch-sensitive propagation
//! - [`backwards`]: demand-driven backwards walk from sinks (off by default)
#![allow(clippy::collapsible_if, clippy::too_many_arguments)]
#![doc = include_str!(concat!(env!("OUT_DIR"), "/taint.md"))]
pub mod backwards;
pub mod domain;

View file

@ -32,7 +32,7 @@ pub enum PredicateKind {
ShellMetaValidated,
/// Bounded-length rejection: `x.len() > N` / `x.length < N` with N >= 2.
///
/// Commonly paired with [`ShellMetaValidated`] in OR-chain rejection
/// Commonly paired with `ShellMetaValidated` in OR-chain rejection
/// idioms (`if x.len() > MAX || x.contains(";") { reject }`). Counts as
/// a dominator guard for `cfg-unguarded-sink` purposes, but intentionally
/// does **not** mark variables as validated; the rejection direction is

View file

@ -71,14 +71,14 @@ pub struct SsaTaintTransfer<'a> {
/// The [`BodyId`] of the body currently being analysed. Used as the
/// owning scope when writing seed entries that leave this body
/// (e.g. [`extract_ssa_exit_state`]) and as the identity recorded on
/// engine notes. Defaults to [`BodyId(0)`] (top-level) for inline
/// engine notes. Defaults to `BodyId(0)` (top-level) for inline
/// probes and unit tests that analyse a single synthetic body.
pub owner_body_id: BodyId,
/// The [`BodyId`] of this body's lexical parent, if any. Drives the
/// `Param`-op reader's lookup into [`Self::global_seed`]: we read
/// from the parent's scope first (the seed entries produced by
/// [`extract_ssa_exit_state`] on the parent body), then fall back to
/// [`BodyId(0)`] to pick up JS/TS two-level re-keyed entries (see
/// `BodyId(0)` to pick up JS/TS two-level re-keyed entries (see
/// [`filter_seed_to_toplevel`]). `None` for the top-level body and
/// for probes with no surrounding scope.
pub parent_body_id: Option<BodyId>,
@ -176,7 +176,7 @@ pub struct SsaTaintTransfer<'a> {
/// to detect handler-style flows that have no registered caller.
pub auto_seed_handler_params: bool,
/// Cross-file callee bodies sourced from
/// [`GlobalSummaries::bodies_iter`]. Populated in pass 2 to enable
/// [`GlobalSummaries`]. Populated in pass 2 to enable
/// context-sensitive inline re-analysis across file boundaries the
/// same way `callee_bodies` enables it intra-file. `None` preserves
/// non-cross-file behaviour for unit tests and non-cross-file

View file

@ -366,7 +366,7 @@ pub struct SsaTaintState {
/// = false`).
pub abstract_state: Option<AbstractState>,
/// Per-heap-field taint cells, keyed by
/// `(parent_loc, field)`. Sorted by [`FieldTaintKey`] for O(n)
/// `(parent_loc, field)`. Sorted by `FieldTaintKey` for O(n)
/// merge-join. Populated only when the body's
/// [`crate::pointer::PointsToFacts`] is available
/// (`NYX_POINTER_ANALYSIS=1`); empty otherwise so the lattice join
@ -375,7 +375,7 @@ pub struct SsaTaintState {
/// them. Cross-call propagation lands during lowering via the
/// field-granularity `PointsToSummary`.
///
/// Cell shape: [`FieldCell`] carries `taint` plus
/// Cell shape: `FieldCell` carries `taint` plus
/// `validated_must` / `validated_may` flags so validation flows
/// through abstract field / element identity.
pub field_taint: SmallVec<[(FieldTaintKey, FieldCell); 4]>,
@ -405,7 +405,7 @@ impl SsaTaintState {
/// Read the field cell at `key`. Returns `None`
/// when no cell has been recorded (caller should treat as
/// untainted). O(log n) on the sorted [`field_taint`] list.
/// untainted). O(log n) on the sorted `field_taint` list.
pub fn get_field(&self, key: FieldTaintKey) -> Option<&FieldCell> {
self.field_taint
.binary_search_by_key(&key, |(k, _)| *k)

View file

@ -33,7 +33,7 @@ const MAX_PROBE_PARAMS: usize = 8;
/// Extract a precise per-parameter `SsaFuncSummary` from an already-lowered SSA body.
///
/// For each parameter (up to [`MAX_PROBE_PARAMS`]), runs a taint probe by seeding
/// For each parameter (up to `MAX_PROBE_PARAMS`), runs a taint probe by seeding
/// that parameter with `Cap::all()` via `global_seed` and observing what caps
/// survive to return positions and which sinks fire. A final probe with no params
/// tainted detects intrinsic source caps.

View file

@ -713,6 +713,22 @@ fn builtin_profile(name: &str) -> Option<ScanProfile> {
})
}
/// Top-level scanner configuration.
///
/// Loaded from `nyx.conf` (TOML) via [`Config::load`], or constructed in
/// code for embedded use. [`Config::default`] gives conservative defaults:
/// no symlink following, no hidden files, gitignore respected, 10 s parse
/// timeout, all analysis passes on.
///
/// Config sections mirror `nyx.conf` sections:
/// - [`scanner`](Config::scanner): what files to scan, which analysis passes
/// to enable, severity floor
/// - [`output`](Config::output): format, ranking, LOW-finding budgets
/// - [`analysis`](Config::analysis): per-language rules, engine-pass toggles
/// - [`performance`](Config::performance): thread count, depth limit, batch
/// size
/// - [`database`](Config::database): incremental index settings
/// - [`detectors`](Config::detectors): per-detector sensitivity knobs
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(default)]
#[derive(Default)]

View file

@ -1,3 +1,16 @@
//! Shared utilities and configuration.
//!
//! Re-exports [`Config`], [`AnalysisOptions`], and [`DetectorOptions`] from
//! their submodules. [`Config`] is loaded from `nyx.conf` and passed through
//! the top-level call stack. [`AnalysisOptions`] is installed once per process
//! via an `OnceLock` and read back via [`analysis_options::current`] from deep
//! inside the analysis pipeline without threading it through every call frame.
//!
//! Other submodules: `path` (root-relative path utilities and traversal guards),
//! `project` (framework detection, project metadata), `query_cache` (cached
//! tree-sitter query compilation), `snippet` (source snippet extraction for
//! finding locations).
pub mod analysis_options;
pub mod config;
pub mod detector_options;

View file

@ -1,3 +1,13 @@
//! Filesystem walker with batched path delivery.
//!
//! Builds an [`ignore`]-crate [`WalkBuilder`] from the config (respecting
//! `.gitignore`, excluded directories, and excluded extensions), then delivers
//! discovered paths to the analysis pipeline in batches over a crossbeam channel.
//! Batching amortizes channel overhead for large trees.
//!
//! All paths are checked via [`crate::utils::path::path_stays_within_root`]
//! before entering a batch, preventing traversal outside the scan root.
use crate::utils::Config;
use crate::utils::path::path_stays_within_root;
use crossbeam_channel::{Receiver, Sender, bounded};