nyx/src/patterns/mod.rs

396 lines
12 KiB
Rust
Raw Normal View History

//! # AST Pattern Conventions
//!
//! Each language file exports a `PATTERNS` slice of [`Pattern`] structs.
//!
//! ## ID format
//!
//! `<lang>.<category>.<specific>` — e.g. `java.deser.readobject`, `py.cmdi.os_system`.
//!
//! Language prefixes: `rs`, `java`, `py`, `js`, `ts`, `c`, `cpp`, `go`, `php`, `rb`.
//!
//! ## Tiers
//!
//! * **Tier A** — structural presence is high-signal (e.g. `gets()`, `eval()`).
//! * **Tier B** — requires a heuristic guard in the query (e.g. SQL with concatenated
//! arg, format-string with variable first arg).
//!
//! ## Severity
//!
//! * **High** — command exec, deserialization, banned C functions.
//! * **Medium** — SQL concat, reflection, XSS sinks, casts.
//! * **Low** — weak crypto, insecure randomness, code-quality (`unwrap`/`expect`/`panic`).
//!
//! Note: the default `min_severity` filter skips Low patterns; they only appear when
//! the user explicitly lowers the threshold.
//!
//! ## No-duplicate rule
//!
//! If a vulnerability class is already detected by taint analysis (e.g. `eval` as a
//! sink, `system` as a sink), the AST pattern is still kept for `--ast-only` mode but
//! uses a distinct ID namespace (`js.code_exec.eval` vs `taint-unsanitised-flow`).
//! The dedup pass in `ast.rs` prevents exact-duplicate findings at the same location.
//!
//! ## Adding a new pattern
//!
//! 1. Pick the language file under `src/patterns/<lang>.rs`.
//! 2. Choose tier, category, severity per the rules above.
//! 3. Write the tree-sitter query — test with `cargo test --test pattern_tests`.
//! 4. Add a snippet to `tests/fixtures/patterns/<lang>/positive.<ext>`.
//! 5. Add the ID to the positive test assertion in `tests/pattern_tests.rs`.
pub mod c;
pub mod cpp;
mod go;
mod java;
pub mod javascript;
mod php;
mod python;
mod ruby;
pub mod rust;
pub mod typescript;
use crate::evidence::Confidence;
use console::style;
use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::fmt;
use std::str::FromStr;
#[derive(Debug, Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Serialize, Deserialize)]
pub enum Severity {
High,
Medium,
Low,
}
impl Severity {
/// Bracketed, colored, fixed-width tag for aligned console output.
///
/// Returns e.g. `"[HIGH] "` or `"[MEDIUM]"` — always 8 visible characters
/// so the column after the tag lines up regardless of severity.
#[allow(dead_code)] // public API for lib consumers
pub fn colored_tag(self) -> String {
// Visible widths: "[HIGH]" = 6, "[MEDIUM]" = 8, "[LOW]" = 5.
// Pad the *whole* tag to 8 visible chars (the longest, "[MEDIUM]").
let (label, styled_fn): (&str, fn(&str) -> String) = match self {
Severity::High => ("HIGH", |s| style(s).red().bold().to_string()),
Severity::Medium => ("MEDIUM", |s| style(s).color256(208).bold().to_string()),
Severity::Low => ("LOW", |s| style(s).color256(67).to_string()),
};
let bracket_len = label.len() + 2; // "[" + label + "]"
let pad = 8usize.saturating_sub(bracket_len);
format!("[{}]{:pad$}", styled_fn(label), "", pad = pad)
}
}
impl fmt::Display for Severity {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let styled = match *self {
Severity::High => style("HIGH").red().bold().to_string(),
Severity::Medium => style("MEDIUM").color256(208).bold().to_string(),
Severity::Low => style("LOW").color256(67).to_string(),
};
f.write_str(&styled)
}
}
impl Severity {
/// Textual value stored in SQLite.
pub fn as_db_str(self) -> &'static str {
match self {
Severity::High => "HIGH",
Severity::Medium => "MEDIUM",
Severity::Low => "LOW",
}
}
}
impl FromStr for Severity {
type Err = String;
fn from_str(input: &str) -> Result<Self, Self::Err> {
match input.trim().to_ascii_uppercase().as_str() {
"HIGH" => Ok(Severity::High),
"MEDIUM" | "MED" => Ok(Severity::Medium),
"LOW" => Ok(Severity::Low),
other => Err(format!("unknown severity: '{other}'")),
}
}
}
/// A parsed severity filter expression.
///
/// Supports three forms:
/// - Single level: `"HIGH"` — matches only that level
/// - Comma list: `"HIGH,MEDIUM"` — matches any listed level
/// - Threshold: `">=MEDIUM"` — matches that level and above
///
/// Parsing is case-insensitive and tolerates whitespace around tokens.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SeverityFilter {
/// Match findings at or above this level (High >= Medium >= Low).
AtLeast(Severity),
/// Match findings whose severity is in this exact set.
AnyOf(Vec<Severity>),
}
impl SeverityFilter {
/// Parse a severity filter expression.
///
/// Examples: `"HIGH"`, `"high,medium"`, `">=MEDIUM"`, `">= low"`.
pub fn parse(expr: &str) -> Result<Self, String> {
let trimmed = expr.trim();
if trimmed.is_empty() {
return Err("empty severity expression".into());
}
// Threshold form: >=LEVEL
if let Some(rest) = trimmed.strip_prefix(">=") {
let level: Severity = rest.parse()?;
return Ok(SeverityFilter::AtLeast(level));
}
// Comma-separated list (also handles single value)
let levels: Result<Vec<Severity>, String> = trimmed
.split(',')
.map(|tok| tok.trim().parse::<Severity>())
.collect();
let levels = levels?;
if levels.is_empty() {
return Err("empty severity expression".into());
}
// Optimise single-value list
if levels.len() == 1 {
return Ok(SeverityFilter::AnyOf(levels));
}
Ok(SeverityFilter::AnyOf(levels))
}
/// Returns `true` if the given severity passes this filter.
pub fn matches(&self, sev: Severity) -> bool {
match self {
SeverityFilter::AtLeast(threshold) => {
// Severity ordering: High < Medium < Low (derived Ord).
// "at least Medium" means sev <= Medium in Ord terms.
sev <= *threshold
}
SeverityFilter::AnyOf(set) => set.contains(&sev),
}
}
}
/// Pattern confidence tier.
///
/// * **A** Structural presence alone is high-signal (e.g. `gets()`, `eval()`).
/// * **B** Requires a simple heuristic guard in the query (e.g. SQL with
/// concatenated arg, file-open with non-literal path).
#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub enum PatternTier {
A,
B,
}
/// High-level finding category for noise reduction and prioritization.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash, Serialize, Deserialize)]
pub enum FindingCategory {
Security,
Reliability,
Quality,
}
impl std::fmt::Display for FindingCategory {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
FindingCategory::Security => write!(f, "Security"),
FindingCategory::Reliability => write!(f, "Reliability"),
FindingCategory::Quality => write!(f, "Quality"),
}
}
}
/// Vulnerability class that a pattern detects.
#[derive(Debug, Copy, Clone, Eq, PartialEq, Serialize, Deserialize)]
pub enum PatternCategory {
CommandExec,
CodeExec,
Deserialization,
SqlInjection,
PathTraversal,
Xss,
Crypto,
Secrets,
InsecureTransport,
Reflection,
MemorySafety,
Prototype,
CodeQuality,
}
impl PatternCategory {
/// Map this vulnerability class to a high-level finding category.
pub fn finding_category(self) -> FindingCategory {
match self {
PatternCategory::CodeQuality => FindingCategory::Quality,
_ => FindingCategory::Security,
}
}
}
/// One AST pattern with a tree-sitter query and meta-data.
#[derive(Debug, Clone, Serialize, PartialEq)]
pub struct Pattern {
/// Unique identifier — `<lang>.<category>.<specific>` preferred.
pub id: &'static str,
/// Human-readable explanation.
pub description: &'static str,
/// tree-sitter query string.
pub query: &'static str,
/// Rough severity bucket.
pub severity: Severity,
/// Confidence tier (A = structural, B = heuristic-guarded).
pub tier: PatternTier,
/// Vulnerability class.
pub category: PatternCategory,
/// Confidence level for findings produced by this pattern.
pub confidence: Confidence,
}
/// Global, lazily-initialised registry: lang-name → pattern slice
static REGISTRY: Lazy<HashMap<&'static str, &'static [Pattern]>> = Lazy::new(|| {
let mut m = HashMap::new();
// ---- Rust ----
m.insert("rust", rust::PATTERNS);
// ---- TypeScript ----
m.insert("typescript", typescript::PATTERNS);
m.insert("ts", typescript::PATTERNS);
m.insert("tsx", typescript::PATTERNS);
// ---- JavaScript ----
m.insert("javascript", javascript::PATTERNS);
m.insert("js", javascript::PATTERNS);
// ---- C & C++ ----
m.insert("c", c::PATTERNS);
m.insert("cpp", cpp::PATTERNS);
m.insert("c++", cpp::PATTERNS);
Added experimental control flow analysis and syntax classification for rust lang (#22) * Introduce control flow graph (CFG) support: - Added `cfg.rs` with CFG generation and analysis utilities. - Integrated `petgraph` library for graph-based computations. - Updated `ast.rs` to utilize CFG for function analysis. - Modified `Cargo.toml` and `Cargo.lock` to include new dependencies. - Improved static analysis with taint tracking through CFG paths. * feat: enhance control flow analysis with taint tracking and node labeling * feat: improve control flow graph with enhanced node handling and new tests * Remove unnecessary reference marker in `byte_offset_to_point` comment. * Remove unnecessary reference marker in `byte_offset_to_point` comment. * Refactor `ast.rs` for performance and clarity; enhance `cfg.rs` with recursive CFG generation and improved classification logic for AST analysis. * Refactor CFG and taint tracking logic: - Enhanced `cfg.rs` with inline helper function `text_of` for cleaner UTF-8 handling in AST nodes. - Expanded `labels.rs` rules with detailed `Sources`, `Sanitizers`, and `Sinks` for improved classification. - Refined `push_node` to handle method call expressions with object-function pairing. - Simplified code handling in trivia skipping and debug-only logic. * Enhance `cfg.rs` with `first_call_ident` helper and improve identifier extraction logic in `push_node`. * Add targeted CFG taint-tracking tests to enhance analysis coverage. * Enhance CFG generation with loop expression handling and improve taint tracking logic. Add new sanitization example in `examples/sanitize/example.rs`. * Update README with installation instructions for Cargo and GitHub releases. * Expand taint-tracking with precise `def-use` computation and enhance `labels.rs` for detailed classification. Extend `examples/sanitize` with realistic scenarios demonstrating new rules. * Refactor `labels.rs`: - Removed redundant `LabelRule` entries for cleaner rule definitions. - Adjusted matching logic to prioritize suffix and prefix matches effectively. * Refactor `labels.rs`: - Removed redundant `LabelRule` entries for cleaner rule definitions. - Adjusted matching logic to prioritize suffix and prefix matches effectively. * Add test for taint tracking with multiple sources in `cfg.rs`. * Add `function_summaries` table and implement summary upsert/load methods. Refactor to handle summary storage and retrieval efficiently, with placeholder clean/drop logic. * refactor: split `labels.rs` into modular structure with language-specific files * refactor: split `labels.rs` into modular structure with language-specific files * refactor: clean up SQL table definitions in `database.rs` for better readability * refactor: simplify CFG structure by removing lifetime parameters and enhancing taint metadata handling * refactor: update TODO comments in `cfg.rs` to clarify future enhancements for cap labels and function details * refactor: remove redundant header from README.md for improved clarity * feat: add PHF-based syntax classifiers and Kind enum for efficient syntax mapping across languages * feat: introduce analysis modes for enhanced scanner configuration and diagnostics * feat: define Kind enum for syntax classification in control flow analysis * feat: bump version to 0.2.0-alpha and update CHANGELOG for new features and fixes * refactor: clean up imports and formatting in AST and CFG modules for improved readability * refactor: simplify function signatures and improve code readability in CFG and module files * fix: correct rayon_thread_stack_size comment to reflect actual value of 8 MiB * refactor: update string formatting in clean and project modules for consistency * refactor: fix indentation in clean.rs for improved readability --------- Co-authored-by: elipeter <eli.peter@es.fcm.travel>
2025-06-28 17:36:14 +02:00
// ---- Other patterns in the folder ----
m.insert("java", java::PATTERNS);
m.insert("go", go::PATTERNS);
m.insert("php", php::PATTERNS);
m.insert("python", python::PATTERNS);
m.insert("py", python::PATTERNS);
m.insert("ruby", ruby::PATTERNS);
m.insert("rb", ruby::PATTERNS);
Added experimental control flow analysis and syntax classification for rust lang (#22) * Introduce control flow graph (CFG) support: - Added `cfg.rs` with CFG generation and analysis utilities. - Integrated `petgraph` library for graph-based computations. - Updated `ast.rs` to utilize CFG for function analysis. - Modified `Cargo.toml` and `Cargo.lock` to include new dependencies. - Improved static analysis with taint tracking through CFG paths. * feat: enhance control flow analysis with taint tracking and node labeling * feat: improve control flow graph with enhanced node handling and new tests * Remove unnecessary reference marker in `byte_offset_to_point` comment. * Remove unnecessary reference marker in `byte_offset_to_point` comment. * Refactor `ast.rs` for performance and clarity; enhance `cfg.rs` with recursive CFG generation and improved classification logic for AST analysis. * Refactor CFG and taint tracking logic: - Enhanced `cfg.rs` with inline helper function `text_of` for cleaner UTF-8 handling in AST nodes. - Expanded `labels.rs` rules with detailed `Sources`, `Sanitizers`, and `Sinks` for improved classification. - Refined `push_node` to handle method call expressions with object-function pairing. - Simplified code handling in trivia skipping and debug-only logic. * Enhance `cfg.rs` with `first_call_ident` helper and improve identifier extraction logic in `push_node`. * Add targeted CFG taint-tracking tests to enhance analysis coverage. * Enhance CFG generation with loop expression handling and improve taint tracking logic. Add new sanitization example in `examples/sanitize/example.rs`. * Update README with installation instructions for Cargo and GitHub releases. * Expand taint-tracking with precise `def-use` computation and enhance `labels.rs` for detailed classification. Extend `examples/sanitize` with realistic scenarios demonstrating new rules. * Refactor `labels.rs`: - Removed redundant `LabelRule` entries for cleaner rule definitions. - Adjusted matching logic to prioritize suffix and prefix matches effectively. * Refactor `labels.rs`: - Removed redundant `LabelRule` entries for cleaner rule definitions. - Adjusted matching logic to prioritize suffix and prefix matches effectively. * Add test for taint tracking with multiple sources in `cfg.rs`. * Add `function_summaries` table and implement summary upsert/load methods. Refactor to handle summary storage and retrieval efficiently, with placeholder clean/drop logic. * refactor: split `labels.rs` into modular structure with language-specific files * refactor: split `labels.rs` into modular structure with language-specific files * refactor: clean up SQL table definitions in `database.rs` for better readability * refactor: simplify CFG structure by removing lifetime parameters and enhancing taint metadata handling * refactor: update TODO comments in `cfg.rs` to clarify future enhancements for cap labels and function details * refactor: remove redundant header from README.md for improved clarity * feat: add PHF-based syntax classifiers and Kind enum for efficient syntax mapping across languages * feat: introduce analysis modes for enhanced scanner configuration and diagnostics * feat: define Kind enum for syntax classification in control flow analysis * feat: bump version to 0.2.0-alpha and update CHANGELOG for new features and fixes * refactor: clean up imports and formatting in AST and CFG modules for improved readability * refactor: simplify function signatures and improve code readability in CFG and module files * fix: correct rayon_thread_stack_size comment to reflect actual value of 8 MiB * refactor: update string formatting in clean and project modules for consistency * refactor: fix indentation in clean.rs for improved readability --------- Co-authored-by: elipeter <eli.peter@es.fcm.travel>
2025-06-28 17:36:14 +02:00
tracing::debug!("AST-pattern registry initialised ({} patterns)", m.len());
m
});
/// Return all patterns for the requested language (case-insensitive).
///
Added experimental control flow analysis and syntax classification for rust lang (#22) * Introduce control flow graph (CFG) support: - Added `cfg.rs` with CFG generation and analysis utilities. - Integrated `petgraph` library for graph-based computations. - Updated `ast.rs` to utilize CFG for function analysis. - Modified `Cargo.toml` and `Cargo.lock` to include new dependencies. - Improved static analysis with taint tracking through CFG paths. * feat: enhance control flow analysis with taint tracking and node labeling * feat: improve control flow graph with enhanced node handling and new tests * Remove unnecessary reference marker in `byte_offset_to_point` comment. * Remove unnecessary reference marker in `byte_offset_to_point` comment. * Refactor `ast.rs` for performance and clarity; enhance `cfg.rs` with recursive CFG generation and improved classification logic for AST analysis. * Refactor CFG and taint tracking logic: - Enhanced `cfg.rs` with inline helper function `text_of` for cleaner UTF-8 handling in AST nodes. - Expanded `labels.rs` rules with detailed `Sources`, `Sanitizers`, and `Sinks` for improved classification. - Refined `push_node` to handle method call expressions with object-function pairing. - Simplified code handling in trivia skipping and debug-only logic. * Enhance `cfg.rs` with `first_call_ident` helper and improve identifier extraction logic in `push_node`. * Add targeted CFG taint-tracking tests to enhance analysis coverage. * Enhance CFG generation with loop expression handling and improve taint tracking logic. Add new sanitization example in `examples/sanitize/example.rs`. * Update README with installation instructions for Cargo and GitHub releases. * Expand taint-tracking with precise `def-use` computation and enhance `labels.rs` for detailed classification. Extend `examples/sanitize` with realistic scenarios demonstrating new rules. * Refactor `labels.rs`: - Removed redundant `LabelRule` entries for cleaner rule definitions. - Adjusted matching logic to prioritize suffix and prefix matches effectively. * Refactor `labels.rs`: - Removed redundant `LabelRule` entries for cleaner rule definitions. - Adjusted matching logic to prioritize suffix and prefix matches effectively. * Add test for taint tracking with multiple sources in `cfg.rs`. * Add `function_summaries` table and implement summary upsert/load methods. Refactor to handle summary storage and retrieval efficiently, with placeholder clean/drop logic. * refactor: split `labels.rs` into modular structure with language-specific files * refactor: split `labels.rs` into modular structure with language-specific files * refactor: clean up SQL table definitions in `database.rs` for better readability * refactor: simplify CFG structure by removing lifetime parameters and enhancing taint metadata handling * refactor: update TODO comments in `cfg.rs` to clarify future enhancements for cap labels and function details * refactor: remove redundant header from README.md for improved clarity * feat: add PHF-based syntax classifiers and Kind enum for efficient syntax mapping across languages * feat: introduce analysis modes for enhanced scanner configuration and diagnostics * feat: define Kind enum for syntax classification in control flow analysis * feat: bump version to 0.2.0-alpha and update CHANGELOG for new features and fixes * refactor: clean up imports and formatting in AST and CFG modules for improved readability * refactor: simplify function signatures and improve code readability in CFG and module files * fix: correct rayon_thread_stack_size comment to reflect actual value of 8 MiB * refactor: update string formatting in clean and project modules for consistency * refactor: fix indentation in clean.rs for improved readability --------- Co-authored-by: elipeter <eli.peter@es.fcm.travel>
2025-06-28 17:36:14 +02:00
/// Unknown patterns yield an **empty** `Vec`.
pub fn load(lang: &str) -> Vec<Pattern> {
let key = lang.to_ascii_lowercase();
REGISTRY.get(key.as_str()).copied().unwrap_or(&[]).to_vec()
}
#[test]
fn severity_as_db_str_roundtrip() {
for &s in &[Severity::High, Severity::Medium, Severity::Low] {
let db = s.as_db_str();
assert!(matches!(db, "HIGH" | "MEDIUM" | "LOW"));
assert_eq!(db.parse::<Severity>().unwrap(), s);
assert_eq!(db.to_lowercase().parse::<Severity>().unwrap(), s);
}
}
#[test]
fn severity_display_contains_uppercase_name() {
assert!(Severity::High.to_string().contains("HIGH"));
assert!(Severity::Medium.to_string().contains("MEDIUM"));
assert!(Severity::Low.to_string().contains("LOW"));
}
#[test]
fn load_returns_correct_pattern_slices() {
let rust = load("rust");
assert!(!rust.is_empty(), "Rust patterns should be loaded");
let ts = load("typescript");
let tsx = load("tsx");
assert_eq!(ts, tsx, "alias tsx must map to TypeScript patterns");
assert_eq!(load("RUST"), rust);
assert!(load("brainfuck").is_empty());
}
#[test]
fn severity_from_str_rejects_unknown() {
assert!("garbage".parse::<Severity>().is_err());
}
#[test]
fn severity_filter_single() {
let f = SeverityFilter::parse("HIGH").unwrap();
assert!(f.matches(Severity::High));
assert!(!f.matches(Severity::Medium));
assert!(!f.matches(Severity::Low));
}
#[test]
fn severity_filter_comma_list() {
let f = SeverityFilter::parse("HIGH,MEDIUM").unwrap();
assert!(f.matches(Severity::High));
assert!(f.matches(Severity::Medium));
assert!(!f.matches(Severity::Low));
}
#[test]
fn severity_filter_threshold() {
let f = SeverityFilter::parse(">=MEDIUM").unwrap();
assert!(f.matches(Severity::High));
assert!(f.matches(Severity::Medium));
assert!(!f.matches(Severity::Low));
let f2 = SeverityFilter::parse(">=LOW").unwrap();
assert!(f2.matches(Severity::High));
assert!(f2.matches(Severity::Medium));
assert!(f2.matches(Severity::Low));
let f3 = SeverityFilter::parse(">=HIGH").unwrap();
assert!(f3.matches(Severity::High));
assert!(!f3.matches(Severity::Medium));
}
#[test]
fn severity_filter_case_insensitive_and_whitespace() {
let f = SeverityFilter::parse(" high , medium ").unwrap();
assert!(f.matches(Severity::High));
assert!(f.matches(Severity::Medium));
assert!(!f.matches(Severity::Low));
let f2 = SeverityFilter::parse(">= medium").unwrap();
assert!(f2.matches(Severity::High));
assert!(f2.matches(Severity::Medium));
}
#[test]
fn severity_filter_rejects_empty() {
assert!(SeverityFilter::parse("").is_err());
assert!(SeverityFilter::parse(" ").is_err());
}
#[test]
fn severity_filter_rejects_invalid_level() {
assert!(SeverityFilter::parse("CRITICAL").is_err());
assert!(SeverityFilter::parse("HIGH,CRITICAL").is_err());
assert!(SeverityFilter::parse(">=BOGUS").is_err());
}