Added experimental control flow analysis and syntax classification for rust lang (#22)

* Introduce control flow graph (CFG) support:

- Added `cfg.rs` with CFG generation and analysis utilities.
- Integrated `petgraph` library for graph-based computations.
- Updated `ast.rs` to utilize CFG for function analysis.
- Modified `Cargo.toml` and `Cargo.lock` to include new dependencies.
- Improved static analysis with taint tracking through CFG paths.

* feat: enhance control flow analysis with taint tracking and node labeling

* feat: improve control flow graph with enhanced node handling and new tests

* Remove unnecessary reference marker in `byte_offset_to_point` comment.

* Remove unnecessary reference marker in `byte_offset_to_point` comment.

* Refactor `ast.rs` for performance and clarity; enhance `cfg.rs` with recursive CFG generation and improved classification logic for AST analysis.

* Refactor CFG and taint tracking logic:

- Enhanced `cfg.rs` with inline helper function `text_of` for cleaner UTF-8 handling in AST nodes.
- Expanded `labels.rs` rules with detailed `Sources`, `Sanitizers`, and `Sinks` for improved classification.
- Refined `push_node` to handle method call expressions with object-function pairing.
- Simplified code handling in trivia skipping and debug-only logic.

* Enhance `cfg.rs` with `first_call_ident` helper and improve identifier extraction logic in `push_node`.

* Add targeted CFG taint-tracking tests to enhance analysis coverage.

* Enhance CFG generation with loop expression handling and improve taint tracking logic. Add new sanitization example in `examples/sanitize/example.rs`.

* Update README with installation instructions for Cargo and GitHub releases.

* Expand taint-tracking with precise `def-use` computation and enhance `labels.rs` for detailed classification. Extend `examples/sanitize` with realistic scenarios demonstrating new rules.

* Refactor `labels.rs`:

- Removed redundant `LabelRule` entries for cleaner rule definitions.
- Adjusted matching logic to prioritize suffix and prefix matches effectively.

* Refactor `labels.rs`:

- Removed redundant `LabelRule` entries for cleaner rule definitions.
- Adjusted matching logic to prioritize suffix and prefix matches effectively.

* Add test for taint tracking with multiple sources in `cfg.rs`.

* Add `function_summaries` table and implement summary upsert/load methods. Refactor to handle summary storage and retrieval efficiently, with placeholder clean/drop logic.

* refactor: split `labels.rs` into modular structure with language-specific files

* refactor: split `labels.rs` into modular structure with language-specific files

* refactor: clean up SQL table definitions in `database.rs` for better readability

* refactor: simplify CFG structure by removing lifetime parameters and enhancing taint metadata handling

* refactor: update TODO comments in `cfg.rs` to clarify future enhancements for cap labels and function details

* refactor: remove redundant header from README.md for improved clarity

* feat: add PHF-based syntax classifiers and Kind enum for efficient syntax mapping across languages

* feat: introduce analysis modes for enhanced scanner configuration and diagnostics

* feat: define Kind enum for syntax classification in control flow analysis

* feat: bump version to 0.2.0-alpha and update CHANGELOG for new features and fixes

* refactor: clean up imports and formatting in AST and CFG modules for improved readability

* refactor: simplify function signatures and improve code readability in CFG and module files

* fix: correct rayon_thread_stack_size comment to reflect actual value of 8 MiB

* refactor: update string formatting in clean and project modules for consistency

* refactor: fix indentation in clean.rs for improved readability

---------

Co-authored-by: elipeter <eli.peter@es.fcm.travel>
This commit is contained in:
Eli Peter 2025-06-28 17:36:14 +02:00 committed by GitHub
parent fd65360818
commit 3c21efba75
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 1585 additions and 79 deletions

View file

@ -1,5 +1,8 @@
use crate::cfg::{analyse_function, build_cfg};
use crate::commands::scan::Diag;
use crate::errors::{NyxError, NyxResult};
use crate::patterns::Severity;
use crate::utils::config::AnalysisMode;
use crate::utils::ext::lowercase_ext;
use crate::utils::{Config, query_cache};
use std::cell::RefCell;
@ -10,6 +13,16 @@ thread_local! {
static PARSER: RefCell<tree_sitter::Parser> = RefCell::new(tree_sitter::Parser::new());
}
/// Convenience alias for node indices.
fn byte_offset_to_point(tree: &tree_sitter::Tree, byte: usize) -> tree_sitter::Point {
// `descendant_for_byte_range` gives us *some* node that starts at `byte`,
// `start_position` turns that into rows & columns (both 0-based)
tree.root_node()
.descendant_for_byte_range(byte, byte)
.map(|n| n.start_position())
.unwrap_or_else(|| tree_sitter::Point { row: 0, column: 0 })
}
pub(crate) fn run_rules_on_file(path: &Path, cfg: &Config) -> NyxResult<Vec<Diag>> {
tracing::debug!("Running rules on: {}", path.display());
let bytes = std::fs::read(path)?;
@ -47,30 +60,58 @@ pub(crate) fn run_rules_on_file(path: &Path, cfg: &Config) -> NyxResult<Vec<Diag
.ok_or_else(|| NyxError::Other("tree-sitter failed".into()))
})?;
let root = _tree.root_node();
let compiled = query_cache::for_lang(lang_slug, ts_lang);
let mut cursor = QueryCursor::new();
let mut out = Vec::new();
for cq in compiled.iter() {
if cfg.scanner.min_severity <= cq.meta.severity {
continue;
if cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Taint {
tracing::debug!("Running taint analysis on: {}", path.display());
let (cfg_graph, entry) = build_cfg(&_tree, &bytes, lang_slug);
for p in analyse_function(&cfg_graph, entry) {
let src_byte = cfg_graph[p.first().copied().unwrap()].span.0;
let point = byte_offset_to_point(&_tree, src_byte);
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: point.row + 1,
col: point.column + 1,
severity: Severity::High,
id: "taint-unsanitised-flow".into(),
});
}
let mut matches = cursor.matches(&cq.query, root, &*bytes);
while let Some(m) = matches.next() {
if let Some(cap) = m.captures.iter().find(|c| c.index == 0) {
let point = cap.node.start_position();
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: point.row + 1,
col: point.column + 1,
severity: cq.meta.severity,
id: cq.meta.id.to_owned(),
});
}
if cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Ast {
let root = _tree.root_node();
let compiled = query_cache::for_lang(lang_slug, ts_lang);
let mut cursor = QueryCursor::new();
for cq in compiled.iter() {
if cfg.scanner.min_severity <= cq.meta.severity {
continue;
}
let mut matches = cursor.matches(&cq.query, root, &*bytes);
while let Some(m) = matches.next() {
if let Some(cap) = m.captures.iter().find(|c| c.index == 0) {
let point = cap.node.start_position();
out.push(Diag {
path: path.to_string_lossy().into_owned(),
line: point.row + 1,
col: point.column + 1,
severity: cq.meta.severity,
id: cq.meta.id.to_owned(),
});
}
}
}
}
// Check to ensure no duplicates (DOUBLE-CHECK EFFICIENCY)
out.sort_by(|a, b| (a.line, a.col, &a.id, a.severity).cmp(&(b.line, b.col, &b.id, b.severity)));
out.dedup_by(|a, b| {
a.line == b.line && a.col == b.col && a.id == b.id && a.severity == b.severity
});
Ok(out)
}