Python FP and docs updates (#58)

* refactor: Update comments for clarity and add expectations.json files for performance metrics

* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks

* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks

* refactor: Simplify code formatting for better readability in multiple files

* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration

* feat: Update Java and Python patterns to include new security rules

* refactor: Improve comment clarity and consistency across multiple Rust files

* refactor: Simplify code formatting for improved readability in integration tests and module files

* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
Eli Peter 2026-04-29 19:53:34 -04:00 committed by GitHub
parent 4db0805de6
commit a438886217
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
291 changed files with 9485 additions and 3851 deletions

View file

@ -43,7 +43,7 @@ pub fn scan_ejs_file(path: &Path, bytes: &[u8]) -> Vec<Diag> {
// Advance past this match for the next iteration.
search_from = abs_end + 2; // skip "%>"
// Skip <%- include(...) %> EJS partial inclusion, not user-controlled.
// Skip <%- include(...) %>: EJS partial inclusion, not user-controlled.
if is_include_call(expr) {
continue;
}

View file

@ -12,7 +12,7 @@ pub const PATTERNS: &[Pattern] = &[
Pattern {
id: "java.deser.readobject",
description: "ObjectInputStream.readObject() performs unsafe deserialization",
// Match any .readObject() call the method name is specific enough.
// Match any .readObject() call; the method name is specific enough.
query: r#"(method_invocation
name: (identifier) @id (#eq? @id "readObject"))
@vuln"#,
@ -21,6 +21,46 @@ pub const PATTERNS: &[Pattern] = &[
category: PatternCategory::Deserialization,
confidence: Confidence::High,
},
// ── Tier A: SnakeYAML deserialization (CVE-2022-1471) ──────────────
// `new Yaml()` constructed without a `SafeConstructor` argument
// accepts arbitrary YAML tags (`!!javax.script.ScriptEngineManager`,
// `!!java.net.URLClassLoader`, …) and instantiates any class via
// reflection. SnakeYAML 2.0 swapped the default to SafeConstructor
// but pre-2.0 deployments stay vulnerable until call sites are
// patched. We match the empty-arg form `new Yaml()` only, so the
// explicit-SafeConstructor remediation form
// `new Yaml(new SafeConstructor(new LoaderOptions()))` is silent.
Pattern {
id: "java.deser.snakeyaml_unsafe_constructor",
description: "new Yaml() without SafeConstructor accepts arbitrary class tags (CVE-2022-1471)",
query: r#"(object_creation_expression
type: (type_identifier) @t (#eq? @t "Yaml")
arguments: (argument_list) @args (#eq? @args "()"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::Deserialization,
confidence: Confidence::High,
},
// ── Tier A: Apache Commons Text Text4Shell (CVE-2022-42889) ────────
// `StringSubstitutor.createInterpolator()` enables `script:`,
// `dns:`, and `url:` lookups by default; `${script:js:…}`
// evaluates JavaScript via the JSR-223 ScriptEngineManager. The
// factory call is itself the structural bug; the recommended app-
// side mitigation builds a `StringSubstitutor` directly with a
// restricted lookup map.
Pattern {
id: "java.code_exec.text4shell_interpolator",
description: "StringSubstitutor.createInterpolator() enables script:/dns:/url: evaluation (CVE-2022-42889)",
query: r#"(method_invocation
object: (identifier) @c (#eq? @c "StringSubstitutor")
name: (identifier) @id (#eq? @id "createInterpolator"))
@vuln"#,
severity: Severity::High,
tier: PatternTier::A,
category: PatternCategory::CodeExec,
confidence: Confidence::High,
},
// ── Tier A: Command execution ──────────────────────────────────────
Pattern {
id: "java.cmdi.runtime_exec",

View file

@ -1,42 +1,4 @@
//! # AST Pattern Conventions
//!
//! Each language file exports a `PATTERNS` slice of [`Pattern`] structs.
//!
//! ## ID format
//!
//! `<lang>.<category>.<specific>` — e.g. `java.deser.readobject`, `py.cmdi.os_system`.
//!
//! Language prefixes: `rs`, `java`, `py`, `js`, `ts`, `c`, `cpp`, `go`, `php`, `rb`.
//!
//! ## Tiers
//!
//! * **Tier A** — structural presence is high-signal (e.g. `gets()`, `eval()`).
//! * **Tier B** — requires a heuristic guard in the query (e.g. SQL with concatenated
//! arg, format-string with variable first arg).
//!
//! ## Severity
//!
//! * **High** — command exec, deserialization, banned C functions.
//! * **Medium** — SQL concat, reflection, XSS sinks, casts.
//! * **Low** — weak crypto, insecure randomness, code-quality (`unwrap`/`expect`/`panic`).
//!
//! Note: the default `min_severity` filter skips Low patterns; they only appear when
//! the user explicitly lowers the threshold.
//!
//! ## No-duplicate rule
//!
//! If a vulnerability class is already detected by taint analysis (e.g. `eval` as a
//! sink, `system` as a sink), the AST pattern is still kept for `--ast-only` mode but
//! uses a distinct ID namespace (`js.code_exec.eval` vs `taint-unsanitised-flow`).
//! The dedup pass in `ast.rs` prevents exact-duplicate findings at the same location.
//!
//! ## Adding a new pattern
//!
//! 1. Pick the language file under `src/patterns/<lang>.rs`.
//! 2. Choose tier, category, severity per the rules above.
//! 3. Write the tree-sitter query — test with `cargo test --test pattern_tests`.
//! 4. Add a snippet to `tests/fixtures/patterns/<lang>/positive.<ext>`.
//! 5. Add the ID to the positive test assertion in `tests/pattern_tests.rs`.
#![doc = include_str!(concat!(env!("OUT_DIR"), "/patterns.md"))]
pub mod c;
pub mod cpp;
@ -68,7 +30,7 @@ pub enum Severity {
impl Severity {
/// Bracketed, colored, fixed-width tag for aligned console output.
///
/// Returns e.g. `"[HIGH] "` or `"[MEDIUM]"` always 8 visible characters
/// Returns e.g. `"[HIGH] "` or `"[MEDIUM]"`, always 8 visible characters
/// so the column after the tag lines up regardless of severity.
#[allow(dead_code)] // public API for lib consumers
pub fn colored_tag(self) -> String {
@ -123,9 +85,9 @@ impl FromStr for Severity {
/// A parsed severity filter expression.
///
/// Supports three forms:
/// - Single level: `"HIGH"` matches only that level
/// - Comma list: `"HIGH,MEDIUM"` matches any listed level
/// - Threshold: `">=MEDIUM"` matches that level and above
/// - Single level: `"HIGH"`, matches only that level
/// - Comma list: `"HIGH,MEDIUM"`, matches any listed level
/// - Threshold: `">=MEDIUM"`, matches that level and above
///
/// Parsing is case-insensitive and tolerates whitespace around tokens.
#[derive(Debug, Clone, PartialEq, Eq)]
@ -242,7 +204,7 @@ impl PatternCategory {
/// One AST pattern with a tree-sitter query and meta-data.
#[derive(Debug, Clone, Serialize, PartialEq)]
pub struct Pattern {
/// Unique identifier `<lang>.<category>.<specific>` preferred.
/// Unique identifier, `<lang>.<category>.<specific>` preferred.
pub id: &'static str,
/// Human-readable explanation.
pub description: &'static str,

View file

@ -5,7 +5,7 @@ use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
///
/// Taint rules cover `eval`/`exec`, `os.system`/`os.popen`/`subprocess.*`,
/// and `cursor.execute`. AST patterns here add coverage for **deserialization**,
/// **subprocess shell=True** (Tier B taint doesn't check keyword args), and
/// **subprocess shell=True** (Tier B; taint doesn't check keyword args), and
/// **code execution** sinks that taint cannot structurally verify.
pub const PATTERNS: &[Pattern] = &[
// ── Tier A: Code execution ─────────────────────────────────────────
@ -121,14 +121,45 @@ pub const PATTERNS: &[Pattern] = &[
confidence: Confidence::High,
},
// ── Tier B: SQL injection (format/concat heuristic) ────────────────
// Catches both `cursor.execute(query + user)` (binary_operator concat)
// and `cursor.execute(f"... {user} ...")` (f-string with interpolation).
// f-strings appear as a `string` node with `interpolation` children in
// tree-sitter-python; the alternation lets the same pattern cover both
// the historical % / + concat shapes and the modern f-string SQLi shape
// that surfaces in CVE-2025-24793 (snowflake-connector-python),
// CVE-2025-69662 (geopandas), and dozens of similar cursor.execute
// call sites across the corpus.
Pattern {
id: "py.sqli.execute_format",
description: "cursor.execute with string concatenation risks SQL injection",
description: "cursor.execute with string concatenation or f-string risks SQL injection",
query: r#"(call
function: (attribute
attribute: (identifier) @fn (#eq? @fn "execute"))
arguments: (argument_list
(binary_operator) @arg))
[(binary_operator)
(string (interpolation))] @arg))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::B,
category: PatternCategory::SqlInjection,
confidence: Confidence::Medium,
},
// SQLAlchemy `text(<concat-or-fstring>)`: same Tier B heuristic
// applied to the SQLAlchemy raw-SQL constructor. Catches the
// CVE-2025-69662 (geopandas) shape:
// connection.execute(text(f"SELECT … '{geom_name}' …"))
// where the f-string interpolation is the injection point and the
// surrounding `connection.execute` would otherwise hide the unsafe
// construction from the simple execute_format pattern.
Pattern {
id: "py.sqli.text_format",
description: "sqlalchemy text() with f-string or string concat risks SQL injection",
query: r#"(call
function: [(identifier) @fn (attribute attribute: (identifier) @fn)]
(#eq? @fn "text")
arguments: (argument_list
[(binary_operator)
(string (interpolation))] @arg))
@vuln"#,
severity: Severity::Medium,
tier: PatternTier::B,