Python FP and docs updates (#58)

* refactor: Update comments for clarity and add expectations.json files for performance metrics
* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks
* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks
* refactor: Simplify code formatting for better readability in multiple files
* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration
* feat: Update Java and Python patterns to include new security rules
* refactor: Improve comment clarity and consistency across multiple Rust files
* refactor: Simplify code formatting for improved readability in integration tests and module files
* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
2026-06-09 19:45:13 +02:00 · 2026-04-29 19:53:34 -04:00 · 2026-04-29 19:53:34 -04:00 · a438886217
commit a438886217
parent 4db0805de6
291 changed files with 9485 additions and 3851 deletions
--- a/src/patterns/ejs.rs
+++ b/src/patterns/ejs.rs
@ -43,7 +43,7 @@ pub fn scan_ejs_file(path: &Path, bytes: &[u8]) -> Vec<Diag> {
            // Advance past this match for the next iteration.
            search_from = abs_end + 2; // skip "%>"

-            // Skip <%- include(...) %> — EJS partial inclusion, not user-controlled.
+            // Skip <%- include(...) %>: EJS partial inclusion, not user-controlled.
            if is_include_call(expr) {
                continue;
            }
--- a/src/patterns/java.rs
+++ b/src/patterns/java.rs
@ -12,7 +12,7 @@ pub const PATTERNS: &[Pattern] = &[
    Pattern {
        id: "java.deser.readobject",
        description: "ObjectInputStream.readObject() performs unsafe deserialization",
-        // Match any .readObject() call — the method name is specific enough.
+        // Match any .readObject() call; the method name is specific enough.
        query: r#"(method_invocation
                     name: (identifier) @id (#eq? @id "readObject"))
                   @vuln"#,
@ -21,6 +21,46 @@ pub const PATTERNS: &[Pattern] = &[
        category: PatternCategory::Deserialization,
        confidence: Confidence::High,
    },
+    // ── Tier A: SnakeYAML deserialization (CVE-2022-1471) ──────────────
+    // `new Yaml()` constructed without a `SafeConstructor` argument
+    // accepts arbitrary YAML tags (`!!javax.script.ScriptEngineManager`,
+    // `!!java.net.URLClassLoader`, …) and instantiates any class via
+    // reflection. SnakeYAML 2.0 swapped the default to SafeConstructor
+    // but pre-2.0 deployments stay vulnerable until call sites are
+    // patched. We match the empty-arg form `new Yaml()` only, so the
+    // explicit-SafeConstructor remediation form
+    // `new Yaml(new SafeConstructor(new LoaderOptions()))` is silent.
+    Pattern {
+        id: "java.deser.snakeyaml_unsafe_constructor",
+        description: "new Yaml() without SafeConstructor accepts arbitrary class tags (CVE-2022-1471)",
+        query: r#"(object_creation_expression
+                     type: (type_identifier) @t (#eq? @t "Yaml")
+                     arguments: (argument_list) @args (#eq? @args "()"))
+                   @vuln"#,
+        severity: Severity::High,
+        tier: PatternTier::A,
+        category: PatternCategory::Deserialization,
+        confidence: Confidence::High,
+    },
+    // ── Tier A: Apache Commons Text Text4Shell (CVE-2022-42889) ────────
+    // `StringSubstitutor.createInterpolator()` enables `script:`,
+    // `dns:`, and `url:` lookups by default; `${script:js:…}`
+    // evaluates JavaScript via the JSR-223 ScriptEngineManager. The
+    // factory call is itself the structural bug; the recommended app-
+    // side mitigation builds a `StringSubstitutor` directly with a
+    // restricted lookup map.
+    Pattern {
+        id: "java.code_exec.text4shell_interpolator",
+        description: "StringSubstitutor.createInterpolator() enables script:/dns:/url: evaluation (CVE-2022-42889)",
+        query: r#"(method_invocation
+                     object: (identifier) @c (#eq? @c "StringSubstitutor")
+                     name: (identifier) @id (#eq? @id "createInterpolator"))
+                   @vuln"#,
+        severity: Severity::High,
+        tier: PatternTier::A,
+        category: PatternCategory::CodeExec,
+        confidence: Confidence::High,
+    },
    // ── Tier A: Command execution ──────────────────────────────────────
    Pattern {
        id: "java.cmdi.runtime_exec",
--- a/src/patterns/mod.rs
+++ b/src/patterns/mod.rs
@ -1,42 +1,4 @@
-//! # AST Pattern Conventions
-//!
-//! Each language file exports a `PATTERNS` slice of [`Pattern`] structs.
-//!
-//! ## ID format
-//!
-//! `<lang>.<category>.<specific>` — e.g. `java.deser.readobject`, `py.cmdi.os_system`.
-//!
-//! Language prefixes: `rs`, `java`, `py`, `js`, `ts`, `c`, `cpp`, `go`, `php`, `rb`.
-//!
-//! ## Tiers
-//!
-//! * **Tier A** — structural presence is high-signal (e.g. `gets()`, `eval()`).
-//! * **Tier B** — requires a heuristic guard in the query (e.g. SQL with concatenated
-//!   arg, format-string with variable first arg).
-//!
-//! ## Severity
-//!
-//! * **High** — command exec, deserialization, banned C functions.
-//! * **Medium** — SQL concat, reflection, XSS sinks, casts.
-//! * **Low** — weak crypto, insecure randomness, code-quality (`unwrap`/`expect`/`panic`).
-//!
-//! Note: the default `min_severity` filter skips Low patterns; they only appear when
-//! the user explicitly lowers the threshold.
-//!
-//! ## No-duplicate rule
-//!
-//! If a vulnerability class is already detected by taint analysis (e.g. `eval` as a
-//! sink, `system` as a sink), the AST pattern is still kept for `--ast-only` mode but
-//! uses a distinct ID namespace (`js.code_exec.eval` vs `taint-unsanitised-flow`).
-//! The dedup pass in `ast.rs` prevents exact-duplicate findings at the same location.
-//!
-//! ## Adding a new pattern
-//!
-//! 1. Pick the language file under `src/patterns/<lang>.rs`.
-//! 2. Choose tier, category, severity per the rules above.
-//! 3. Write the tree-sitter query — test with `cargo test --test pattern_tests`.
-//! 4. Add a snippet to `tests/fixtures/patterns/<lang>/positive.<ext>`.
-//! 5. Add the ID to the positive test assertion in `tests/pattern_tests.rs`.
+#![doc = include_str!(concat!(env!("OUT_DIR"), "/patterns.md"))]

 pub mod c;
 pub mod cpp;
@ -68,7 +30,7 @@ pub enum Severity {
 impl Severity {
    /// Bracketed, colored, fixed-width tag for aligned console output.
    ///
-    /// Returns e.g. `"[HIGH]  "` or `"[MEDIUM]"` — always 8 visible characters
+    /// Returns e.g. `"[HIGH]  "` or `"[MEDIUM]"`, always 8 visible characters
    /// so the column after the tag lines up regardless of severity.
    #[allow(dead_code)] // public API for lib consumers
    pub fn colored_tag(self) -> String {
@ -123,9 +85,9 @@ impl FromStr for Severity {
 /// A parsed severity filter expression.
 ///
 /// Supports three forms:
-///   - Single level: `"HIGH"` — matches only that level
-///   - Comma list: `"HIGH,MEDIUM"` — matches any listed level
-///   - Threshold: `">=MEDIUM"` — matches that level and above
+///   - Single level: `"HIGH"`, matches only that level
+///   - Comma list: `"HIGH,MEDIUM"`, matches any listed level
+///   - Threshold: `">=MEDIUM"`, matches that level and above
 ///
 /// Parsing is case-insensitive and tolerates whitespace around tokens.
 #[derive(Debug, Clone, PartialEq, Eq)]
@ -242,7 +204,7 @@ impl PatternCategory {
 /// One AST pattern with a tree-sitter query and meta-data.
 #[derive(Debug, Clone, Serialize, PartialEq)]
 pub struct Pattern {
-    /// Unique identifier — `<lang>.<category>.<specific>` preferred.
+    /// Unique identifier, `<lang>.<category>.<specific>` preferred.
    pub id: &'static str,
    /// Human-readable explanation.
    pub description: &'static str,
--- a/src/patterns/python.rs
+++ b/src/patterns/python.rs
@ -5,7 +5,7 @@ use crate::patterns::{Pattern, PatternCategory, PatternTier, Severity};
 ///
 /// Taint rules cover `eval`/`exec`, `os.system`/`os.popen`/`subprocess.*`,
 /// and `cursor.execute`. AST patterns here add coverage for **deserialization**,
-/// **subprocess shell=True** (Tier B — taint doesn't check keyword args), and
+/// **subprocess shell=True** (Tier B; taint doesn't check keyword args), and
 /// **code execution** sinks that taint cannot structurally verify.
 pub const PATTERNS: &[Pattern] = &[
    // ── Tier A: Code execution ─────────────────────────────────────────
@ -121,14 +121,45 @@ pub const PATTERNS: &[Pattern] = &[
        confidence: Confidence::High,
    },
    // ── Tier B: SQL injection (format/concat heuristic) ────────────────
+    // Catches both `cursor.execute(query + user)` (binary_operator concat)
+    // and `cursor.execute(f"... {user} ...")` (f-string with interpolation).
+    // f-strings appear as a `string` node with `interpolation` children in
+    // tree-sitter-python; the alternation lets the same pattern cover both
+    // the historical % / + concat shapes and the modern f-string SQLi shape
+    // that surfaces in CVE-2025-24793 (snowflake-connector-python),
+    // CVE-2025-69662 (geopandas), and dozens of similar cursor.execute
+    // call sites across the corpus.
    Pattern {
        id: "py.sqli.execute_format",
-        description: "cursor.execute with string concatenation risks SQL injection",
+        description: "cursor.execute with string concatenation or f-string risks SQL injection",
        query: r#"(call
                     function: (attribute
                       attribute: (identifier) @fn (#eq? @fn "execute"))
                     arguments: (argument_list
-                       (binary_operator) @arg))
+                       [(binary_operator)
+                        (string (interpolation))] @arg))
+                   @vuln"#,
+        severity: Severity::Medium,
+        tier: PatternTier::B,
+        category: PatternCategory::SqlInjection,
+        confidence: Confidence::Medium,
+    },
+    // SQLAlchemy `text(<concat-or-fstring>)`: same Tier B heuristic
+    // applied to the SQLAlchemy raw-SQL constructor.  Catches the
+    // CVE-2025-69662 (geopandas) shape:
+    //   connection.execute(text(f"SELECT … '{geom_name}' …"))
+    // where the f-string interpolation is the injection point and the
+    // surrounding `connection.execute` would otherwise hide the unsafe
+    // construction from the simple execute_format pattern.
+    Pattern {
+        id: "py.sqli.text_format",
+        description: "sqlalchemy text() with f-string or string concat risks SQL injection",
+        query: r#"(call
+                     function: [(identifier) @fn (attribute attribute: (identifier) @fn)]
+                     (#eq? @fn "text")
+                     arguments: (argument_list
+                       [(binary_operator)
+                        (string (interpolation))] @arg))
                   @vuln"#,
        severity: Severity::Medium,
        tier: PatternTier::B,