docs: update inline references and improve XSS detection in Java servlet writers, refactor matchers for clarity and extend sanitizer support

2026-06-12 19:55:14 +02:00 · 2026-06-03 11:32:30 -05:00 · 2026-06-03 11:32:30 -05:00 · eb4332edb5
commit eb4332edb5
parent c2cd6f009e
56 changed files with 339 additions and 144 deletions
--- a/src/patterns/java.rs
+++ b/src/patterns/java.rs
@ -114,43 +114,72 @@ pub const PATTERNS: &[Pattern] = &[
        confidence: Confidence::Medium,
    },
    // ── Tier A: Weak crypto ────────────────────────────────────────────
+    //
+    // The `type:` node is matched with an alternation over a bare
+    // `(type_identifier)` and a `(scoped_type_identifier)` rather than only
+    // a bare `(type_identifier) (#eq? …)`, so the fully-qualified call
+    // shapes that dominate real code (and the entire OWASP Benchmark) are
+    // caught: `new java.util.Random()` parses the type as a
+    // `scoped_type_identifier`, not a `type_identifier`, which the old
+    // `#eq? @t "Random"` query silently never matched (0 crypto findings on
+    // the whole corpus).  The fix keeps the reliable `#eq?` and captures a
+    // direct `(type_identifier)` child of the `(scoped_type_identifier)`,
+    // so both `new Random()` and `new java.util.Random()` match while
+    // `SecureRandom` (a different whole segment) does not.
    Pattern {
        id: "java.crypto.insecure_random",
        description: "new Random() (java.util.Random) is not cryptographically secure",
        query: r#"(object_creation_expression
-                     type: (type_identifier) @t (#eq? @t "Random"))
+                     type: [
+                       (type_identifier) @t
+                       (scoped_type_identifier (type_identifier) @t)
+                     ]
+                     (#eq? @t "Random"))
                   @vuln"#,
        severity: Severity::Low,
        tier: PatternTier::A,
        category: PatternCategory::Crypto,
        confidence: Confidence::Medium,
    },
+    // Weak crypto algorithm passed to a `getInstance("…")` factory, keyed on
+    // the algorithm string so the qualifier (`javax.crypto.Cipher` /
+    // `java.security.MessageDigest` FQN or a bare class) does not matter — the
+    // old query pinned `object: (identifier)` to `"MessageDigest"` and so
+    // silently never matched the fully-qualified call shapes that dominate
+    // real code (0 crypto findings on the whole OWASP corpus).
+    // Three alternations, all proven to fire from this `(string_literal)`
+    // position:
+    //   * `^.des/` — single-DES *cipher transforms* (`"DES/CBC/PKCS5Padding"`).
+    //     The trailing `/` (mode separator) is required so the genuinely-weak
+    //     single-DES Cipher fires while a bare `KeyGenerator.getInstance("DES")`
+    //     key-spec and the stronger triple-DES `"DESede/…"` (which the OWASP
+    //     Benchmark labels benign) do NOT — `"DESe"` has no `/` after `des`.
+    //   * `^.(rc2|rc4|blowfish)` — broken stream/block ciphers (rare, real).
+    //   * `^.(md2|md4|md5|sha1|sha-1).$` — broken hash digests as the WHOLE
+    //     algorithm string (the trailing `.$` matches the closing quote so
+    //     `"SHA1PRNG"` / `"HmacSHA1"` / `"SHA-256"` do NOT match).
+    // The receiver is deliberately unchecked, so any `getInstance` call with
+    // one of these strings fires; strong transforms (`AES/CBC`, `AES/GCM`, `SHA-256`) miss.
    Pattern {
-        id: "java.crypto.weak_digest",
-        description: "MessageDigest.getInstance(\"MD5\"/\"SHA1\") uses a weak hash algorithm",
+        id: "java.crypto.weak_algorithm",
+        description: "Cipher/MessageDigest.getInstance with a broken algorithm (DES/RC4/MD5/SHA-1)",
        query: r#"(method_invocation
-                     object: (identifier) @c (#eq? @c "MessageDigest")
                     name: (identifier) @id (#eq? @id "getInstance")
                     arguments: (argument_list
-                       (string_literal) @alg (#match? @alg "(?i)(md5|sha-?1)")))
-                   @vuln"#,
-        severity: Severity::Low,
-        tier: PatternTier::A,
-        category: PatternCategory::Crypto,
-        confidence: Confidence::Medium,
-    },
-    // ── Tier A: XSS (servlet) ──────────────────────────────────────────
-    Pattern {
-        id: "java.xss.getwriter_print",
-        description: "response.getWriter().print/println writes output without encoding",
-        query: r#"(method_invocation
-                     object: (method_invocation
-                       name: (identifier) @gw (#eq? @gw "getWriter"))
-                     name: (identifier) @id (#match? @id "^(print|println|write)$"))
+                       (string_literal) @alg (#match? @alg "(?i)(^.des/|^.(rc2|rc4|blowfish)|^.(md2|md4|md5|sha1|sha-1).$)")))
                   @vuln"#,
        severity: Severity::Medium,
        tier: PatternTier::A,
-        category: PatternCategory::Xss,
-        confidence: Confidence::High,
+        category: PatternCategory::Crypto,
+        confidence: Confidence::Medium,
    },
+    // Tier A reflected-XSS was previously a bare syntactic match on every
+    // `response.getWriter().print/println/write(...)` regardless of whether the
+    // written value was attacker-controlled or already HTML-encoded.  On the
+    // OWASP Benchmark that fired ~4400 times at precision 0.05 (it flagged
+    // constant strings and `ESAPI.encoder().encodeForHTML(...)`-wrapped output
+    // identically to a raw tainted write).  Reflected XSS is now a taint sink
+    // (`Sink(Cap::HTML_ESCAPE)` on the servlet writer verbs in
+    // `labels/java.rs`), which fires only when an un-encoded tainted value
+    // reaches the writer, so the syntactic pattern is retired.
 ];