Performance and precision pass (#64)

This commit is contained in:
Eli Peter 2026-05-04 19:58:04 -04:00 committed by GitHub
parent c7c5e0f3a1
commit fb698d2c27
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
97 changed files with 9932 additions and 517 deletions

View file

@ -211,6 +211,41 @@ fn is_bounded_length_check(lower: &str) -> bool {
false
}
/// Convert an identifier into lowercase snake_case so camelCase, PascalCase
/// and SCREAMING spellings can be compared against snake-cased prefix lists
/// such as `is_safe`, `is_authorized` and `is_authenticated`.
///
/// A `_` is emitted in front of an ASCII uppercase letter when either:
/// - the previous character is an ASCII lowercase letter or digit
///   (`isSafe` → `is_safe`), or
/// - the previous character is ASCII uppercase and the following one is
///   ASCII lowercase, i.e. the letter closes an acronym
///   (`HTTPClient` → `http_client`).
///
/// The first character never receives a leading underscore, and input that
/// is already snake_case comes back unchanged (`is_safe` → `is_safe`).
/// Only ASCII letters are case-folded; every other character is copied
/// through as-is and never counts as a case boundary.
///
/// Used by `classify_condition` so a sanitiser predicate classifies the
/// same regardless of which identifier convention it was written in.
pub(crate) fn to_snake_lower(s: &str) -> String {
    let mut snake = String::with_capacity(s.len() + 4);
    // Character emitted on the previous iteration, in its original case.
    let mut before: Option<char> = None;
    let mut rest = s.chars().peekable();

    while let Some(ch) = rest.next() {
        if ch.is_ascii_uppercase() {
            if let Some(prev) = before {
                let after_lower_or_digit = prev.is_ascii_lowercase() || prev.is_ascii_digit();
                // Last capital of a run that is followed by a lowercase
                // letter starts a new word (`HTTPClient` splits before `C`).
                let closes_acronym = prev.is_ascii_uppercase()
                    && rest.peek().is_some_and(|after| after.is_ascii_lowercase());
                if (after_lower_or_digit || closes_acronym) && !snake.ends_with('_') {
                    snake.push('_');
                }
            }
        }
        snake.push(ch.to_ascii_lowercase());
        before = Some(ch);
    }

    snake
}
/// Parse a leading non-negative integer literal (decimal only).
fn parse_leading_uint(s: &str) -> Option<u64> {
let mut n: u64 = 0;
@ -384,13 +419,35 @@ pub fn classify_condition(text: &str) -> PredicateKind {
.unwrap_or(callee_part)
.trim();
// Derive a snake-cased form from the **original** text so that
// camelCase identifiers (`isSafeRemoteUrl`, `isAuthorized`,
// `isValidUUID`) classify against the snake-cased prefix list
// (`is_safe`, `is_authorized`, `is_authenticated`) the same as
// `is_safe_remote_url` would. Required to recognise CVE-2026-33486
// (roadiz/documents `isSafeRemoteUrl` SSRF sanitiser) as a
// ValidationCall on the patched fixture. Mirrors the trim/strip
// pipeline above on case-preserved text so the snake form lines up
// with `bare`.
let orig_trimmed = text.trim_start_matches(['(', '!', ' ', '\t']);
let orig_trimmed = orig_trimmed
.strip_prefix("not ")
.unwrap_or(orig_trimmed)
.trim();
let orig_callee_part = orig_trimmed.split('(').next().unwrap_or("");
let orig_bare = orig_callee_part
.rsplit(['.', ':'])
.next()
.unwrap_or(orig_callee_part)
.trim();
let bare_snake = to_snake_lower(orig_bare);
// Validation
if bare.contains("valid")
|| bare.contains("check")
|| bare.contains("verify")
|| bare.starts_with("is_safe")
|| bare.starts_with("is_authorized")
|| bare.starts_with("is_authenticated")
|| bare_snake.starts_with("is_safe")
|| bare_snake.starts_with("is_authorized")
|| bare_snake.starts_with("is_authenticated")
{
return PredicateKind::ValidationCall;
}
@ -734,8 +791,12 @@ fn extract_validation_target(text: &str) -> Option<String> {
// not corrupt the argument substring.
let first_arg = first_call_arg(args_part)?;
// Strip reference operators (e.g. `&x` → `x`)
// Strip reference operators (e.g. `&x` → `x`) and PHP variable sigil
// (`$url` → `url`) so the extracted target lines up with the var-name
// form used in branch-narrowing. Mirrors the `$` strip already done by
// `extract_allowlist_target` for `in_array($cmd, $allowed)`.
let first_arg = first_arg.strip_prefix('&').unwrap_or(first_arg).trim();
let first_arg = first_arg.strip_prefix('$').unwrap_or(first_arg);
if !first_arg.is_empty() && is_identifier(first_arg) {
Some(first_arg.to_string())
@ -991,6 +1052,63 @@ mod tests {
);
}
#[test]
fn classify_camelcase_safety_validators_are_validation_call() {
    // Each entry must classify as `ValidationCall`:
    // - `self::isSafeRemoteUrl($url)` is the real-CVE shape from
    //   roadiz/documents (CVE-2026-33486). Without snake-case
    //   normalisation the bare `issaferemoteurl` would miss the `is_safe`
    //   prefix and silently fall into `Comparison`/`Unknown`, leaving
    //   `$url` un-validated past the early-return.
    // - `isAuthorized` / `isAuthenticated` cover the other snake-cased
    //   prefixes.
    // - `isValidUUID` exercises acronym handling
    //   (`is_valid_uuid` contains "valid").
    // - `is_safe_remote_url` checks that snake_case input round-trips.
    let conditions = [
        "self::isSafeRemoteUrl($url)",
        "isAuthorized(user)",
        "isAuthenticated(req)",
        "isValidUUID(id)",
        "is_safe_remote_url(x)",
    ];

    for condition in conditions {
        assert_eq!(
            classify_condition(condition),
            PredicateKind::ValidationCall,
            "condition `{condition}` should classify as ValidationCall",
        );
    }
}
#[test]
fn extract_validation_target_strips_php_dollar_sigil() {
    // The PHP `$` sigil must be dropped so the extracted target matches the
    // var-name form used in branch narrowing. Required for the
    // CVE-2026-33486 patched fixture to silence on `fopen($url, 'r')`.
    let cases = [
        ("self::isSafeRemoteUrl($url)", "url"),
        ("validate($input)", "input"),
    ];

    for (call_text, expected_target) in cases {
        assert_eq!(
            extract_validation_target(call_text),
            Some(expected_target.to_string())
        );
    }
}
#[test]
fn to_snake_lower_handles_common_variants() {
    // (input, expected snake_case lowercase form)
    let cases = [
        ("isSafeRemoteUrl", "is_safe_remote_url"),
        ("isValidUUID", "is_valid_uuid"),
        ("HTTPClient", "http_client"),
        ("IsSafe", "is_safe"),
        ("is_safe", "is_safe"),
        ("validate", "validate"),
        ("", ""),
    ];

    for (input, expected) in cases {
        assert_eq!(to_snake_lower(input), expected);
    }
}
#[test]
fn classify_validation_requires_paren() {
// `x_valid == true` should NOT be ValidationCall, no `(` call syntax.

View file

@ -1523,6 +1523,121 @@ fn apply_input_validator_branch_narrowing(
}
}
/// Validator-callback narrowing for JS/TS array gating methods.
///
/// Handles calls shaped like `arr.filter(isSafeIdentifier)`,
/// `arr.find(isValidId)` and the `findLast` variant. The result of such a
/// call consists only of elements for which the callback returned truthy,
/// so when the callback's name is classified by
/// `classify_input_validator_callee` as `BooleanTrueIsValid` (`isValid…`,
/// `isSafe…`, `hasValid…` and snake-case variants) the result is treated
/// like validated taint for downstream sinks.
///
/// This is the call-site counterpart of the `if (isValidX(x)) use(x)`
/// narrowing in [`apply_input_validator_branch_narrowing`]: taint stops at
/// the filter/find gate instead of leaking through later
/// `Array[index]`/template/sink reads.
///
/// Strict-additive: when the language is not JavaScript/TypeScript, the
/// callee is not a dotted `filter`/`find`/`findLast` call, there is no
/// first argument, no callback name can be resolved, or the name is not a
/// recognised validator (anonymous arrow, opaque identifier, etc.), nothing
/// is modified and the default propagation result stands.
///
/// Motivated by CVE-2026-42353 (i18next-http-middleware path traversal),
/// whose patch is `languages.filter(utils.isSafeIdentifier)` applied before
/// `languages` is forwarded to the backend connector, and by the deferred
/// TS-side gap CVE-2026-25544 (Payload sqli).
///
/// Returns `true` when narrowing was applied (`return_bits` emptied,
/// `return_origins` cleared, and the result's variable marked validated when
/// it has an interned name), `false` when the helper was a no-op.
fn try_array_method_validator_callback_narrowing(
    inst: &SsaInst,
    info: &NodeInfo,
    callee: &str,
    args: &[SmallVec<[SsaValue; 2]>],
    return_bits: &mut Cap,
    return_origins: &mut SmallVec<[TaintOrigin; 2]>,
    state: &mut SsaTaintState,
    transfer: &SsaTaintTransfer,
    ssa: &SsaBody,
) -> bool {
    let is_js_family = matches!(transfer.lang, Lang::JavaScript | Lang::TypeScript);
    if !is_js_family {
        return false;
    }

    // Method-call shape: the callee text must contain a `.` and the segment
    // after the last one must be a gating array method. `findIndex`,
    // `every` and `some` are deliberately left out: they return an index or
    // a boolean rather than validated elements, so element-level validation
    // says nothing about their result.
    let Some((_, method)) = callee.rsplit_once('.') else {
        return false;
    };
    let is_gating_method = matches!(method, "filter" | "find" | "findLast");
    if !is_gating_method {
        return false;
    }

    // Resolve the callable name of the first positional argument through
    // two channels, in priority order:
    //   1. `info.arg_callees`, populated by `extract_arg_callees`
    //      (`call_ident_of` walks call shapes inside the argument). This
    //      covers `arr.filter(cb())` and dotted-callback shapes whose
    //      tree-sitter node kind reaches `Kind::CallFn` or
    //      `Kind::CallMethod`.
    //   2. The `var_name` recorded in `ssa.value_defs` for the argument's
    //      SSA values. This covers the bare-identifier shape
    //      (`arr.filter(cb)`), where `extract_arg_callees` pushes `None`
    //      because there is no call to recurse into. Patched CVE fixes use
    //      this shape, so it is the dominant source of validator callbacks.
    let Some(first_arg) = args.first() else {
        return false;
    };
    let name_from_callees = info.arg_callees.first().and_then(|slot| slot.as_deref());
    let name_from_ssa = first_arg.iter().find_map(|&value| {
        let def = ssa.value_defs.get(value.0 as usize)?;
        def.var_name.as_deref()
    });
    let Some(callback_name) = name_from_callees.or(name_from_ssa) else {
        return false;
    };

    let callback_is_validator =
        crate::ssa::type_facts::classify_input_validator_callee(callback_name)
            == Some(InputValidatorPolarity::BooleanTrueIsValid);
    if !callback_is_validator {
        return false;
    }

    // Drop every cap and origin from the return value: the returned array
    // (or single found element) holds only elements the recognised
    // validator approved. The validator's body is opaque at this layer, so
    // the helper cannot tell which caps it actually guards and removes all
    // of them; a future extension could narrow the stripped set by
    // inspecting the body's rejection patterns.
    return_origins.clear();
    *return_bits = Cap::empty();

    // Also mark the result's own variable name as validated, mirroring the
    // insertion done by [`apply_input_validator_branch_narrowing`]. This
    // helps direct same-name reads of a rebound array
    // (`arr = arr.filter(p)` then `arr.length`). It does not carry over to
    // differently-named bindings (`const lng = arr[0]`); those flows are
    // gated by the `return_bits` strip above.
    let result_symbol = ssa
        .value_defs
        .get(inst.value.0 as usize)
        .and_then(|def| def.var_name.as_deref())
        .and_then(|name| transfer.interner.get(name));
    if let Some(sym) = result_symbol {
        state.validated_must.insert(sym);
        state.validated_may.insert(sym);
    }

    true
}
/// Find the latest reaching SSA definition for `var_name` at the end of
/// `block`. Mirrors `crate::constraint::lower::resolve_single_var` but
/// avoids the cross-module privacy leak: callers in this module need it
@ -4081,6 +4196,24 @@ pub(super) fn transfer_inst(
}
}
// Receiver-side validator strip. Some method-call validators
// raise on failure rather than transforming a return value,
// so the canonical `Sanitizer` mechanism (which clears the
// return) is the wrong shape. After the call returns, the
// *receiver* (and any args carrying the same equivalence
// class) is proven to satisfy the validated property. Strip
// the registered cap from receiver+args here so that
// `path.relative_to(base)` clears `Cap::FILE_IO` from
// `path` for downstream uses. Motivated by CVE-2024-23334
// (aiohttp StaticResource symlink-bypass): the patched code
// calls `filepath.relative_to(self._directory)` inside a
// try/except and serves `filepath` afterwards.
if let Some(cap) =
crate::labels::lookup_receiver_validator(transfer.lang.as_str(), callee)
{
strip_cap_from_call_args(args, receiver, state, cap);
}
// Alias-aware sanitization: propagate through must-aliased field paths
if !sanitizer_bits.is_empty() {
if let Some(aliases) = transfer.base_aliases {
@ -4444,6 +4577,28 @@ pub(super) fn transfer_inst(
}
}
// JS/TS array-method validator-callback narrowing. When a
// call shape matches `<arr>.filter(<recognised-validator>)`
// (or `find` / `findLast`), strip the caps that flowed into
// `return_bits` from the receiver — the result holds only
// elements the validator approved. Strict-additive: the
// helper is a no-op when the callback name does not match
// the BooleanTrueIsValid bucket, leaving the default
// propagation result unchanged. See
// [`try_array_method_validator_callback_narrowing`] for the
// motivating CVE pair.
try_array_method_validator_callback_narrowing(
inst,
info,
callee,
args,
&mut return_bits,
&mut return_origins,
state,
transfer,
ssa,
);
// Constructor cap narrowing: a `new X(...)` call returns an object
// instance, not a string. Caps that name a string-shaped sink
// pattern (path argument, format string, URL component, JSON

View file

@ -6779,3 +6779,83 @@ const handler = (req, res) => {
"expected taint flow via double-call chain rebinding; got 0 findings",
);
}
/// Positive regression for CVE-2026-42353 (i18next-http-middleware).
///
/// The patched code passes a tainted array through
/// `arr.filter(isSafeIdentifier)` before forwarding it.
/// `try_array_method_validator_callback_narrowing` recognises the
/// `<arr>.filter(<recognised-validator>)` shape on JS/TS and strips the
/// receiver-derived caps from the call result, so the downstream
/// `arr[0]` → template-literal → `fs.readFileSync` chain must not be
/// reported. The callback here is a bare identifier, the dominant patched
/// shape: `extract_arg_callees` returns `None` for plain identifiers (no
/// inner call to recurse into), so the helper has to fall back to the SSA
/// value's `var_name` channel.
#[test]
fn cve_2026_42353_filter_isvalid_callback_strips_taint() {
    let fixture = br#"
const fs = require('fs');
function isSafeIdentifier(v) {
return typeof v === 'string' && v.indexOf('..') === -1 && v.indexOf('/') === -1;
}
function handler(req, res) {
let languages = req.query.lng ? req.query.lng.split(' ') : [];
languages = languages.filter(isSafeIdentifier);
const lng = languages[0];
const filename = `/locales/${lng}.json`;
fs.readFileSync(filename);
}
"#;
    let js_language = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE);
    let parsed = parse_lang(fixture, "javascript", js_language);
    let reported = analyse_file(
        &parsed,
        &parsed.summaries,
        None,
        Lang::JavaScript,
        "test.js",
        &[],
        None,
    );

    // The validator-gated flow must produce no findings at all.
    assert!(
        reported.is_empty(),
        "expected no taint flow when filtered through isSafeIdentifier; got {} findings",
        reported.len(),
    );
}
/// Negative regression for the array-method validator-callback gate.
///
/// Same handler shape as the positive CVE-2026-42353 test, but the filter
/// callback (`pickFirst`) is not a recognised validator name, so the path
/// traversal flow must stay alive end-to-end. This pins the precision
/// claim: the strip depends on the callback's identity and is not a
/// wholesale kill on every `<arr>.filter` call.
#[test]
fn cve_2026_42353_filter_without_validator_callback_preserves_taint() {
    let fixture = br#"
const fs = require('fs');
function pickFirst(v) { return true; }
function handler(req, res) {
let languages = req.query.lng ? req.query.lng.split(' ') : [];
languages = languages.filter(pickFirst);
const lng = languages[0];
const filename = `/locales/${lng}.json`;
fs.readFileSync(filename);
}
"#;
    let js_language = tree_sitter::Language::from(tree_sitter_javascript::LANGUAGE);
    let parsed = parse_lang(fixture, "javascript", js_language);
    let reported = analyse_file(
        &parsed,
        &parsed.summaries,
        None,
        Lang::JavaScript,
        "test.js",
        &[],
        None,
    );

    // At least one finding is required: the unrecognised callback must not
    // strip taint from the filtered array.
    assert!(
        !reported.is_empty(),
        "expected taint flow via filter(pickFirst) — pickFirst is not a recognised validator and must not strip taint; got 0 findings",
    );
}