Performance and precision pass (#64)

This commit is contained in:
Eli Peter 2026-05-04 19:58:04 -04:00 committed by GitHub
parent c7c5e0f3a1
commit fb698d2c27
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
97 changed files with 9932 additions and 517 deletions

View file

@ -40,7 +40,7 @@ use crate::utils::ext::lowercase_ext;
use crate::utils::{Config, query_cache};
use petgraph::graph::NodeIndex;
use std::borrow::Cow;
use std::cell::RefCell;
use std::cell::{OnceCell, RefCell};
use std::collections::{HashMap, HashSet};
use std::ops::ControlFlow;
use std::path::Path;
@ -972,6 +972,27 @@ impl<'a> ParsedSource<'a> {
{
continue;
}
// Layer C2: PHP `Serializable::unserialize($input)` magic
// method body — `public function unserialize($x) { ...
// unserialize($x) ... }`. This is the legacy
// `Serializable` interface contract (deprecated since PHP
// 8.1). PHP itself invokes the method when restoring an
// instance, so the body's `\unserialize($x)` call cannot
// be removed without breaking the interface. The
// actionable signal is at the class level (the class
// implements Serializable — fix is to migrate to
// `__serialize` / `__unserialize`), not at this call
// site. Genuine deserialization sinks (free-function
// `unserialize($_GET[..])`, helpers reading from session
// / cache, etc.) keep firing because they are not inside
// a method declaration named `unserialize` with a single
// formal parameter passed straight to the call.
if cq.meta.id == "php.deser.unserialize"
&& self.lang_slug == "php"
&& is_php_unserialize_magic_method_passthrough(cap.node, self.bytes)
{
continue;
}
// Layer D: C/C++ buffer-overflow pattern rules
// (`{c,cpp}.memory.strcpy`, `strcat`, `sprintf`) fire
// syntactically on every call regardless of argument
@ -1102,6 +1123,13 @@ struct ParsedFile<'a> {
file_cfg: FileCfg,
lang_rules: LangAnalysisRules,
has_lang_rules: bool,
/// Per-body SSA + const-prop + type-fact cache, lazily populated on first
/// request and indexed by `BodyId.0`. Was being recomputed 2-3× per body
/// across `run_cfg_analyses_with_lowered` (cfg analyses + state analyses)
/// and `run_auth_analyses` (`collect_file_var_types`); on the gin profile
/// `build_body_const_facts` accounted for 13.6% of wall-clock and a
/// single-pass cache collapses that to ~4.5%.
body_const_facts_cache: OnceCell<Vec<Option<cfg_analysis::BodyConstFacts>>>,
}
impl<'a> ParsedFile<'a> {
@ -1153,9 +1181,33 @@ impl<'a> ParsedFile<'a> {
file_cfg,
lang_rules,
has_lang_rules,
body_const_facts_cache: OnceCell::new(),
}
}
/// Per-body const-fact cache, computed once on first request and shared
/// across every per-body iteration in this file's analysis. Indexed by
/// `BodyId.0` so callers can look up by body identity.
fn body_const_facts_all(&self) -> &[Option<cfg_analysis::BodyConstFacts>] {
self.body_const_facts_cache.get_or_init(|| {
let lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust);
self.file_cfg
.bodies
.iter()
.map(|b| cfg_analysis::build_body_const_facts(b, lang))
.collect()
})
}
/// Look up the cached const facts for a specific body.
fn body_const_facts(
&self,
body: &crate::cfg::BodyCfg,
) -> Option<&cfg_analysis::BodyConstFacts> {
let all = self.body_const_facts_all();
all.get(body.meta.id.0 as usize).and_then(|f| f.as_ref())
}
/// The top-level body's CFG graph (for backward-compatible access).
fn cfg_graph(&self) -> &Cfg {
&self.file_cfg.toplevel().graph
@ -1468,7 +1520,7 @@ impl<'a> ParsedFile<'a> {
.filter(|f| f.body_id == body.meta.id)
.cloned()
.collect();
let body_const_facts = cfg_analysis::build_body_const_facts(body, caller_lang);
let body_const_facts = self.body_const_facts(body);
let cfg_ctx = cfg_analysis::AnalysisContext {
cfg: &body.graph,
entry: body.entry,
@ -1481,8 +1533,8 @@ impl<'a> ParsedFile<'a> {
taint_findings: &body_taint,
analysis_rules: self.rules_ref(),
taint_active,
body_const_facts: body_const_facts.as_ref(),
type_facts: body_const_facts.as_ref().map(|f| &f.type_facts),
body_const_facts,
type_facts: body_const_facts.map(|f| &f.type_facts),
auth_decorators: &body.meta.auth_decorators,
closure_released_var_names: Some(
closure_released_per_body
@ -1546,13 +1598,11 @@ impl<'a> ParsedFile<'a> {
// points-to facts so the proxy-acquire transfer can
// suppress SymbolId attribution on field-aliased
// receivers (e.g. `m := c.mu; m.Lock()`).
let body_pointer_hints = cfg_analysis::build_body_const_facts(body, caller_lang)
.as_ref()
.and_then(|f| {
f.pointer_facts
.as_ref()
.map(|pf| pf.name_proxy_hints(&f.ssa))
});
let body_pointer_hints = self.body_const_facts(body).and_then(|f| {
f.pointer_facts
.as_ref()
.map(|pf| pf.name_proxy_hints(&f.ssa))
});
let state_findings = state::run_state_analysis(
&body.graph,
body.entry,
@ -1666,12 +1716,11 @@ impl<'a> ParsedFile<'a> {
/// syntactic heuristics. Returns `None` when no body produces a
/// typed variable.
fn collect_file_var_types(&self) -> Option<auth_analysis::VarTypes> {
let caller_lang = Lang::from_slug(self.source.lang_slug).unwrap_or(Lang::Rust);
let mut merged: std::collections::HashMap<String, crate::ssa::type_facts::TypeKind> =
std::collections::HashMap::new();
let mut dropped: std::collections::HashSet<String> = std::collections::HashSet::new();
for body in &self.file_cfg.bodies {
let Some(facts) = cfg_analysis::build_body_const_facts(body, caller_lang) else {
let Some(facts) = self.body_const_facts(body) else {
continue;
};
for (idx, def) in facts.ssa.value_defs.iter().enumerate() {
@ -1792,6 +1841,7 @@ pub fn extract_auth_model_for_debug(
source.bytes,
source.path,
&rules,
None,
);
Ok(Some(model))
}
@ -2401,6 +2451,165 @@ fn is_php_unserialize_allowed_classes_restricted(
false
}
/// PHP-only: returns `true` when the captured `function_call_expression`
/// is the canonical `Serializable::unserialize($input)` magic-method
/// pass-through — i.e. the call is inside a `method_declaration` named
/// exactly `unserialize` (PHP method names are case-insensitive) with
/// one formal parameter, and the call's single argument is the bare
/// parameter variable.
///
/// **Why this is a non-actionable site for `php.deser.unserialize`:**
/// `Serializable::unserialize($input)` is an interface contract method
/// that PHP itself invokes when restoring an instance via the runtime
/// `\unserialize($bytes)` machinery. The implementation MUST decode
/// `$input` (the body's `\unserialize(...)` call) — there is no
/// "safer" rewrite that preserves the contract. The actionable signal
/// is at the class level (the class implements the deprecated
/// `Serializable` interface — fix is to migrate to `__serialize` /
/// `__unserialize`), not at this call site.
///
/// Conservative recognition:
/// - method must be a `method_declaration` (NOT a free `function_definition` —
/// the magic semantics only apply to instance methods)
/// - method name == `unserialize` (case-insensitive)
/// - exactly 1 formal parameter
/// - call has exactly 1 argument
/// - argument's inner expression is a `variable_name` whose name equals the
/// formal parameter's name
///
/// Genuine deserialization sinks (free `unserialize($_GET[...])`, helpers
/// reading from session/cache and passing through, etc.) keep firing
/// because they are not inside a method declaration named `unserialize`.
fn is_php_unserialize_magic_method_passthrough(cap_node: tree_sitter::Node, bytes: &[u8]) -> bool {
// The pattern captures `@n` (the function name); locate the enclosing
// function_call_expression.
let call_node = if cap_node.kind() == "function_call_expression" {
cap_node
} else {
let mut cur = cap_node;
let mut found = None;
for _ in 0..4 {
if cur.kind() == "function_call_expression" {
found = Some(cur);
break;
}
match cur.parent() {
Some(p) => cur = p,
None => break,
}
}
match found {
Some(c) => c,
None => return false,
}
};
// Walk up to the nearest method_declaration. Stop at any other
// function-introducing scope (free function, closure, arrow) — those
// are not the Serializable contract.
let mut cur = call_node;
let method = loop {
let Some(parent) = cur.parent() else {
return false;
};
match parent.kind() {
"method_declaration" => break parent,
"function_definition"
| "anonymous_function"
| "anonymous_function_creation_expression"
| "arrow_function"
| "program" => return false,
_ => {}
}
cur = parent;
};
// Method name must be exactly `unserialize` (case-insensitive).
let Some(name_node) = method
.child_by_field_name("name")
.or_else(|| find_named_child_of_kind(method, "name"))
else {
return false;
};
let Ok(method_name) = std::str::from_utf8(&bytes[name_node.byte_range()]) else {
return false;
};
if !method_name.eq_ignore_ascii_case("unserialize") {
return false;
}
// Method must have exactly 1 formal parameter; capture its bare name.
let Some(params) = method
.child_by_field_name("parameters")
.or_else(|| find_named_child_of_kind(method, "formal_parameters"))
else {
return false;
};
let mut formal_params: Vec<tree_sitter::Node> = Vec::new();
for i in 0..params.named_child_count() as u32 {
if let Some(p) = params.named_child(i)
&& matches!(
p.kind(),
"simple_parameter"
| "variadic_parameter"
| "property_promotion_parameter"
| "promoted_constructor_parameter"
)
{
formal_params.push(p);
}
}
if formal_params.len() != 1 {
return false;
}
let param = formal_params[0];
let var_node = param
.child_by_field_name("name")
.or_else(|| find_named_child_of_kind(param, "variable_name"));
let Some(var_node) = var_node else {
return false;
};
let inner_name_node = if var_node.kind() == "variable_name" {
var_node.named_child(0)
} else {
Some(var_node)
};
let Some(inner_name_node) = inner_name_node else {
return false;
};
let Ok(param_name) = std::str::from_utf8(&bytes[inner_name_node.byte_range()]) else {
return false;
};
// Call must have exactly 1 argument that is the bare parameter variable.
let Some(arg_list) = find_named_child_of_kind(call_node, "arguments") else {
return false;
};
let mut args: Vec<tree_sitter::Node> = Vec::new();
for i in 0..arg_list.named_child_count() as u32 {
if let Some(c) = arg_list.named_child(i)
&& c.kind() == "argument"
{
args.push(c);
}
}
if args.len() != 1 {
return false;
}
let inner = args[0].named_child(0);
let Some(inner) = inner else { return false };
if inner.kind() != "variable_name" {
return false;
}
let Some(arg_name_node) = inner.named_child(0) else {
return false;
};
let Ok(arg_name) = std::str::from_utf8(&bytes[arg_name_node.byte_range()]) else {
return false;
};
arg_name == param_name
}
/// C/C++-only Layer D: structural suppression of buffer-overflow pattern
/// rules when the source / format-string argument is a literal whose
/// contributed length is statically bounded.
@ -3999,6 +4208,15 @@ pub struct FusedResult {
crate::symbol::FuncKey,
auth_analysis::model::AuthCheckSummary,
)>,
/// Per-Python-file router-level dep declarations + `include_router`
/// edges for cross-file FastAPI router-dep propagation. `None` for
/// non-Python files; `Some((module_id, facts))` for Python files
/// where `module_id` is the file's
/// [`auth_analysis::router_facts::module_id_for_storage`] key.
/// Pass 1 collects these into
/// `GlobalSummaries.router_facts_by_module`; pass 2 resolves them
/// per-file via `GlobalSummaries::resolve_cross_file_router_deps`.
pub router_facts: Option<(String, auth_analysis::router_facts::PerFileRouterFacts)>,
}
/// Parse the file once, build the CFG once, and produce both function
@ -4034,6 +4252,7 @@ pub fn analyse_file_fused(
cfg_nodes: 0,
ssa_bodies: vec![],
auth_summaries: vec![],
router_facts: None,
});
};
@ -4081,6 +4300,28 @@ pub fn analyse_file_fused(
(vec![], vec![])
};
let mut auth_summaries: Vec<(
crate::symbol::FuncKey,
auth_analysis::model::AuthCheckSummary,
)> = Vec::new();
// Per-file router-dep facts for cross-file FastAPI propagation.
// Extracted unconditionally for Python files so pass 1 can persist
// them into `GlobalSummaries.router_facts_by_module` even on Cfg /
// Taint modes (the auth analysis itself runs only under Full, but
// the index has to be populated by the time pass 2 launches).
let router_facts_for_this_file = if parsed.source.lang_slug == "python" {
auth_analysis::router_facts::module_id_for_storage(parsed.source.path).map(|module_id| {
let facts = auth_analysis::router_facts::extract_router_facts_for_python(
&parsed.source.tree,
parsed.source.bytes,
);
(module_id, facts)
})
} else {
None
};
if cfg.scanner.mode == AnalysisMode::Full || cfg.scanner.mode == AnalysisMode::Ast {
let ast_findings = parsed.source.run_ast_queries(cfg);
// Layer B only applies when taint had the opportunity to evaluate
@ -4095,23 +4336,70 @@ pub fn analyse_file_fused(
} else {
out.extend(ast_findings);
}
out.extend(parsed.run_auth_analyses(cfg, global_summaries, scan_root));
// Build the AuthorizationModel exactly once per file when Full
// mode needs both diagnostics AND per-file summaries; pre-fix
// the diag path and the summary path each ran their own
// `extract::extract_authorization_model`, duplicating
// `collect_top_level_units` + every framework extractor's AST
// walk. See `auth_analysis::run_auth_analysis_with_model` for
// measured savings.
let auth_rules = auth_analysis::config::build_auth_rules(cfg, parsed.source.lang_slug);
if auth_rules.enabled {
// Resolve cross-file router-deps for the current file (Python only).
// The resolved map lives on `AuthorizationModel.cross_file_router_deps`
// BEFORE `FlaskExtractor::extract` runs, so the in-extractor merge
// sees both inline router-deps and the cross-file lift in one pass.
let cross_file_router_deps = if parsed.source.lang_slug == "python"
&& let Some(gs) = global_summaries
&& let Some(child_module_id) =
auth_analysis::router_facts::module_id_for_path(parsed.source.path)
{
let resolved = gs.resolve_cross_file_router_deps(&child_module_id);
if resolved.is_empty() {
None
} else {
Some(resolved)
}
} else {
None
};
let auth_model = auth_analysis::extract::extract_authorization_model(
parsed.source.lang_slug,
cfg.framework_ctx.as_ref(),
&parsed.source.tree,
parsed.source.bytes,
parsed.source.path,
&auth_rules,
cross_file_router_deps.as_ref(),
);
// Extract summaries from the **base** model (pre var-types,
// pre-helper-lifting) so the persisted per-file summary
// carries only the helper's own intrinsic auth checks,
// matching the legacy `extract_auth_summaries_by_key` path
// bit-for-bit.
if cfg.scanner.mode == AnalysisMode::Full {
auth_summaries = auth_analysis::extract_auth_summaries_from_model(
&auth_model,
parsed.source.lang_slug,
parsed.source.path,
scan_root,
);
}
let var_types = parsed.collect_file_var_types();
out.extend(auth_analysis::run_auth_analysis_with_model(
auth_model,
&parsed.source.tree,
parsed.source.lang_slug,
parsed.source.path,
&auth_rules,
var_types.as_ref(),
global_summaries,
scan_root,
));
}
}
parsed.source.finalize_diags(&mut out, cfg);
let auth_summaries = if cfg.scanner.mode == AnalysisMode::Full {
auth_analysis::extract_auth_summaries_by_key(
&parsed.source.tree,
parsed.source.bytes,
parsed.source.lang_slug,
parsed.source.path,
cfg,
scan_root,
)
} else {
Vec::new()
};
Ok(FusedResult {
summaries,
diags: out,
@ -4119,6 +4407,7 @@ pub fn analyse_file_fused(
cfg_nodes,
ssa_bodies,
auth_summaries,
router_facts: router_facts_for_this_file,
})
}
@ -4441,6 +4730,100 @@ fn php_unserialize_allowed_classes_recognises_safe_forms() {
);
}
#[test]
fn php_unserialize_magic_method_passthrough_recognises_serializable_contract() {
let mut parser = tree_sitter::Parser::new();
let lang = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP);
parser.set_language(&lang).unwrap();
let q = r#"(function_call_expression function: (name) @n (#eq? @n "unserialize")) @vuln"#;
// Canonical Serializable::unserialize delegating to __unserialize.
let code = b"<?php\nclass R {\n public function unserialize($serialized): void {\n $this->__unserialize(unserialize($serialized));\n }\n}\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
assert!(
is_php_unserialize_magic_method_passthrough(cap, code),
"Serializable::unserialize($x) → unserialize($x) should be suppressed"
);
// Multi-target list-destructuring assignment shape (Joomla Cli/Input).
let code = b"<?php\nclass C {\n public function unserialize($input) {\n [$this->a, $this->b] = unserialize($input);\n }\n}\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
assert!(
is_php_unserialize_magic_method_passthrough(cap, code),
"list-destructuring inside Serializable::unserialize should be suppressed"
);
// Case-insensitive method name (PHP semantics).
let code = b"<?php\nclass C { public function UnSerialize($d) { return unserialize($d); } }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
assert!(
is_php_unserialize_magic_method_passthrough(cap, code),
"method name should match case-insensitively (PHP)"
);
// Free function `unserialize` is NOT a magic method, must NOT be suppressed.
let code = b"<?php\nfunction load($d) { return unserialize($d); }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
assert!(
!is_php_unserialize_magic_method_passthrough(cap, code),
"free function should NOT be suppressed"
);
// Different method name, NOT a Serializable contract, must NOT be suppressed.
let code = b"<?php\nclass C { public function decode($d) { return unserialize($d); } }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
assert!(
!is_php_unserialize_magic_method_passthrough(cap, code),
"method named `decode` should NOT be suppressed"
);
// Method named `unserialize` but with TWO params, NOT the magic signature,
// must NOT be suppressed.
let code = b"<?php\nclass C { public function unserialize($d, $opts) { return unserialize($d, $opts); } }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
assert!(
!is_php_unserialize_magic_method_passthrough(cap, code),
"two-param method named unserialize should NOT be suppressed"
);
// Magic-method signature but the call argument is NOT the formal param —
// user is unserializing some other source. Must NOT be suppressed.
let code = b"<?php\nclass C { public function unserialize($input) { return unserialize($_GET['x']); } }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
assert!(
!is_php_unserialize_magic_method_passthrough(cap, code),
"non-pass-through arg inside magic method should NOT be suppressed"
);
// Wrapped argument (`unserialize(trim($input))`) is NOT a bare-param
// pass-through — keep firing. This shape covers cache/session
// pass-throughs that the rule should still surface.
let code = b"<?php\nclass C { public function unserialize($input) { return unserialize(trim($input)); } }\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
assert!(
!is_php_unserialize_magic_method_passthrough(cap, code),
"wrapped argument inside magic method should NOT be suppressed (conservative)"
);
// Anonymous function named-like context (defensive — anonymous_function
// is not a method_declaration).
let code = b"<?php\n$f = function($input) { return unserialize($input); };\n";
let tree = parser.parse(code, None).unwrap();
let cap = first_php_capture(&tree, code, q);
assert!(
!is_php_unserialize_magic_method_passthrough(cap, code),
"closure should NOT be suppressed"
);
}
#[test]
fn php_weak_hash_non_crypto_use_recognises_canonical_shapes() {
let mut parser = tree_sitter::Parser::new();