new capacity bits (#67)

This commit is contained in:
Eli Peter 2026-05-07 01:29:31 -04:00 committed by GitHub
parent afaffc0df6
commit 7d0e7320e2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
261 changed files with 10591 additions and 231 deletions

View file

@ -31,6 +31,8 @@ pub mod param_points_to;
pub mod pointsto;
pub mod static_map;
pub mod type_facts;
pub mod xml_config;
pub mod xpath_config;
#[allow(unused_imports)]
pub use ir::*;
@ -51,6 +53,20 @@ pub struct OptimizeResult {
pub const_values: HashMap<SsaValue, const_prop::ConstLattice>,
/// Type fact analysis results.
pub type_facts: type_facts::TypeFactResult,
/// XML-parser configuration facts: per-receiver SSA value
/// `secure_processing` / `disallow_doctype` / `external_entities`
/// flags carried forward from setter calls and constructor kwargs.
/// Consumed by the SSA taint engine to suppress XXE on parse-class
/// sinks whose receiver was provably hardened.
#[serde(default)]
pub xml_parser_config: xml_config::XmlParserConfigResult,
/// XPath-receiver configuration facts: per-receiver SSA value
/// `has_resolver` flag set by `setXPathVariableResolver` calls.
/// Consumed by the SSA taint engine to suppress XPATH_INJECTION on
/// `evaluate` / `compile` sinks whose receiver was provably bound
/// to a variable resolver (parameterised XPath shape).
#[serde(default)]
pub xpath_config: xpath_config::XPathConfigResult,
/// Base-variable alias groups from copy propagation.
pub alias_result: alias::BaseAliasResult,
/// Points-to analysis: per-SSA-value abstract heap object sets.
@ -100,6 +116,17 @@ pub fn optimize_ssa_with_param_types(
let type_facts =
type_facts::analyze_types_with_param_types(body, cfg, &cp.values, lang, param_types);
// 5b. XML-parser config analysis. Tracks per-receiver hardening
// flags so XXE sinks can be suppressed when the parser was provably
// configured for secure processing.
let xml_parser_config = xml_config::analyze_xml_parser_config(body, cfg, &cp.values, lang);
// 5c. XPath-receiver config analysis. Tracks per-receiver
// `has_resolver` flag so `XPath.evaluate(taintedExpr, ...)` sinks
// can be suppressed when the receiver was bound to an
// `XPathVariableResolver` (parameterised-XPath shape).
let xpath_config = xpath_config::analyze_xpath_config(body, cfg, lang);
// 6. Points-to analysis (uses allocation site detection + SSA def-use)
let points_to = heap::analyze_points_to(body, cfg, lang);
@ -113,6 +140,8 @@ pub fn optimize_ssa_with_param_types(
OptimizeResult {
const_values: cp.values,
type_facts,
xml_parser_config,
xpath_config,
alias_result,
points_to,
module_aliases,

View file

@ -52,12 +52,55 @@ pub enum TypeKind {
/// where openmrs / xwiki / keycloak Hibernate DAOs build queries
/// via `cb.createQuery(Foo.class)` + `Root` / `Predicate` API.
JpaCriteriaQuery,
/// An LDAP directory-service client / connection (`DirContext`,
/// `LdapTemplate`, `Net::LDAP`, `ldap3.Connection`, `ldap.createClient`,
/// `ldap.DialURL`, etc.). Distinct from `DatabaseConnection` so the
/// type-qualified `LdapClient.search` rule fires only on directory
/// search APIs rather than every DB receiver with a `search` method.
LdapClient,
/// An XPath query / evaluation client (`DOMXPath`, `XPath`,
/// `XPathExpression`, `lxml.etree.XPath`, etc.). Distinct from
/// `DatabaseConnection` so the type-qualified `XPathClient.query` /
/// `XPathClient.evaluate` rules fire only on XPath APIs rather than
/// every receiver with a generic `query` / `evaluate` method (avoids
/// collision with PHP `$pdo->query` SQL_QUERY sink).
XPathClient,
/// A pre-parsed template object whose `process` / `merge` /
/// `render` method renders bound data through an already-compiled
/// template body. The SSTI vector is when the template *source*
/// fed to the constructor / factory was attacker-influenced; the
/// render-time call site is the sink. Currently populated by
/// `new freemarker.template.Template(...)`; the type-qualified
/// resolver rewrites `tpl.process(...)` → `Template.process` so
/// the existing flat SSTI rule fires on idiomatic
/// `Template tpl = new Template(...); tpl.process(model, out)`
/// shapes.
Template,
/// An XML parser instance produced by a JAXP factory call
/// (`DocumentBuilderFactory.newDocumentBuilder()`,
/// `SAXParserFactory.newSAXParser()`, `XMLReaderFactory.createXMLReader()`).
/// `DOMXPath` and friends keep their own `XPathClient` tag. Used so
/// the type-qualified `XmlParser.parse` rule fires on instance-style
/// calls (`builder.parse(input)`) without needing a flat-rule
/// matcher per concrete subclass. Also gates the XXE config-fact
/// suppression: only XmlParser-typed receivers consult the
/// [`crate::ssa::xml_config::XmlParserConfigResult`] sidecar.
XmlParser,
/// A framework-injected DTO body whose field types are known.
/// Populated when a parameter is recognised as a typed extractor and
/// the DTO class / struct / Pydantic model is resolvable in scope.
/// Strictly additive, without a DTO definition, callers fall back
/// to name-only resolution.
Dto(DtoFields),
/// An object created with `Object.create(null)` — has no prototype
/// chain, so subscript-write keys cannot pollute `Object.prototype`.
/// Populated for JS/TS values whose constructor call is
/// `Object.create(null)`. The PROTOTYPE_POLLUTION suppression at the
/// synthetic `__index_set__` sink consults this fact (via SSA receiver
/// value) so the suppression is flow-sensitive: if a phi join leaves
/// the receiver only sometimes null-prototyped, the fact widens to
/// `Unknown` and the sink fires on the unsafe path.
NullPrototypeObject,
}
/// structural carrier for a recognised DTO type. Maps
@ -99,6 +142,10 @@ impl TypeKind {
Self::Url => Some("URL"),
Self::RequestBuilder => Some("RequestBuilder"),
Self::JpaCriteriaQuery => Some("JpaCriteriaQuery"),
Self::LdapClient => Some("LdapClient"),
Self::XPathClient => Some("XPathClient"),
Self::XmlParser => Some("XmlParser"),
Self::Template => Some("Template"),
_ => None,
}
}
@ -288,9 +335,11 @@ pub fn is_safe_query_object_arg(
/// authoritative, and consumers see Unknown instead of a wrong
/// type tag.
///
/// `_args` and `_consts` are kept on the signature so we can later
/// add arg-shape narrowing when class-literal lowering captures
/// `Foo.class` as an arg-use.
/// `_args` and `_consts` allow arg-shape narrowing when an arg's
/// constant value distinguishes overloads. Reserved for future Java
/// `createQuery(Foo.class)` shape (the `Object.create(null)` case is
/// driven by the `produces_null_proto` CFG flag instead, since a
/// literal `null` arg leaves no SSA value to inspect).
fn arg_aware_call_type(
lang: Lang,
callee: &str,
@ -392,6 +441,40 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
"createCriteriaUpdate" | "createCriteriaDelete" | "createTupleQuery" | "subquery" => {
Some(TypeKind::JpaCriteriaQuery)
}
// LDAP directory-service clients. `new InitialDirContext(env)` /
// `new InitialLdapContext(env, ctls)` instantiate the JNDI LDAP
// provider; `new LdapTemplate(...)` / `LdapTemplate.<init>` is the
// Spring LDAP wrapper. Both expose `search` / `searchByEntity`
// /`searchForObject` overloads where filter/DN strings are LDAP
// injection sinks.
"InitialDirContext" | "InitialLdapContext" | "LdapTemplate" => {
Some(TypeKind::LdapClient)
}
// JAXP factory-produced XML parser instances. Each is
// XXE-vulnerable by default until hardened with
// `setFeature(FEATURE_SECURE_PROCESSING, true)` (or
// disallow-doctype-decl, etc.). The
// [`crate::ssa::xml_config::XmlParserConfigResult`] sidecar
// suppresses the XXE bit at the type-qualified `XmlParser.parse`
// sink when the receiver carries a hardening fact.
"newDocumentBuilder" | "newSAXParser" | "getXMLReader" | "newXMLReader"
| "createXMLReader" => Some(TypeKind::XmlParser),
// `XPathFactory.newXPath()` returns a JAXP `XPath` instance.
// Mapping it to `XPathClient` lets the type-qualified resolver
// pick up `xpath.evaluate(...)` against the existing
// `XPathClient.evaluate` rule and lets the
// [`crate::ssa::xpath_config::XPathConfigResult`] sidecar
// suppress XPATH_INJECTION when the receiver was bound to an
// `XPathVariableResolver`.
"newXPath" => Some(TypeKind::XPathClient),
// Apache FreeMarker `new Template(name, reader, cfg)` /
// `cfg.getTemplate(name)`. The `Template` instance's
// `.process(model, out)` is an SSTI sink when the
// constructor source / template body came from tainted
// input. Type-qualified resolution rewrites
// `tpl.process(...)` → `Template.process` against the
// existing flat rule in `labels/java.rs`.
"Template" | "getTemplate" => Some(TypeKind::Template),
_ => None,
},
Lang::JavaScript | Lang::TypeScript => match suffix {
@ -409,6 +492,12 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
// `elementsMap.get(id)`, `origIdToDuplicateId.get(...)`,
// `groupIdMapForOperation.set(...)` shapes).
"Map" | "Set" | "WeakMap" | "WeakSet" | "Array" => Some(TypeKind::LocalCollection),
// ldapjs client factory: `ldap.createClient({ url: '…' })` returns
// a Client whose `search(base, opts, cb)` is an LDAP injection
// sink. Match the qualified callee text rather than the bare
// `createClient` suffix to avoid widening to unrelated factories
// with the same verb name.
"createClient" if callee.contains("ldap") => Some(TypeKind::LdapClient),
_ => None,
},
Lang::Python => {
@ -429,6 +518,15 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
} else if suffix == "open" && !callee.contains('.') {
// Bare `open()` is file I/O in Python
Some(TypeKind::FileHandle)
} else if callee == "ldap.initialize"
|| callee == "ldap3.Connection"
|| callee.ends_with(".initialize") && callee.contains("ldap")
{
// python-ldap: `conn = ldap.initialize(url)` returns an
// LDAPObject whose `search_s` / `search_ext_s` methods are
// LDAP-injection sinks. ldap3: `Connection(server, ...)`
// returns a Connection with a `search()` method.
Some(TypeKind::LdapClient)
} else {
None
}
@ -442,6 +540,10 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
Some(TypeKind::FileHandle)
} else if callee.contains("url.") && suffix == "Parse" {
Some(TypeKind::Url)
} else if callee.contains("ldap.") && matches!(suffix, "Dial" | "DialURL" | "DialTLS") {
// go-ldap (`github.com/go-ldap/ldap/v3`): `conn, _ := ldap.DialURL(url)`
// returns `*ldap.Conn` whose `Search(req)` is an LDAP-injection sink.
Some(TypeKind::LdapClient)
} else {
None
}
@ -451,6 +553,10 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
"curl_init" => Some(TypeKind::HttpClient),
"fopen" => Some(TypeKind::FileHandle),
"SplFileObject" => Some(TypeKind::FileHandle),
// DOMXPath: `$xp = new DOMXPath($doc)`. `$xp->query($expr)` /
// `$xp->evaluate($expr)` are XPath-injection sinks; without a
// distinct TypeKind they collide with the bare `query` SQL sink.
"DOMXPath" => Some(TypeKind::XPathClient),
_ => None,
},
Lang::C => match suffix {
@ -524,6 +630,11 @@ pub(crate) fn constructor_type(lang: Lang, callee: &str) -> Option<TypeKind> {
Some(TypeKind::DatabaseConnection)
} else if after_colons.starts_with("File.") && matches!(suffix, "open" | "new") {
Some(TypeKind::FileHandle)
} else if callee.contains("Net::LDAP") && matches!(suffix, "new" | "open") {
// net-ldap gem: `Net::LDAP.new(host: ...)` / `Net::LDAP.open`
// returns a connection whose `search(base:, filter:)` accepts
// an attacker-influenceable filter expression.
Some(TypeKind::LdapClient)
} else {
None
}
@ -768,8 +879,7 @@ pub fn analyze_types(
/// Same as [`analyze_types`] but seeds [`SsaOp::Param`] values with
/// per-position [`TypeKind`] facts from `param_types` (parallel-vec to
/// the function's BodyMeta.params). An entry of `None` (or an out-of-
/// range index) leaves the value at the default Param fact (Unknown),
/// preserving the pre-Phase-3 behaviour.
/// range index) leaves the value at the default Param fact (Unknown).
pub fn analyze_types_with_param_types(
body: &SsaBody,
cfg: &Cfg,
@ -810,8 +920,7 @@ pub fn analyze_types_with_param_types(
SsaOp::Param { index } => {
// Seed from the function's BodyMeta.param_types when
// a TypeKind was recovered at CFG construction time.
// Out-of-range / None entries fall back to Unknown,
// matching the pre-Phase-3 behaviour.
// Out-of-range / None entries fall back to Unknown.
match param_types.get(*index).and_then(|t| t.clone()) {
Some(tk) => TypeFact::from_kind(tk),
None => TypeFact::unknown(),
@ -820,7 +929,19 @@ pub fn analyze_types_with_param_types(
SsaOp::SelfParam => TypeFact::from_kind(TypeKind::Object),
SsaOp::CatchParam => TypeFact::from_kind(TypeKind::Object),
SsaOp::Call { callee, args, .. } => {
if let Some(ty) = lang.and_then(|l| constructor_type(l, callee)) {
// CFG marks `Object.create(null)` (and future
// null-prototype constructors) at lowering time.
// Honour it ahead of generic constructor / arg-aware
// dispatch so the returned SsaValue carries
// `NullPrototypeObject` for prototype-pollution
// suppression.
let null_proto = cfg
.node_weight(inst.cfg_node)
.map(|ni| ni.call.produces_null_proto)
.unwrap_or(false);
if null_proto {
TypeFact::from_kind(TypeKind::NullPrototypeObject)
} else if let Some(ty) = lang.and_then(|l| constructor_type(l, callee)) {
TypeFact::from_kind(ty)
} else if let Some(ty) =
lang.and_then(|l| arg_aware_call_type(l, callee, args, consts))
@ -1667,7 +1788,7 @@ mod tests {
/// Param values seeded from `param_types` must surface
/// the right TypeKind for downstream sink suppression. An out-of-
/// range index falls back to Unknown (the pre-Phase-3 default).
/// range index falls back to Unknown.
#[test]
fn param_types_seed_param_value_facts() {
use crate::cfg::Cfg;
@ -1728,7 +1849,7 @@ mod tests {
// Index 99 is out of range → falls back to Unknown.
assert_eq!(result.get_type(SsaValue(1)), Some(&TypeKind::Unknown));
// Empty slice = pre-Phase-3 behaviour.
// Empty slice = type-unaware fallback (analyze_types path).
let result2 = analyze_types(&body, &cfg, &consts, Some(Lang::Java));
assert_eq!(result2.get_type(SsaValue(0)), Some(&TypeKind::Unknown));
}
@ -2364,7 +2485,7 @@ mod tests {
));
}
// ── JPA Criteria query suppression (Phase: real-repo openmrs FP) ───
// ── JPA Criteria query suppression (real-repo openmrs FP) ─────────
//
// These tests pin the `TypeKind::JpaCriteriaQuery` variant + the
// `is_safe_query_object_arg` predicate + the

614
src/ssa/xml_config.rs Normal file
View file

@ -0,0 +1,614 @@
//! Per-SSA-value XML-parser configuration tracking.
//!
//! Tracks "is this XML parser configured to disable external entities / DTD
//! resolution" facts on parser-receiver SSA values. When a parse-class sink
//! is reached and the receiver is provably configured for secure processing,
//! the XXE bit is stripped from the sink's cap mask.
//!
//! The pass is intentionally a small forward dataflow run alongside type-fact
//! analysis. It does NOT flow through the SSA taint engine's worklist. Phi
//! nodes propagate the meet of operand configs (a hardening flag is "set" only
//! when all reaching operands set it, while the unsafe `external_entities` flag
//! is set when any operand sets it), and copy assignments propagate the source
//! value's config. Recognised setter calls update the receiver's config in place;
//! identity-style transformer calls that produce a child parser (e.g.
//! `factory.newDocumentBuilder()`) inherit the receiver's config into the
//! result value.
use std::collections::HashMap;
use super::const_prop::ConstLattice;
use super::ir::*;
use crate::cfg::Cfg;
use crate::symbol::Lang;
use serde::{Deserialize, Serialize};
/// Hardening state recorded for one XML-parser receiver value.
///
/// Every flag starts out `false`, which means "nothing proven" (the parser
/// may be unsafe). The two hardening flags, `secure_processing` and
/// `disallow_doctype`, are raised when a recognised setter / option was
/// observed; [`XmlParserConfig::is_secure`] treats either one as sufficient.
///
/// `external_entities` has the opposite (unsafe) polarity: `true` records
/// an explicit opt-in to external-entity resolution (e.g.
/// `XMLParser(resolve_entities=True)`). While it is set the config is never
/// reported as secure, regardless of the hardening flags.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct XmlParserConfig {
    pub secure_processing: bool,
    pub disallow_doctype: bool,
    pub external_entities: bool,
}

impl XmlParserConfig {
    /// Whether this config proves the parser is hardened against XXE:
    /// at least one hardening flag is set and no unsafe opt-in was seen.
    pub fn is_secure(&self) -> bool {
        if self.external_entities {
            // The explicit unsafe opt-in dominates any hardening flag.
            return false;
        }
        self.secure_processing || self.disallow_doctype
    }

    /// Join used at phi nodes, where the parser variable may come from
    /// several branches. Hardening flags are kept only if both sides have
    /// them; the unsafe flag is kept if either side has it, so a single
    /// unsafe branch contaminates the join.
    fn meet(&self, other: &Self) -> Self {
        let (lhs, rhs) = (*self, *other);
        Self {
            secure_processing: lhs.secure_processing && rhs.secure_processing,
            disallow_doctype: lhs.disallow_doctype && rhs.disallow_doctype,
            external_entities: lhs.external_entities || rhs.external_entities,
        }
    }

    /// Accumulation used when several setter calls (or a copy) feed the
    /// same value: every flag is OR-ed, so hardening flags add up and the
    /// unsafe flag, once set, stays set.
    fn union(&self, other: &Self) -> Self {
        let (lhs, rhs) = (*self, *other);
        Self {
            secure_processing: lhs.secure_processing || rhs.secure_processing,
            disallow_doctype: lhs.disallow_doctype || rhs.disallow_doctype,
            external_entities: lhs.external_entities || rhs.external_entities,
        }
    }
}
/// Output of the XML-parser config analysis: the config fact recorded for
/// each SSA value that received one. Values without an entry carry no fact.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct XmlParserConfigResult {
    pub configs: HashMap<SsaValue, XmlParserConfig>,
}

impl XmlParserConfigResult {
    /// Returns `true` only when `v` has a recorded config and that config
    /// reports itself as secure. A value with no entry yields `false`.
    pub fn is_secure(&self, v: SsaValue) -> bool {
        match self.configs.get(&v) {
            Some(config) => config.is_secure(),
            None => false,
        }
    }

    /// Returns `true` when `v` has a recorded config whose
    /// `external_entities` flag is set, i.e. the value was explicitly opted
    /// into external-entity resolution (e.g. lxml `resolve_entities=True`).
    pub fn is_unsafe_explicit(&self, v: SsaValue) -> bool {
        matches!(self.configs.get(&v), Some(config) if config.external_entities)
    }
}
/// Decide whether the `Cap::XXE` bit should be stripped from an XXE-class
/// sink because its receiver was provably hardened.
///
/// Returns `true` only when a receiver value is present and
/// [`XmlParserConfigResult::is_secure`] holds for it. Every other case is
/// answered conservatively with `false`:
/// * no receiver SSA value (free function) — safety cannot be proven;
/// * the receiver has no recorded config fact;
/// * the receiver's `external_entities` flag is set, which overrides any
///   hardening flag.
pub fn xxe_safe(receiver: Option<SsaValue>, xml_config: &XmlParserConfigResult) -> bool {
    receiver.is_some_and(|rv| xml_config.is_secure(rv))
}
/// Per-call analysis result: how a single call instruction changes the
/// parser-config facts. Produced by `classify_call` and applied by
/// `analyze_xml_parser_config`.
#[allow(dead_code)] // `SeedResult` is not constructed by `classify_call`; kept for future constructor-driven seeding
enum ConfigEffect {
    /// The call does not touch parser configuration.
    None,
    /// Update the call's receiver in place by OR-ing the supplied config
    /// into its current config. Used for setter calls
    /// (`factory.setFeature(FEATURE_SECURE_PROCESSING, true)`).
    UpdateReceiver(XmlParserConfig),
    /// Inherit the receiver's config into the call's result value.
    /// Used for factory-style calls whose result should carry the
    /// receiver's hardening state (`factory.newDocumentBuilder()`).
    InheritFromReceiver,
    /// Initialise the call's result value with the supplied config.
    /// Intended for constructor calls whose options reveal the
    /// unsafe-explicit opt-in (`new XMLParser({ processEntities: true })`,
    /// `lxml.etree.XMLParser(resolve_entities=True)`). Those constructors
    /// are currently seeded inline by `analyze_xml_parser_config`, so
    /// `classify_call` does not return this variant.
    SeedResult(XmlParserConfig),
}
/// Classify a Call instruction's effect on the parser-config universe.
///
/// Only Java calls made on a receiver are classified. Every other call
/// yields [`ConfigEffect::None`]; in particular the Python
/// `lxml.etree.XMLParser(...)` constructor is not handled here because its
/// hardening flags arrive as keyword arguments, which
/// `analyze_xml_parser_config` reads directly from the CFG node.
///
/// Recognised Java calls:
/// * `setFeature(NAME, BOOL)` — returns `UpdateReceiver` when the feature
///   name / polarity pair is one of the hardening combinations below,
///   otherwise `None`:
///   * `FEATURE_SECURE_PROCESSING` set to `true` → `secure_processing`
///   * `…/disallow-doctype-decl` set to `true` → `disallow_doctype`
///   * `…/external-general-entities`, `…/external-parameter-entities` or
///     `…/load-external-dtd` set to `false` → `disallow_doctype`
/// * `setExpandEntityReferences(false)` → `UpdateReceiver` with
///   `disallow_doctype`.
/// * `newDocumentBuilder` / `newSAXParser` / `getXMLReader` /
///   `newXMLReader` → `InheritFromReceiver`, so the produced parser carries
///   the factory's hardening state.
///
/// # Parameters
/// * `callee` — callee text; only the segment after the last `.` or `:` is
///   compared against the method names above.
/// * `args` / `consts` — SSA values per argument position and their
///   const-lattice facts. The first SSA value of a position is used to read
///   a folded feature name (arg 0) or boolean (arg 1).
/// * `receiver` — receiver SSA value, if any.
/// * `arg_idents` — per-position identifier text from the CFG node
///   (`call.arg_uses`). Recovers non-literal feature names such as
///   `XMLConstants.FEATURE_SECURE_PROCESSING` and boolean tokens that
///   surface as identifiers (`true`, `Boolean.TRUE`).
/// * `arg_literals` — per-position literal text from the CFG node
///   (`call.arg_string_literals`). Last-resort source for the boolean
///   polarity when the literal is bound to no SSA value.
fn classify_call(
    lang: Lang,
    callee: &str,
    args: &[smallvec::SmallVec<[SsaValue; 2]>],
    receiver: Option<SsaValue>,
    consts: &HashMap<SsaValue, ConstLattice>,
    arg_idents: &[Vec<String>],
    arg_literals: &[Option<String>],
) -> ConfigEffect {
    // Polarity of a boolean-looking source token, shared by the identifier
    // and literal fallbacks below.
    fn bool_token(tok: &str) -> Option<bool> {
        match tok {
            "true" | "True" | "Boolean.TRUE" => Some(true),
            "false" | "False" | "Boolean.FALSE" => Some(false),
            _ => None,
        }
    }

    // Feature names whose hardening polarity is `false`.
    const ENTITY_FEATURES: [&str; 3] = [
        "external-general-entities",
        "external-parameter-entities",
        "load-external-dtd",
    ];

    // Every recognised call is a Java method invoked on a receiver: setters
    // need a receiver to update, factory calls need one to inherit from.
    if !matches!(lang, Lang::Java) || receiver.is_none() {
        return ConfigEffect::None;
    }

    let suffix = callee.rsplit(['.', ':']).next().unwrap_or(callee);

    // Const-lattice fact for the first SSA value at arg position `n`.
    let arg_const = |n: usize| -> Option<&ConstLattice> {
        let first = args.get(n)?.first()?;
        consts.get(first)
    };
    // Identifier text recorded at arg position `n` (empty when out of range).
    let idents = |n: usize| arg_idents.get(n).map(Vec::as_slice).unwrap_or_default();
    // Boolean at arg position `n`: folded constant first, then identifier
    // tokens (some grammars classify `true` / `false` as identifiers), then
    // the literal token lifted from the source.
    let arg_bool = |n: usize| -> Option<bool> {
        let folded = match arg_const(n) {
            Some(ConstLattice::Bool(b)) => Some(*b),
            Some(ConstLattice::Str(s)) => match s.as_str() {
                "True" | "true" => Some(true),
                "False" | "false" => Some(false),
                _ => None,
            },
            _ => None,
        };
        folded
            .or_else(|| idents(n).iter().find_map(|tok| bool_token(tok)))
            .or_else(|| arg_literals.get(n)?.as_deref().and_then(bool_token))
    };

    match suffix {
        "setFeature" => {
            // Feature name as folded by const-propagation (empty if unknown).
            let name_lit = match arg_const(0) {
                Some(ConstLattice::Str(s)) => s.clone(),
                Some(ConstLattice::Bool(b)) => b.to_string(),
                Some(ConstLattice::Int(i)) => i.to_string(),
                _ => String::new(),
            };
            let names = idents(0);
            let value = arg_bool(1);
            // True when the folded name or any identifier contains `needle`.
            let mentions = |needle: &str| {
                name_lit.contains(needle) || names.iter().any(|s| s.contains(needle))
            };
            let is_secure_processing = name_lit == "FEATURE_SECURE_PROCESSING"
                || name_lit.contains("XMLConstants.FEATURE_SECURE_PROCESSING")
                || name_lit.contains("javax.xml.XMLConstants/feature/secure-processing")
                || names.iter().any(|s| s.contains("FEATURE_SECURE_PROCESSING"));

            let mut delta = XmlParserConfig::default();
            if is_secure_processing {
                delta.secure_processing = value == Some(true);
            } else if mentions("disallow-doctype-decl") {
                delta.disallow_doctype = value == Some(true);
            } else if value == Some(false) && ENTITY_FEATURES.iter().any(|&f| mentions(f)) {
                delta.disallow_doctype = true;
            }

            if delta == XmlParserConfig::default() {
                ConfigEffect::None
            } else {
                ConfigEffect::UpdateReceiver(delta)
            }
        }
        // `factory.setExpandEntityReferences(false)` — DocumentBuilderFactory
        // legacy hardening switch.
        "setExpandEntityReferences" if arg_bool(0) == Some(false) => {
            ConfigEffect::UpdateReceiver(XmlParserConfig {
                disallow_doctype: true,
                ..Default::default()
            })
        }
        // Factory → parser: without this propagation a hardened factory's
        // child builder would carry no config. Keep this list in sync with
        // the re-propagation step in `analyze_xml_parser_config`.
        "newDocumentBuilder" | "newSAXParser" | "getXMLReader" | "newXMLReader" => {
            ConfigEffect::InheritFromReceiver
        }
        _ => ConfigEffect::None,
    }
}
/// Run the XML-parser config analysis on an SSA body.
///
/// Returns the per-value config facts, or an empty result when `lang` is
/// `None`.
///
/// The analysis runs in two steps:
/// 1. **Direct effects.** Every `Call` instruction is visited once in block
///    order. Python `XMLParser(...)` and JS/TS `XMLParser(...)` constructors
///    are seeded from the CFG node's keyword arguments; all other calls go
///    through `classify_call` and its effect is applied to the receiver or
///    the result value.
/// 2. **Propagation.** Phi nodes take the `meet` of their operands
///    (operands without a fact count as the default config), single-source
///    `Assign`s take the `union` with their source, and Java factory calls
///    re-inherit from their receiver. The loop repeats until nothing changes
///    or `MAX_PROPAGATION_ROUNDS` rounds have run.
///
/// Default (all-`false`) configs are never inserted by the propagation
/// step, so an entry in the result always carries at least one flag from
/// that step's point of view.
pub fn analyze_xml_parser_config(
    body: &SsaBody,
    cfg: &Cfg,
    consts: &HashMap<SsaValue, ConstLattice>,
    lang: Option<Lang>,
) -> XmlParserConfigResult {
    // Upper bound on propagation rounds; the loop normally exits earlier
    // through the `changed` check.
    const MAX_PROPAGATION_ROUNDS: usize = 6;

    // True when any recorded kwarg value equals one of `tokens`.
    fn any_token(values: &[String], tokens: &[&str]) -> bool {
        values.iter().any(|v| tokens.contains(&v.as_str()))
    }

    // OR `incoming` into the fact stored for `target`. Returns whether the
    // map changed. A merged config equal to the default is never stored.
    fn absorb(
        configs: &mut HashMap<SsaValue, XmlParserConfig>,
        target: SsaValue,
        incoming: &XmlParserConfig,
    ) -> bool {
        let prev = configs.get(&target).copied();
        let merged = prev.unwrap_or_default().union(incoming);
        if merged == XmlParserConfig::default() || prev == Some(merged) {
            return false;
        }
        configs.insert(target, merged);
        true
    }

    let Some(lang) = lang else {
        return XmlParserConfigResult::default();
    };
    let mut configs: HashMap<SsaValue, XmlParserConfig> = HashMap::new();

    // Step 1 — direct effects from Call instructions in source order.
    // Effects only ever OR flags onto a value, so one pass is enough here;
    // step 2 handles phi / copy / late factory propagation.
    for block in &body.blocks {
        for inst in block.body.iter() {
            let SsaOp::Call {
                callee,
                args,
                receiver,
                ..
            } = &inst.op
            else {
                continue;
            };

            // Borrow the per-call metadata from the CFG node instead of
            // cloning it for every call instruction.
            let node = cfg.node_weight(inst.cfg_node);
            let kwargs = node
                .map(|ni| ni.call.kwargs.as_slice())
                .unwrap_or_default();
            let suffix = callee.rsplit(['.', ':']).next();

            // Python `lxml.etree.XMLParser(resolve_entities=..., no_network=...)`:
            // the flags are keyword arguments recorded on the CFG node, not
            // SSA call args. (A callee ending in `etree.XMLParser` always has
            // the suffix `XMLParser`, so the suffix test covers both shapes.)
            if matches!(lang, Lang::Python) && suffix == Some("XMLParser") {
                for (name, values) in kwargs {
                    match name.as_str() {
                        "resolve_entities" => {
                            if any_token(values, &["True", "true"]) {
                                configs.entry(inst.value).or_default().external_entities = true;
                            } else if any_token(values, &["False", "false"]) {
                                configs.entry(inst.value).or_default().disallow_doctype = true;
                            }
                        }
                        "no_network" if any_token(values, &["True", "true"]) => {
                            configs.entry(inst.value).or_default().disallow_doctype = true;
                        }
                        _ => {}
                    }
                }
                continue;
            }

            // JS/TS `new XMLParser({ processEntities: true, ... })`: read
            // from the CFG node's `kwargs` list. Only the unsafe opt-in
            // (`processEntities` with a `true` token) is recognised.
            if matches!(lang, Lang::JavaScript | Lang::TypeScript) && callee.ends_with("XMLParser")
            {
                for (name, values) in kwargs {
                    if name == "processEntities" && any_token(values, &["true"]) {
                        configs.entry(inst.value).or_default().external_entities = true;
                    }
                }
                continue;
            }

            let arg_idents = node
                .map(|ni| ni.call.arg_uses.as_slice())
                .unwrap_or_default();
            let arg_literals = node
                .map(|ni| ni.call.arg_string_literals.as_slice())
                .unwrap_or_default();
            match classify_call(
                lang,
                callee,
                args,
                *receiver,
                consts,
                arg_idents,
                arg_literals,
            ) {
                ConfigEffect::None => {}
                ConfigEffect::UpdateReceiver(delta) => {
                    if let Some(rv) = *receiver {
                        let entry = configs.entry(rv).or_default();
                        *entry = entry.union(&delta);
                    }
                }
                ConfigEffect::InheritFromReceiver => {
                    if let Some(rv) = *receiver
                        && let Some(parent) = configs.get(&rv).copied()
                    {
                        let entry = configs.entry(inst.value).or_default();
                        *entry = entry.union(&parent);
                    }
                }
                ConfigEffect::SeedResult(seed) => {
                    let entry = configs.entry(inst.value).or_default();
                    *entry = entry.union(&seed);
                }
            }
        }
    }

    // Step 2 — bounded fixed-point propagation through phi joins, copy
    // assignments and factory calls.
    for _ in 0..MAX_PROPAGATION_ROUNDS {
        let mut changed = false;
        for block in &body.blocks {
            for inst in &block.phis {
                let SsaOp::Phi(operands) = &inst.op else {
                    continue;
                };
                // Meet over all operands; an operand without a fact
                // contributes the default config, which clears the
                // hardening flags of the join.
                let mut joined: Option<XmlParserConfig> = None;
                for (_, val) in operands {
                    let operand_cfg = configs.get(val).copied().unwrap_or_default();
                    joined = Some(match joined {
                        None => operand_cfg,
                        Some(acc) => acc.meet(&operand_cfg),
                    });
                }
                if let Some(joined) = joined
                    && joined != XmlParserConfig::default()
                    && configs.get(&inst.value).copied() != Some(joined)
                {
                    configs.insert(inst.value, joined);
                    changed = true;
                }
            }
            for inst in &block.body {
                // Copy: `x = y` carries y's config over to x.
                if let SsaOp::Assign(uses) = &inst.op
                    && uses.len() == 1
                    && let Some(src_cfg) = configs.get(&uses[0]).copied()
                {
                    changed |= absorb(&mut configs, inst.value, &src_cfg);
                }
                // Factory calls may need a re-pass when the receiver's
                // config only became known after step 1 visited the call
                // (e.g. the receiver is a phi or a copy). The method list
                // mirrors the `InheritFromReceiver` arm of `classify_call`.
                if let SsaOp::Call {
                    callee,
                    receiver: Some(rv),
                    ..
                } = &inst.op
                {
                    let suffix = callee.rsplit(['.', ':']).next().unwrap_or(callee);
                    let inherit = matches!(lang, Lang::Java)
                        && matches!(
                            suffix,
                            "newDocumentBuilder" | "newSAXParser" | "getXMLReader" | "newXMLReader"
                        );
                    if inherit && let Some(parent) = configs.get(rv).copied() {
                        changed |= absorb(&mut configs, inst.value, &parent);
                    }
                }
            }
        }
        if !changed {
            break;
        }
    }

    XmlParserConfigResult { configs }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a config whose only raised flag is `secure_processing`.
    fn secure_only() -> XmlParserConfig {
        XmlParserConfig {
            secure_processing: true,
            ..Default::default()
        }
    }

    /// A config with no flags raised must not be reported as secure.
    #[test]
    fn default_config_is_unsafe() {
        assert!(!XmlParserConfig::default().is_secure());
    }

    /// Raising `secure_processing` by itself is enough to count as secure.
    #[test]
    fn secure_processing_alone_is_safe() {
        assert!(secure_only().is_secure());
    }

    /// `external_entities` defeats `secure_processing` when both are raised.
    #[test]
    fn external_entities_overrides_safe_flag() {
        let mixed = XmlParserConfig {
            external_entities: true,
            ..secure_only()
        };
        assert!(!mixed.is_secure());
    }

    /// `meet` keeps a hardening flag only when both sides carry it.
    #[test]
    fn meet_keeps_only_intersection_of_safe_flags() {
        let fully_hardened = XmlParserConfig {
            disallow_doctype: true,
            ..secure_only()
        };
        let joined = fully_hardened.meet(&secure_only());
        assert!(joined.secure_processing);
        assert!(!joined.disallow_doctype);
    }

    /// One secure operand and one operand with `external_entities` raised
    /// must not meet to a secure config: the unsafe flag is sticky.
    #[test]
    fn meet_propagates_unsafe_flag() {
        let leaky = XmlParserConfig {
            external_entities: true,
            ..Default::default()
        };
        let joined = secure_only().meet(&leaky);
        assert!(!joined.is_secure());
    }

    /// Without a receiver there is nothing to prove, so the answer is `false`.
    #[test]
    fn xxe_safe_returns_false_without_receiver() {
        let empty = XmlParserConfigResult::default();
        assert!(!xxe_safe(None, &empty));
    }

    /// Only the SSA value that carries a secure config is reported safe;
    /// a value with no recorded config is not.
    #[test]
    fn xxe_safe_uses_receiver_config() {
        let result = XmlParserConfigResult {
            configs: HashMap::from([(SsaValue(7), secure_only())]),
        };
        assert!(xxe_safe(Some(SsaValue(7)), &result));
        assert!(!xxe_safe(Some(SsaValue(8)), &result));
    }
}

235
src/ssa/xpath_config.rs Normal file
View file

@ -0,0 +1,235 @@
//! Per-SSA-value XPath-receiver configuration tracking.
//!
//! Mirrors [`crate::ssa::xml_config`] but for `XPath` instances rather
//! than JAXP parser instances. Tracks "is this XPath receiver bound to
//! an `XPathVariableResolver`" along the control-flow path: when a
//! resolver has been bound, subsequent `xpath.evaluate(expr, ...)` calls
//! are treated as parameterised and the `XPATH_INJECTION` bit is
//! stripped from the sink's cap mask.
//!
//! Same engine shape as [`crate::ssa::xml_config::XmlParserConfigResult`]:
//! a small forward dataflow run alongside type-fact analysis. Phi nodes
//! take the meet of their operand configs (a flag is "set" only when all
//! reaching operands set it), single-source copy assignments union the
//! source value's config into the destination value, and
//! `setXPathVariableResolver` calls update the receiver's config in place.
use std::collections::HashMap;
use super::ir::*;
use crate::cfg::Cfg;
use crate::symbol::Lang;
use serde::{Deserialize, Serialize};
/// Per-receiver XPath configuration fact, recorded against one SSA value.
///
/// Carried forward from `setXPathVariableResolver` calls. The derived
/// `Default` leaves every flag `false` (resolver not bound). A `true`
/// flag means the analysis has recorded a resolver binding for this
/// XPath receiver, i.e. it is treated as configured for parameterised
/// evaluation.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct XPathReceiverConfig {
/// True when `xpath.setXPathVariableResolver(...)` has been called
/// on this receiver. Set by Pass 1 of `analyze_xpath_config` on the
/// call's receiver SSA value, then propagated by Pass 2 through phi
/// joins (via `meet`) and copy assignments (via `union`).
pub has_resolver: bool,
}
impl XPathReceiverConfig {
/// True when the receiver is provably bound to a variable resolver.
pub fn is_parameterised(&self) -> bool {
self.has_resolver
}
/// Phi-meet: a flag survives only when *both* operands set it. Used
/// when the XPath variable was reassigned across branches and only
/// some branches bound a resolver.
fn meet(&self, other: &Self) -> Self {
XPathReceiverConfig {
has_resolver: self.has_resolver && other.has_resolver,
}
}
/// Union: caller binds a resolver after a copy / phi-join. Any
/// branch setting the flag wins for the union (used for copy
/// propagation, which preserves the source value's flags).
fn union(&self, other: &Self) -> Self {
XPathReceiverConfig {
has_resolver: self.has_resolver || other.has_resolver,
}
}
}
/// Result of XPath-receiver config analysis, as returned by
/// `analyze_xpath_config`.
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
pub struct XPathConfigResult {
/// Config fact recorded for each SSA value the analysis reached.
/// A value with no entry is reported as not parameterised by
/// `is_parameterised`.
pub configs: HashMap<SsaValue, XPathReceiverConfig>,
}
impl XPathConfigResult {
    /// Looks up `v` and reports whether its recorded config proves a
    /// resolver binding. A value with no recorded config yields `false`.
    pub fn is_parameterised(&self, v: SsaValue) -> bool {
        match self.configs.get(&v) {
            Some(receiver_cfg) => receiver_cfg.is_parameterised(),
            None => false,
        }
    }
}
/// Decide whether the `Cap::XPATH_INJECTION` bit should be stripped from
/// the cap mask of an XPath `evaluate` / `compile` sink.
///
/// Returns `true` only when the sink's receiver carries a config fact
/// proving it was bound to a variable resolver.
///
/// Conservative fall-backs, both returning `false`:
/// * `receiver` is `None` (free function): safety cannot be proven, so
///   the existing classification stands.
/// * The receiver has no recorded config fact.
pub fn xpath_safe(receiver: Option<SsaValue>, xpath_config: &XPathConfigResult) -> bool {
    receiver.is_some_and(|rv| xpath_config.is_parameterised(rv))
}
/// Run the XPath-receiver config analysis on an SSA body.
///
/// Only Java's `setXPathVariableResolver` is modelled: it is the only
/// language-level resolver-binding API for XPath in the existing
/// detection corpus. PHP's `DOMXPath::registerPhpFunctions()` is a
/// different mechanism (PHP function registration) and is not modelled
/// here.
///
/// Returns an empty result when `lang` is anything other than
/// `Some(Lang::Java)`, or when no resolver-binding call is found.
/// Otherwise the result maps every SSA value reached by the analysis to
/// its receiver config.
pub fn analyze_xpath_config(body: &SsaBody, cfg: &Cfg, lang: Option<Lang>) -> XPathConfigResult {
    // Hard cap on propagation sweeps; in practice 2-3 rounds suffice on
    // intra-procedural shapes.
    const MAX_SWEEPS: usize = 6;

    // CFG retained for parity with `xml_config`; reserved for future
    // kwarg-driven seeds (e.g. constructor options).
    let _ = cfg;

    if !matches!(lang, Some(Lang::Java)) {
        return XPathConfigResult::default();
    }

    let mut configs: HashMap<SsaValue, XPathReceiverConfig> = HashMap::new();

    // Pass 1: seed facts directly from Call instructions, in source
    // order. A `setXPathVariableResolver` call marks its receiver in
    // place. The argument is not inspected: a null-check would need a
    // const-prop fact, so the bound value is assumed to be non-null
    // (matching the XML parser-config setter semantics).
    for block in &body.blocks {
        for inst in block.body.iter() {
            let SsaOp::Call {
                callee,
                receiver: Some(rv),
                ..
            } = &inst.op
            else {
                continue;
            };
            // Compare only the trailing path segment of the callee.
            let method = callee.rsplit(['.', ':']).next().unwrap_or(callee);
            if method == "setXPathVariableResolver" {
                configs.entry(*rv).or_default().has_resolver = true;
            }
        }
    }

    // Nothing was seeded, so there is nothing to propagate.
    if configs.is_empty() {
        return XPathConfigResult::default();
    }

    // Pass 2: bounded fixed-point propagation through phi joins and
    // copy assignments.
    for _ in 0..MAX_SWEEPS {
        let mut changed = false;

        for block in &body.blocks {
            for inst in &block.phis {
                let SsaOp::Phi(operands) = &inst.op else {
                    continue;
                };
                // Meet over all operands; an operand with no recorded
                // fact contributes the default (unbound) config.
                let joined = operands
                    .iter()
                    .map(|(_, operand)| configs.get(operand).copied().unwrap_or_default())
                    .reduce(|acc, next| acc.meet(&next));
                let Some(joined) = joined else {
                    continue;
                };
                // Only non-default facts are ever stored.
                if joined == XPathReceiverConfig::default() {
                    continue;
                }
                // `insert` hands back the previous entry; the map only
                // really changed when that differs from what we stored.
                if configs.insert(inst.value, joined) != Some(joined) {
                    changed = true;
                }
            }

            for inst in &block.body {
                let SsaOp::Assign(uses) = &inst.op else {
                    continue;
                };
                // Only single-source assignments are treated as copies.
                if uses.len() != 1 {
                    continue;
                }
                let Some(source_cfg) = configs.get(&uses[0]).copied() else {
                    continue;
                };
                if source_cfg == XPathReceiverConfig::default() {
                    continue;
                }
                let merged = configs
                    .get(&inst.value)
                    .copied()
                    .unwrap_or_default()
                    .union(&source_cfg);
                if configs.insert(inst.value, merged) != Some(merged) {
                    changed = true;
                }
            }
        }

        if !changed {
            break;
        }
    }

    XPathConfigResult { configs }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Receiver with a resolver bound.
    const BOUND: XPathReceiverConfig = XPathReceiverConfig { has_resolver: true };
    /// Receiver with no resolver bound.
    const UNBOUND: XPathReceiverConfig = XPathReceiverConfig {
        has_resolver: false,
    };

    /// The derived default must not claim a resolver binding.
    #[test]
    fn default_config_is_unparameterised() {
        assert!(!XPathReceiverConfig::default().is_parameterised());
    }

    /// Setting `has_resolver` is what makes a receiver parameterised.
    #[test]
    fn has_resolver_marks_parameterised() {
        assert!(BOUND.is_parameterised());
    }

    /// A bound operand met with an unbound one loses the flag.
    #[test]
    fn meet_keeps_intersection() {
        let joined = BOUND.meet(&UNBOUND);
        assert!(!joined.has_resolver);
    }

    /// Two bound operands keep the flag through `meet`.
    #[test]
    fn meet_both_set_keeps_set() {
        let joined = BOUND.meet(&BOUND);
        assert!(joined.has_resolver);
    }

    /// Without a receiver there is nothing to prove, so the answer is `false`.
    #[test]
    fn xpath_safe_returns_false_without_receiver() {
        let empty = XPathConfigResult::default();
        assert!(!xpath_safe(None, &empty));
    }

    /// Only the SSA value that carries a bound config is reported safe;
    /// a value with no recorded config is not.
    #[test]
    fn xpath_safe_uses_receiver_config() {
        let result = XPathConfigResult {
            configs: HashMap::from([(SsaValue(7), BOUND)]),
        };
        assert!(xpath_safe(Some(SsaValue(7)), &result));
        assert!(!xpath_safe(Some(SsaValue(8)), &result));
    }
}