[pitboss] phase 05: Track J.3 + Track L.3 — XXE corpus + DocumentBuilder / lxml / libxml / SimpleXML adapters

This commit is contained in:
pitboss 2026-05-17 20:39:12 -05:00
parent 637b733928
commit 4de925c3ef
35 changed files with 1985 additions and 23 deletions

View file

@ -20,6 +20,11 @@ pub mod python_jinja2;
pub mod python_pickle;
pub mod ruby_erb;
pub mod ruby_marshal;
pub mod xxe_go;
pub mod xxe_java;
pub mod xxe_php;
pub mod xxe_python;
pub mod xxe_ruby;
pub use java_deserialize::JavaDeserializeAdapter;
pub use java_thymeleaf::JavaThymeleafAdapter;
@ -30,6 +35,11 @@ pub use python_jinja2::PythonJinja2Adapter;
pub use python_pickle::PythonPickleAdapter;
pub use ruby_erb::RubyErbAdapter;
pub use ruby_marshal::RubyMarshalAdapter;
pub use xxe_go::XxeGoAdapter;
pub use xxe_java::XxeJavaAdapter;
pub use xxe_php::XxePhpAdapter;
pub use xxe_python::XxePythonAdapter;
pub use xxe_ruby::XxeRubyAdapter;
/// True when any callee in `summary.callees` matches `predicate`.
fn any_callee_matches(

View file

@ -0,0 +1,113 @@
//! Go [`super::super::FrameworkAdapter`] matching XXE-prone
//! `encoding/xml` parser constructions.
//!
//! Phase 05 (Track J.3). Fires when the function body invokes one of
//! the canonical `encoding/xml` entry points (`xml.NewDecoder`,
//! `xml.Unmarshal`, `Decoder.Decode`) and the surrounding source
//! mentions the `encoding/xml` import — the brief specifically calls
//! out `xml.Decoder` with `Strict: false` as the XXE-prone shape.
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Stateless [`FrameworkAdapter`] for Go that flags functions whose call
/// summary and source file both point at an `encoding/xml`-style parser.
pub struct XxeGoAdapter;
/// Identifier returned by `name()` and copied into every binding this
/// adapter emits.
const ADAPTER_NAME: &str = "xxe-go";
/// Reports whether `name` refers to one of the tracked XML parser entry
/// points.
///
/// Only the text after the final `.` is compared, so receiver- or
/// package-qualified names (`xml.NewDecoder`, `d.Decode`) and bare names
/// (`Decode`) are treated the same way.
fn callee_is_xml_parser(name: &str) -> bool {
    const PARSER_CALLS: &[&str] = &["NewDecoder", "Unmarshal", "Decode", "DecodeElement"];
    // `.` is a single byte, so slicing just past it is always on a char boundary.
    let method = match name.rfind('.') {
        Some(dot) => &name[dot + 1..],
        None => name,
    };
    PARSER_CALLS.contains(&method)
}
/// Reports whether the raw file bytes contain any marker suggesting the
/// file works with Go's `encoding/xml` package.
///
/// This is a plain byte-substring scan; it does not parse imports.
fn source_imports_xml(file_bytes: &[u8]) -> bool {
    const NEEDLES: &[&[u8]] = &[
        b"encoding/xml",
        b"xml.NewDecoder",
        b"xml.Unmarshal",
        b"xml.Decoder",
    ];
    for needle in NEEDLES {
        let found = file_bytes
            .windows(needle.len())
            .any(|window| window == *needle);
        if found {
            return true;
        }
    }
    false
}
impl FrameworkAdapter for XxeGoAdapter {
    /// Identifier of this adapter.
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// Language this adapter applies to.
    fn lang(&self) -> Lang {
        Lang::Go
    }

    /// Emits a function-level binding when the summary records a tracked
    /// XML parser callee *and* the file bytes mention an `encoding/xml`
    /// marker; returns `None` otherwise. The AST node is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        let calls_parser = super::any_callee_matches(summary, callee_is_xml_parser);
        let mentions_xml = source_imports_xml(file_bytes);
        // Both signals are required; either one alone is too weak.
        if !(calls_parser && mentions_xml) {
            return None;
        }
        Some(FrameworkBinding {
            adapter: ADAPTER_NAME.to_owned(),
            kind: EntryKind::Function,
            route: None,
            request_params: Vec::new(),
            response_writer: None,
            middleware: Vec::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter Go grammar so `detect` receives a
    /// real root node.
    fn parse_go(src: &[u8]) -> tree_sitter::Tree {
        let go = tree_sitter::Language::from(tree_sitter_go::LANGUAGE);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&go).unwrap();
        parser.parse(src, None).unwrap()
    }

    /// A function that calls `xml.NewDecoder` in a file importing
    /// `encoding/xml` must produce a binding.
    #[test]
    fn fires_on_xml_new_decoder() {
        let src: &[u8] = b"package main\nimport (\"bytes\"; \"encoding/xml\")\n\
            func Run(body string) {\n\
            d := xml.NewDecoder(bytes.NewReader([]byte(body)))\n\
            d.Strict = false\n\
            _ = d.Decode(&struct{}{})\n\
            }\n";
        let summary = FuncSummary {
            name: "Run".into(),
            callees: vec![crate::summary::CalleeSite::bare("NewDecoder")],
            ..Default::default()
        };
        let tree = parse_go(src);
        let binding = XxeGoAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_some());
    }

    /// A function with no callees in a file without XML markers must not
    /// produce a binding.
    #[test]
    fn skips_plain_function() {
        let src: &[u8] = b"package main\nfunc Add(a, b int) int { return a + b }\n";
        let summary = FuncSummary {
            name: "Add".into(),
            ..Default::default()
        };
        let tree = parse_go(src);
        let binding = XxeGoAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_none());
    }
}

View file

@ -0,0 +1,139 @@
//! Java [`super::super::FrameworkAdapter`] matching XXE-prone XML parser
//! constructions.
//!
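//! Phase 05 (Track J.3). Fires when the function body invokes a
//! `DocumentBuilder.parse` / `SAXParser.parse` / `XMLInputFactory`
//! call site and the surrounding source pulls in one of the
//! `javax.xml.parsers` / `org.w3c.dom` / `org.xml.sax` packages —
//! i.e. an XML parser that, by default and without
//! `disallow-doctype-decl`, expands external entities. As a
//! fall-back, the adapter also fires when the source mentions one of
//! those parser symbols and contains a literal `.parse(` call, even
//! if the call-graph summary did not capture the parse call.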
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Stateless [`FrameworkAdapter`] for Java that flags functions whose call
/// summary or source file point at a JAXP / SAX / StAX style XML parser.
pub struct XxeJavaAdapter;
/// Identifier returned by `name()` and copied into every binding this
/// adapter emits.
const ADAPTER_NAME: &str = "xxe-java";
/// Reports whether `name` refers to one of the tracked Java XML parser
/// construction or parse methods.
///
/// Only the text after the final `.` is compared, so qualified names such
/// as `factory.newDocumentBuilder` and bare names such as `parse` are
/// treated the same way.
fn callee_is_xml_parse(name: &str) -> bool {
    const PARSER_METHODS: &[&str] = &[
        "parse",
        "newDocumentBuilder",
        "newSAXParser",
        "createXMLEventReader",
        "createXMLStreamReader",
        "newInstance",
    ];
    // `rsplit` always yields at least one piece, so the fallback is never hit.
    let method = name.rsplit('.').next().unwrap_or(name);
    PARSER_METHODS.contains(&method)
}
/// Reports whether the raw file bytes mention any of the tracked Java XML
/// parser packages or factory/builder class names.
///
/// This is a plain byte-substring scan; it does not parse imports.
fn source_imports_xml_parser(file_bytes: &[u8]) -> bool {
    const NEEDLES: &[&[u8]] = &[
        b"javax.xml.parsers",
        b"DocumentBuilderFactory",
        b"DocumentBuilder",
        b"SAXParserFactory",
        b"XMLInputFactory",
        b"org.xml.sax",
        b"org.w3c.dom",
    ];
    let contains = |needle: &[u8]| {
        file_bytes
            .windows(needle.len())
            .any(|window| window == needle)
    };
    NEEDLES.iter().copied().any(|needle| contains(needle))
}
impl FrameworkAdapter for XxeJavaAdapter {
    /// Identifier of this adapter.
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// Language this adapter applies to.
    fn lang(&self) -> Lang {
        Lang::Java
    }

    /// Emits a function-level binding when the file mentions an XML parser
    /// symbol and either the summary records a tracked parser callee or
    /// the file bytes contain a literal `.parse(`. Returns `None`
    /// otherwise. The AST node is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        const PARSE_CALL: &[u8] = b".parse(";

        let calls_parser = super::any_callee_matches(summary, callee_is_xml_parse);
        let mentions_parser = source_imports_xml_parser(file_bytes);
        // Every firing path requires the source-level parser marker.
        if !mentions_parser {
            return None;
        }
        // Fall-back: the source clearly imports the XXE-prone parser even
        // when the call-graph summary did not capture the parse call.
        let fires = calls_parser
            || file_bytes
                .windows(PARSE_CALL.len())
                .any(|window| window == PARSE_CALL);
        fires.then(|| FrameworkBinding {
            adapter: ADAPTER_NAME.to_owned(),
            kind: EntryKind::Function,
            route: None,
            request_params: Vec::new(),
            response_writer: None,
            middleware: Vec::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter Java grammar so `detect` receives
    /// a real root node.
    fn parse_java(src: &[u8]) -> tree_sitter::Tree {
        let java = tree_sitter::Language::from(tree_sitter_java::LANGUAGE);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&java).unwrap();
        parser.parse(src, None).unwrap()
    }

    /// A `DocumentBuilder.parse` call in a file importing
    /// `javax.xml.parsers` must yield a function-level binding.
    #[test]
    fn fires_on_document_builder_parse() {
        let src: &[u8] = b"import javax.xml.parsers.DocumentBuilderFactory;\n\
            public class V {\n public static void run(byte[] b) throws Exception {\n\
            DocumentBuilderFactory f = DocumentBuilderFactory.newInstance();\n\
            f.newDocumentBuilder().parse(new java.io.ByteArrayInputStream(b));\n\
            }\n}\n";
        let summary = FuncSummary {
            name: "run".into(),
            callees: vec![crate::summary::CalleeSite::bare("parse")],
            ..Default::default()
        };
        let tree = parse_java(src);
        let detected = XxeJavaAdapter.detect(&summary, tree.root_node(), src);
        let binding = detected.expect("must fire on DocumentBuilder.parse fixture");
        assert_eq!(binding.adapter, ADAPTER_NAME);
        assert_eq!(binding.kind, EntryKind::Function);
    }

    /// A function with no callees in a file without parser markers must
    /// not produce a binding.
    #[test]
    fn skips_plain_function() {
        let src: &[u8] =
            b"public class V { public static void run(String b) { System.out.println(b); } }\n";
        let summary = FuncSummary {
            name: "run".into(),
            ..Default::default()
        };
        let tree = parse_java(src);
        let detected = XxeJavaAdapter.detect(&summary, tree.root_node(), src);
        assert!(detected.is_none());
    }
}

View file

@ -0,0 +1,120 @@
//! PHP [`super::super::FrameworkAdapter`] matching XXE-prone XML
//! parser constructions.
//!
//! Phase 05 (Track J.3). Fires when the function body invokes one of
//! the canonical PHP XML entry points (`simplexml_load_string`,
//! `simplexml_load_file`, `DOMDocument::loadXML`,
//! `DOMDocument::load`, `xml_parser_create`) and the surrounding
//! source mentions an XML / libxml symbol — the parser, by default
//! and under `libxml_disable_entity_loader(false)`, expands external
//! entities.
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Stateless [`FrameworkAdapter`] for PHP that flags functions touching
/// SimpleXML / DOMDocument / expat-style XML parser entry points.
pub struct XxePhpAdapter;
/// Identifier returned by `name()` and copied into every binding this
/// adapter emits.
const ADAPTER_NAME: &str = "xxe-php";
/// Reports whether `name` refers to one of the tracked PHP XML parser
/// entry points.
///
/// Only the final segment of `name` is compared. Segments may be joined
/// by `::`, `->` or `.`, and a single name may mix them (for example
/// `self::$doc->loadXML`), so each separator is stripped in turn until
/// only the text after the last separator of any kind remains.
fn callee_is_xml_parser(name: &str) -> bool {
    // Each step keeps the suffix after the last occurrence of one
    // separator, so the final result is the suffix after the last
    // separator of any kind, regardless of the order they are tried in.
    let last = ["::", "->", "."]
        .iter()
        .fold(name, |segment, sep| segment.rsplit(*sep).next().unwrap_or(segment));
    matches!(
        last,
        "simplexml_load_string"
            | "simplexml_load_file"
            | "loadXML"
            | "load"
            | "xml_parser_create"
            | "xml_parse"
    )
}
/// Reports whether the raw file bytes mention any of the tracked PHP XML
/// or libxml symbols.
///
/// This is a plain byte-substring scan over the whole file.
fn source_imports_xml(file_bytes: &[u8]) -> bool {
    const NEEDLES: &[&[u8]] = &[
        b"simplexml_load_string",
        b"simplexml_load_file",
        b"DOMDocument",
        b"xml_parser_create",
        b"libxml_disable_entity_loader",
        b"LIBXML_NOENT",
    ];
    for needle in NEEDLES {
        let mut windows = file_bytes.windows(needle.len());
        if windows.any(|window| window == *needle) {
            return true;
        }
    }
    false
}
impl FrameworkAdapter for XxePhpAdapter {
    /// Identifier of this adapter.
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// Language this adapter applies to.
    fn lang(&self) -> Lang {
        Lang::Php
    }

    /// Emits a function-level binding when the summary records a tracked
    /// XML parser callee *and* the file bytes mention an XML / libxml
    /// symbol; returns `None` otherwise. The AST node is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        let matches_call = super::any_callee_matches(summary, callee_is_xml_parser);
        let matches_source = source_imports_xml(file_bytes);
        // Both signals are required. The callee list contains generic
        // names such as `load`, and the source check is file-wide, so
        // either signal on its own would flag unrelated functions.
        if !(matches_call && matches_source) {
            return None;
        }
        Some(FrameworkBinding {
            adapter: ADAPTER_NAME.to_owned(),
            kind: EntryKind::Function,
            route: None,
            request_params: Vec::new(),
            response_writer: None,
            middleware: Vec::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter PHP grammar so `detect` receives
    /// a real root node.
    fn parse_php(src: &[u8]) -> tree_sitter::Tree {
        let php = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&php).unwrap();
        parser.parse(src, None).unwrap()
    }

    /// A function calling `simplexml_load_string` must produce a binding.
    #[test]
    fn fires_on_simplexml_load_string() {
        let src: &[u8] = b"<?php\nfunction run($body) {\n return simplexml_load_string($body);\n}\n";
        let summary = FuncSummary {
            name: "run".into(),
            callees: vec![crate::summary::CalleeSite::bare("simplexml_load_string")],
            ..Default::default()
        };
        let tree = parse_php(src);
        let binding = XxePhpAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_some());
    }

    /// A function with no callees in a file without XML symbols must not
    /// produce a binding.
    #[test]
    fn skips_plain_function() {
        let src: &[u8] = b"<?php\nfunction add($a, $b) { return $a + $b; }\n";
        let summary = FuncSummary {
            name: "add".into(),
            ..Default::default()
        };
        let tree = parse_php(src);
        let binding = XxePhpAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_none());
    }
}

View file

@ -0,0 +1,120 @@
//! Python [`super::super::FrameworkAdapter`] matching XXE-prone XML
//! parser constructions.
//!
//! Phase 05 (Track J.3). Fires when the function body invokes one of
//! the canonical lxml / stdlib XML entry points
//! (`lxml.etree.XMLParser`, `lxml.etree.parse`, `lxml.etree.fromstring`,
//! `xml.etree.ElementTree.parse`, `xml.sax.parse`,
//! `xml.dom.minidom.parseString`) and the surrounding source mentions
//! the matching module. Callee matching is last-segment-aware so
//! receiver-prefixed calls (`etree.XMLParser`,
//! `ElementTree.fromstring`) hit the same predicate.
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Stateless [`FrameworkAdapter`] for Python that flags functions whose
/// call summary and source file both point at an lxml / stdlib XML parser.
pub struct XxePythonAdapter;
/// Identifier returned by `name()` and copied into every binding this
/// adapter emits.
const ADAPTER_NAME: &str = "xxe-python";
/// Reports whether `name` refers to one of the tracked Python XML parser
/// entry points.
///
/// Only the text after the final `.` is compared, so receiver-prefixed
/// names (`etree.XMLParser`, `ElementTree.fromstring`) and bare names hit
/// the same check.
fn callee_is_xml_parser(name: &str) -> bool {
    const PARSER_CALLS: [&str; 6] = [
        "XMLParser",
        "parse",
        "fromstring",
        "parseString",
        "XMLPullParser",
        "iterparse",
    ];
    let attr = match name.rsplit_once('.') {
        Some((_receiver, tail)) => tail,
        None => name,
    };
    PARSER_CALLS.iter().any(|candidate| *candidate == attr)
}
/// Reports whether the raw file bytes mention any of the tracked Python
/// XML module names.
///
/// This is a plain byte-substring scan; it does not parse imports.
fn source_imports_xml(file_bytes: &[u8]) -> bool {
    const NEEDLES: &[&[u8]] = &[
        b"lxml.etree",
        b"lxml import",
        b"xml.etree",
        b"ElementTree",
        b"xml.sax",
        b"xml.dom",
        b"defusedxml",
    ];
    let mentions = |needle: &[u8]| {
        file_bytes
            .windows(needle.len())
            .any(|window| window == needle)
    };
    NEEDLES.iter().copied().any(|needle| mentions(needle))
}
impl FrameworkAdapter for XxePythonAdapter {
    /// Identifier of this adapter.
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// Language this adapter applies to.
    fn lang(&self) -> Lang {
        Lang::Python
    }

    /// Emits a function-level binding when the summary records a tracked
    /// XML parser callee *and* the file bytes mention an XML module;
    /// returns `None` otherwise. The AST node is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        let calls_parser = super::any_callee_matches(summary, callee_is_xml_parser);
        let mentions_xml = source_imports_xml(file_bytes);
        // The binding is only built when both signals agree.
        (calls_parser && mentions_xml).then(|| FrameworkBinding {
            adapter: ADAPTER_NAME.to_owned(),
            kind: EntryKind::Function,
            route: None,
            request_params: Vec::new(),
            response_writer: None,
            middleware: Vec::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter Python grammar so `detect`
    /// receives a real root node.
    fn parse_python(src: &[u8]) -> tree_sitter::Tree {
        let python = tree_sitter::Language::from(tree_sitter_python::LANGUAGE);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&python).unwrap();
        parser.parse(src, None).unwrap()
    }

    /// An `etree.fromstring` call in a file importing lxml must produce a
    /// binding.
    #[test]
    fn fires_on_lxml_etree_fromstring() {
        let src: &[u8] = b"from lxml import etree\n\
            def run(body):\n return etree.fromstring(body)\n";
        let summary = FuncSummary {
            name: "run".into(),
            callees: vec![crate::summary::CalleeSite::bare("fromstring")],
            ..Default::default()
        };
        let tree = parse_python(src);
        let binding = XxePythonAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_some());
    }

    /// A function with no callees in a file without XML modules must not
    /// produce a binding.
    #[test]
    fn skips_plain_function() {
        let src: &[u8] = b"def add(a, b):\n return a + b\n";
        let summary = FuncSummary {
            name: "add".into(),
            ..Default::default()
        };
        let tree = parse_python(src);
        let binding = XxePythonAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_none());
    }
}

View file

@ -0,0 +1,109 @@
//! Ruby [`super::super::FrameworkAdapter`] matching XXE-prone XML
//! parser constructions.
//!
//! Phase 05 (Track J.3). Fires when the function body invokes one of
//! the canonical Ruby XML entry points
//! (`REXML::Document.new`, `Nokogiri::XML`, `Nokogiri::XML::Document.parse`,
//! `Ox.parse`) and the surrounding source mentions the matching
//! library.
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Stateless [`FrameworkAdapter`] for Ruby that flags functions whose call
/// summary and source file both point at a REXML / Nokogiri / Ox parser.
pub struct XxeRubyAdapter;
/// Identifier returned by `name()` and copied into every binding this
/// adapter emits.
const ADAPTER_NAME: &str = "xxe-ruby";
/// Reports whether `name` refers to one of the tracked Ruby XML parser
/// entry points (`new`, `parse`, `XML`, `load`).
///
/// Only the final segment of `name` is compared. Ruby callee names
/// routinely mix `::` (constant scope) and `.` (method call), as in
/// `REXML::Document.new` or `Nokogiri::XML::Document.parse`, so both
/// separators are stripped in turn until only the text after the last
/// separator of either kind remains.
fn callee_is_xml_parser(name: &str) -> bool {
    // Splitting on `::` alone would leave `Document.new`, which matches
    // nothing; stripping `.` afterwards isolates the method name itself.
    let last = ["::", "."]
        .iter()
        .fold(name, |segment, sep| segment.rsplit(*sep).next().unwrap_or(segment));
    matches!(last, "new" | "parse" | "XML" | "load")
}
/// Reports whether the raw file bytes mention any of the tracked Ruby XML
/// library markers.
///
/// This is a plain, case-sensitive byte-substring scan over the file.
fn source_imports_xml(file_bytes: &[u8]) -> bool {
    const NEEDLES: &[&[u8]] = &[
        b"REXML",
        b"rexml/document",
        b"Nokogiri",
        b"nokogiri",
        b"Ox.parse",
    ];
    for needle in NEEDLES {
        let hit = file_bytes
            .windows(needle.len())
            .any(|window| window == *needle);
        if hit {
            return true;
        }
    }
    false
}
impl FrameworkAdapter for XxeRubyAdapter {
    /// Identifier of this adapter.
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// Language this adapter applies to.
    fn lang(&self) -> Lang {
        Lang::Ruby
    }

    /// Emits a function-level binding when the summary records a tracked
    /// XML parser callee *and* the file bytes mention an XML library;
    /// returns `None` otherwise. The AST node is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        let calls_parser = super::any_callee_matches(summary, callee_is_xml_parser);
        let mentions_xml = source_imports_xml(file_bytes);
        match (calls_parser, mentions_xml) {
            // Only the combination of both signals is treated as a hit.
            (true, true) => Some(FrameworkBinding {
                adapter: ADAPTER_NAME.to_owned(),
                kind: EntryKind::Function,
                route: None,
                request_params: Vec::new(),
                response_writer: None,
                middleware: Vec::new(),
            }),
            _ => None,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter Ruby grammar so `detect` receives
    /// a real root node.
    fn parse_ruby(src: &[u8]) -> tree_sitter::Tree {
        let ruby = tree_sitter::Language::from(tree_sitter_ruby::LANGUAGE);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&ruby).unwrap();
        parser.parse(src, None).unwrap()
    }

    /// A `new` call in a file requiring `rexml/document` must produce a
    /// binding.
    #[test]
    fn fires_on_rexml_document_new() {
        let src: &[u8] = b"require 'rexml/document'\n\
            def run(body)\n REXML::Document.new(body)\nend\n";
        let summary = FuncSummary {
            name: "run".into(),
            callees: vec![crate::summary::CalleeSite::bare("new")],
            ..Default::default()
        };
        let tree = parse_ruby(src);
        let binding = XxeRubyAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_some());
    }

    /// A function with no callees in a file without XML libraries must
    /// not produce a binding.
    #[test]
    fn skips_plain_function() {
        let src: &[u8] = b"def add(a, b)\n a + b\nend\n";
        let summary = FuncSummary {
            name: "add".into(),
            ..Default::default()
        };
        let tree = parse_ruby(src);
        let binding = XxeRubyAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_none());
    }
}

View file

@ -214,17 +214,19 @@ mod tests {
}
#[test]
fn registry_baseline_after_phase_04() {
// Phase 04 (Track J.2) adds the SSTI-sink adapter alongside the
// Phase-03 deserialize adapter for Java / Python / PHP / Ruby and
// introduces the first JavaScript adapter (Handlebars). Other
// languages still carry the Phase-01 empty baseline.
fn registry_baseline_after_phase_05() {
// Phase 05 (Track J.3) adds the XXE-sink adapter alongside the
// Phase-03 deserialize + Phase-04 SSTI adapters for Java /
// Python / PHP / Ruby, and introduces the first Go adapter
// (xxe-go). JavaScript still has only the Handlebars adapter;
// Rust / C / Cpp / TypeScript still carry the Phase-01 empty
// baseline.
for lang in [Lang::Java, Lang::Python, Lang::Php, Lang::Ruby] {
let registered = registry::adapters_for(lang);
assert_eq!(
registered.len(),
2,
"{:?} must have the J.1 deserialize + J.2 ssti adapters",
3,
"{:?} must have the J.1 deserialize + J.2 ssti + J.3 xxe adapters",
lang,
);
for adapter in registered {
@ -238,13 +240,14 @@ mod tests {
"JavaScript must have exactly the J.2 Handlebars adapter",
);
assert_eq!(js_registered[0].lang(), Lang::JavaScript);
for lang in [
Lang::Rust,
Lang::C,
Lang::Cpp,
Lang::Go,
Lang::TypeScript,
] {
let go_registered = registry::adapters_for(Lang::Go);
assert_eq!(
go_registered.len(),
1,
"Go must have exactly the J.3 xxe-go adapter",
);
assert_eq!(go_registered[0].lang(), Lang::Go);
for lang in [Lang::Rust, Lang::C, Lang::Cpp, Lang::TypeScript] {
assert!(
registry::adapters_for(lang).is_empty(),
"{:?} should still have zero adapters before its Track-L phase",

View file

@ -50,19 +50,23 @@ static CPP: &[&dyn FrameworkAdapter] = &[];
// Java adapter table: deserialize, Thymeleaf and XXE adapters.
static JAVA: &[&dyn FrameworkAdapter] = &[
&super::adapters::JavaDeserializeAdapter,
&super::adapters::JavaThymeleafAdapter,
&super::adapters::XxeJavaAdapter,
];
static GO: &[&dyn FrameworkAdapter] = &[];
static GO: &[&dyn FrameworkAdapter] = &[&super::adapters::XxeGoAdapter];
// PHP adapter table: Twig, unserialize and XXE adapters.
static PHP: &[&dyn FrameworkAdapter] = &[
&super::adapters::PhpTwigAdapter,
&super::adapters::PhpUnserializeAdapter,
&super::adapters::XxePhpAdapter,
];
// Python adapter table: Jinja2, pickle and XXE adapters.
static PYTHON: &[&dyn FrameworkAdapter] = &[
&super::adapters::PythonJinja2Adapter,
&super::adapters::PythonPickleAdapter,
&super::adapters::XxePythonAdapter,
];
// Ruby adapter table: ERB, Marshal and XXE adapters.
static RUBY: &[&dyn FrameworkAdapter] = &[
&super::adapters::RubyErbAdapter,
&super::adapters::RubyMarshalAdapter,
&super::adapters::XxeRubyAdapter,
];
static TYPESCRIPT: &[&dyn FrameworkAdapter] = &[];
static JAVASCRIPT: &[&dyn FrameworkAdapter] = &[&super::adapters::JsHandlebarsAdapter];