[pitboss] phase 05: Track J.3 + Track L.3 — XXE corpus + DocumentBuilder / lxml / libxml / SimpleXML adapters

This commit is contained in:
pitboss 2026-05-17 20:39:12 -05:00
parent 637b733928
commit 4de925c3ef
35 changed files with 1985 additions and 23 deletions

View file

@ -0,0 +1,25 @@
// Phase 05 (Track J.3) — Go XXE benign fixture.
//
// Same parser surface as `vuln.go` but `Strict` is left at the
// default `true`, so the doctype is rejected and no entity body is
// substituted.
package benign
import (
"bytes"
"encoding/xml"
)
// Data is the value the fixture decodes the payload into; its struct
// tags bind it to a root `<data>` element.
type Data struct {
// XMLName pins the expected root element name to `data`.
XMLName xml.Name `xml:"data"`
// Value receives the character data found inside the element.
Value string `xml:",chardata"`
}
// Run decodes body into a Data value using an xml.Decoder whose fields
// are left untouched after construction.
//
// It returns the populated *Data on success, or nil together with the
// error reported by Decode.
func Run(body string) (*Data, error) {
// Wrap the payload bytes in a reader for the streaming decoder.
d := xml.NewDecoder(bytes.NewReader([]byte(body)))
out := &Data{}
if err := d.Decode(out); err != nil {
return nil, err
}
return out, nil
}

View file

@ -0,0 +1,27 @@
// Phase 05 (Track J.3) — Go XXE vuln fixture.
//
// The function builds an `encoding/xml.Decoder` against the attacker
// payload with `Strict: false` so the doctype is parsed and any
// `<!ENTITY xxe SYSTEM "file:///…">` in the payload is resolved and
// substituted into element values.
package vuln
import (
"bytes"
"encoding/xml"
)
// Data is the value the fixture decodes the payload into; its struct
// tags bind it to a root `<data>` element.
type Data struct {
// XMLName pins the expected root element name to `data`.
XMLName xml.Name `xml:"data"`
// Value receives the character data found inside the element.
Value string `xml:",chardata"`
}
// Run decodes body into a Data value using an xml.Decoder that has
// strict parsing switched off.
//
// It returns the populated *Data on success, or nil together with the
// error reported by Decode.
func Run(body string) (*Data, error) {
// Wrap the payload bytes in a reader for the streaming decoder.
d := xml.NewDecoder(bytes.NewReader([]byte(body)))
// Fixture intent: this assignment is the "vulnerable" configuration the
// fixture exists to exhibit.
// NOTE(review): the encoding/xml documentation describes non-strict mode
// as leaving unknown entities alone rather than resolving them — confirm
// the harness-side probe, not the decoder, is what models expansion.
d.Strict = false
out := &Data{}
if err := d.Decode(out); err != nil {
return nil, err
}
return out, nil
}

View file

@ -0,0 +1,18 @@
// Phase 05 (Track J.3) — Java XXE benign fixture.
//
// Same parser surface as `vuln.java` but the factory is hardened with
// `disallow-doctype-decl`, so the same payload's `<!ENTITY>` block is
// rejected at parse time and no entity body is substituted.
import java.io.ByteArrayInputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
/**
 * Benign XXE fixture: parses caller-supplied bytes with a
 * {@code DocumentBuilderFactory} on which the
 * {@code disallow-doctype-decl} feature has been switched on.
 */
public class Benign {
/**
 * Parses {@code payload} into a DOM {@link Document}.
 *
 * @param payload raw XML bytes to parse
 * @return the document produced by the builder
 * @throws Exception propagated from configuring the factory, creating
 *         the builder, or parsing the payload
 */
public static Document run(byte[] payload) throws Exception {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
// Hardening step: enable the feature before the builder is created so
// it applies to the parser that handles the payload.
factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
DocumentBuilder builder = factory.newDocumentBuilder();
return builder.parse(new ByteArrayInputStream(payload));
}
}

View file

@ -0,0 +1,19 @@
// Phase 05 (Track J.3) — Java XXE vuln fixture.
//
// The function feeds attacker bytes to a stock `DocumentBuilderFactory`
// without setting `disallow-doctype-decl` or
// `XMLConstants.FEATURE_SECURE_PROCESSING`, so any
// `<!ENTITY xxe SYSTEM "file:///…">` declaration in the payload is
// resolved and its body substituted into the parsed tree.
import java.io.ByteArrayInputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
/**
 * Vulnerable XXE fixture: parses caller-supplied bytes with a
 * {@code DocumentBuilderFactory} that is used exactly as
 * {@code newInstance()} returned it.
 */
public class Vuln {
/**
 * Parses {@code payload} into a DOM {@link Document}.
 *
 * @param payload raw XML bytes to parse
 * @return the document produced by the builder
 * @throws Exception propagated from creating the builder or parsing
 *         the payload
 */
public static Document run(byte[] payload) throws Exception {
// Fixture intent: no feature is set on the factory before it builds the
// parser — do not "fix" this, the omission is what the fixture exhibits.
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
return builder.parse(new ByteArrayInputStream(payload));
}
}

View file

@ -0,0 +1,10 @@
<?php
// Phase 05 (Track J.3) — PHP XXE benign fixture.
//
// Same parser surface as `vuln.php` but the entity loader stays
// disabled and the LIBXML_NOENT flag is omitted, so the same payload's
// `<!ENTITY>` block is rejected and no entity body is substituted.
/**
 * Parses an XML string with SimpleXML after disabling the libxml
 * entity loader; no libxml option flags are passed.
 *
 * @param string $body XML document text.
 * @return SimpleXMLElement|false Result of simplexml_load_string().
 */
function run(string $body) {
// NOTE(review): libxml_disable_entity_loader() is deprecated as of
// PHP 8.0 — confirm the PHP version the harness runs this fixture on.
libxml_disable_entity_loader(true);
return simplexml_load_string($body);
}

View file

@ -0,0 +1,11 @@
<?php
// Phase 05 (Track J.3) — PHP XXE vuln fixture.
//
// The function pulls XML off the request and feeds it to
// `simplexml_load_string` after re-enabling the libxml entity loader
// — so any `<!ENTITY xxe SYSTEM "file:///…">` in the payload is
// resolved and its body substituted into the parsed document.
/**
 * Parses an XML string with SimpleXML after re-enabling the libxml
 * entity loader and requesting entity substitution.
 *
 * @param string $body XML document text.
 * @return SimpleXMLElement|false Result of simplexml_load_string().
 */
function run(string $body) {
// Fixture intent: passing `false` turns the entity loader back on.
libxml_disable_entity_loader(false);
// LIBXML_NOENT asks libxml to substitute entities; together with the
// call above this is the "vulnerable" configuration of the fixture.
return simplexml_load_string($body, "SimpleXMLElement", LIBXML_NOENT);
}

View file

@ -0,0 +1,12 @@
"""Phase 05 (Track J.3) — Python XXE benign fixture.
Same parser surface as `vuln.py` but the parser is configured with
`resolve_entities=False` and `no_network=True`, so the same payload's
`<!ENTITY>` block is rejected and no entity body is substituted.
"""
from lxml import etree
# Parse `body` with an lxml parser constructed with
# resolve_entities=False and no_network=True, and return whatever
# etree.fromstring() produces for it.
def run(body: bytes):
# Fixture intent: both hardening keyword arguments are passed explicitly.
parser = etree.XMLParser(resolve_entities=False, no_network=True)
return etree.fromstring(body, parser=parser)

View file

@ -0,0 +1,13 @@
"""Phase 05 (Track J.3) — Python XXE vuln fixture.
The function pulls XML bytes off the request and feeds them straight
to `lxml.etree.XMLParser(resolve_entities=True)`, so any
`<!ENTITY xxe SYSTEM "file:///…">` in the payload is resolved and its
body substituted into the parsed tree.
"""
from lxml import etree
# Parse `body` with an lxml parser constructed with
# resolve_entities=True, and return whatever etree.fromstring()
# produces for it.
def run(body: bytes):
# Fixture intent: entity resolution is requested explicitly and no other
# parser option is set — this is the "vulnerable" configuration.
parser = etree.XMLParser(resolve_entities=True)
return etree.fromstring(body, parser=parser)

View file

@ -0,0 +1,11 @@
# Phase 05 (Track J.3) — Ruby XXE benign fixture.
#
# Same parser surface as `vuln.rb` but the document is built under
# `REXML::Document::entity_expansion_limit = 0`, so the same payload's
# `<!ENTITY>` block triggers no expansion.
require 'rexml/document'
# Builds a REXML document from +body+ after setting the entity
# expansion limit to 0, and returns the document.
#
# NOTE(review): the limit is assigned on the REXML::Document class
# itself, so it presumably remains in effect after this method returns —
# confirm that is acceptable for whatever else shares the process.
def run(body)
REXML::Document.entity_expansion_limit = 0
REXML::Document.new(body)
end

View file

@ -0,0 +1,11 @@
# Phase 05 (Track J.3) — Ruby XXE vuln fixture.
#
# The function feeds attacker XML straight to `REXML::Document.new`
# without disabling entity expansion, so any `<!ENTITY xxe SYSTEM
# "file:///…">` in the payload is resolved and its body substituted
# into the parsed document.
require 'rexml/document'
# Builds a REXML document straight from +body+ and returns it.
#
# Fixture intent: no REXML setting is touched before parsing — the
# omission is what this fixture exhibits, so leave it as is.
def run(body)
REXML::Document.new(body)
end

294
tests/xxe_corpus.rs Normal file
View file

@ -0,0 +1,294 @@
//! Phase 05 (Track J.3) — XXE corpus acceptance.
//!
//! Asserts the new cap end-to-end: corpus slices register per-engine
//! vuln/benign pairs for Java / Python / PHP / Ruby / Go, the
//! lang-aware resolver pairs them inside the correct slice, the
//! per-language harness emitters splice in the synthetic XML parser +
//! entity-expansion probe + sink-hit sentinel, and the framework
//! adapters fire on the canonical sink call.
//!
//! Run with `cargo nextest run --features dynamic --test xxe_corpus`.
#![cfg(feature = "dynamic")]
use nyx_scanner::dynamic::corpus::{
audit_marker_collisions, benign_payload_for_lang, payloads_for_lang,
resolve_benign_control_lang, Oracle,
};
use nyx_scanner::dynamic::framework::registry::adapters_for;
use nyx_scanner::dynamic::lang;
use nyx_scanner::dynamic::oracle::ProbePredicate;
use nyx_scanner::dynamic::probe::ProbeKind;
use nyx_scanner::dynamic::spec::{EntryKind, HarnessSpec, PayloadSlot};
use nyx_scanner::labels::Cap;
use nyx_scanner::summary::FuncSummary;
use nyx_scanner::symbol::Lang;
/// Languages the per-language corpus tests in this file iterate over when
/// checking XXE payload registration.
const LANGS: &[Lang] = &[Lang::Java, Lang::Python, Lang::Php, Lang::Ruby, Lang::Go];
/// Builds a minimal XXE `HarnessSpec` for `lang`.
///
/// `entry_file` is used both as the entry file and as the sink file, and
/// `entry_name` names the function entry point. Every other field is a
/// fixed placeholder: the payload goes in parameter 0, the sink line is 1,
/// and no constraint hints, stubs, or framework binding are attached.
fn make_spec(lang: Lang, entry_file: &str, entry_name: &str) -> HarnessSpec {
    // The same placeholder string is used for the finding id and the spec hash.
    let placeholder_id = "phase05test0001";

    HarnessSpec {
        lang,
        expected_cap: Cap::XXE,
        toolchain_id: "phase05".into(),
        finding_id: placeholder_id.into(),
        spec_hash: placeholder_id.into(),
        entry_kind: EntryKind::Function,
        entry_name: entry_name.into(),
        entry_file: entry_file.into(),
        sink_file: entry_file.into(),
        sink_line: 1,
        payload_slot: PayloadSlot::Param(0),
        derivation: nyx_scanner::dynamic::spec::SpecDerivationStrategy::FromFlowSteps,
        constraint_hints: Vec::new(),
        stubs_required: Vec::new(),
        framework: None,
    }
}
/// Every language in `LANGS` must register a non-empty XXE slice holding
/// at least one vulnerable payload and at least one benign control.
#[test]
fn corpus_registers_xxe_for_every_supported_lang() {
    for &lang in LANGS {
        let payloads = payloads_for_lang(Cap::XXE, lang);
        assert!(!payloads.is_empty(), "XXE has no payloads for {lang:?}");

        // "Some payload is not benign" is the same as "not all are benign".
        let has_vuln = !payloads.iter().all(|payload| payload.is_benign);
        let has_benign = payloads.iter().any(|payload| payload.is_benign);
        assert!(has_vuln, "{lang:?} XXE missing vuln payload");
        assert!(has_benign, "{lang:?} XXE missing benign control");
    }
}
/// Languages outside the Phase 05 set must still have no XXE payloads.
#[test]
fn xxe_unsupported_caps_unchanged_for_other_langs() {
    // Phase 05 only fills Java / Python / PHP / Ruby / Go, so the XXE
    // slice for each of these languages is expected to stay empty.
    let untouched = [
        Lang::Rust,
        Lang::C,
        Lang::Cpp,
        Lang::JavaScript,
        Lang::TypeScript,
    ];

    for lang in untouched {
        let registered = payloads_for_lang(Cap::XXE, lang);
        assert!(
            registered.is_empty(),
            "unexpected XXE payloads registered for {lang:?}",
        );
    }
}
/// The control resolved for a language's vuln payload must be benign and
/// carry the same label as that language's direct benign lookup.
#[test]
fn benign_control_resolves_within_lang_slice() {
    for &lang in LANGS {
        let payloads = payloads_for_lang(Cap::XXE, lang);
        let vuln = payloads
            .iter()
            .find(|payload| !payload.is_benign)
            .unwrap();

        // Resolve the control through the pairing API first ...
        let resolved = resolve_benign_control_lang(vuln, Cap::XXE, lang)
            .expect("paired control");
        assert!(resolved.is_benign);

        // ... then compare it with the direct per-language lookup.
        let direct = benign_payload_for_lang(Cap::XXE, lang).unwrap();
        assert_eq!(direct.label, resolved.label);
    }
}
/// Each language's vuln payload must use a `SinkProbe` oracle whose
/// predicates include `XxeEntityExpanded { require_expanded: true }`.
#[test]
fn payload_oracle_carries_xxe_entity_expanded_predicate() {
    for &lang in LANGS {
        let payloads = payloads_for_lang(Cap::XXE, lang);
        let vuln = payloads
            .iter()
            .find(|payload| !payload.is_benign)
            .unwrap();

        // Any oracle other than `SinkProbe` fails the test outright.
        let carries_predicate = match &vuln.oracle {
            Oracle::SinkProbe { predicates } => predicates.iter().any(|p| {
                matches!(
                    p,
                    ProbePredicate::XxeEntityExpanded { require_expanded: true }
                )
            }),
            other => panic!("expected SinkProbe oracle for {lang:?}, got {other:?}"),
        };

        assert!(
            carries_predicate,
            "{lang:?} vuln payload missing XxeEntityExpanded{{require_expanded:true}}",
        );
    }
}
/// Pins the payload-shape invariants: the vuln payload text contains both
/// `<!ENTITY` and `SYSTEM`, and the benign control contains no `<!ENTITY`.
#[test]
fn vuln_payload_bytes_contain_doctype_entity_declaration() {
    // The differential rule depends on the vuln payload declaring an
    // `<!ENTITY … SYSTEM "…">` and the benign control declaring none, so
    // both halves are asserted here to catch a corpus edit that would
    // otherwise break the oracle silently.
    for &lang in LANGS {
        let payloads = payloads_for_lang(Cap::XXE, lang);
        let vuln = payloads.iter().find(|payload| !payload.is_benign).unwrap();
        let benign = payloads.iter().find(|payload| payload.is_benign).unwrap();

        let vuln_text = std::str::from_utf8(vuln.bytes).unwrap();
        let benign_text = std::str::from_utf8(benign.bytes).unwrap();

        let vuln_declares_system_entity =
            vuln_text.contains("<!ENTITY") && vuln_text.contains("SYSTEM");
        assert!(
            vuln_declares_system_entity,
            "{lang:?} vuln payload must declare a SYSTEM entity",
        );

        let benign_declares_entity = benign_text.contains("<!ENTITY");
        assert!(
            !benign_declares_entity,
            "{lang:?} benign control must not declare an entity",
        );
    }
}
/// The marker-collision audit must report nothing: the collection returned
/// by `audit_marker_collisions` is required to be empty.
#[test]
fn marker_collisions_clean_with_phase_05_additions() {
assert!(audit_marker_collisions().is_empty());
}
/// `ProbeKind::Xxe` must survive a JSON round trip, and its serialized
/// form must mention both the variant name and the `entity_expanded` field.
#[test]
fn probe_kind_xxe_serdes() {
    let expected = ProbeKind::Xxe { entity_expanded: true };

    let json = serde_json::to_string(&expected).unwrap();
    assert!(json.contains("Xxe"));
    assert!(json.contains("entity_expanded"));

    let round_tripped = serde_json::from_str::<ProbeKind>(&json).unwrap();
    assert_eq!(round_tripped, expected);
}
/// For each language's vuln fixture, the emitted harness source must carry
/// the `entity_expanded` probe field, name the expected parser sink callee,
/// emit the `__NYX_SINK_HIT__` sentinel, and mention `ENTITY`.
#[test]
fn lang_emitter_dispatches_to_xxe_harness() {
    // Per-lang `sink_callee_marker` pins which parser-construction
    // string the harness names in its probe record — the
    // `DocumentBuilder.parse` / `lxml.etree.XMLParser` /
    // `simplexml_load_string` / `REXML::Document.new` /
    // `xml.Decoder.Decode` boundary the brief calls out.
    //
    // Columns: language, fixture path, entry-point name, sink callee marker.
    let cases = [
        (
            Lang::Java,
            "tests/dynamic_fixtures/xxe/java/vuln.java",
            "run",
            "DocumentBuilder.parse",
        ),
        (
            Lang::Python,
            "tests/dynamic_fixtures/xxe/python/vuln.py",
            "run",
            "lxml.etree.XMLParser.parse",
        ),
        (
            Lang::Php,
            "tests/dynamic_fixtures/xxe/php/vuln.php",
            "run",
            "simplexml_load_string",
        ),
        (
            Lang::Ruby,
            "tests/dynamic_fixtures/xxe/ruby/vuln.rb",
            "run",
            "REXML::Document.new",
        ),
        (
            Lang::Go,
            "tests/dynamic_fixtures/xxe/go/vuln.go",
            "Run",
            "xml.Decoder.Decode",
        ),
    ];

    for (lang, entry_file, entry_name, sink_callee_marker) in cases {
        let spec = make_spec(lang, entry_file, entry_name);
        let harness = lang::emit(&spec)
            .unwrap_or_else(|e| panic!("emit failed for {lang:?}: {e:?}"));

        assert!(
            harness.source.contains("entity_expanded"),
            "{lang:?} xxe harness must carry the entity_expanded probe field",
        );
        assert!(
            harness.source.contains(sink_callee_marker),
            "{lang:?} xxe harness must name {sink_callee_marker:?} as the parser sink callee",
        );
        assert!(
            harness.source.contains("__NYX_SINK_HIT__"),
            "{lang:?} xxe harness must emit the sink-hit sentinel",
        );
        // Any text containing `<!ENTITY` also contains `ENTITY`, so the
        // former `<!ENTITY || ENTITY` disjunction reduced to this one check;
        // it is written directly so the assertion reads as strong as it is.
        assert!(
            harness.source.contains("ENTITY"),
            "{lang:?} xxe harness must include the entity-detection scanner",
        );
    }
}
/// For each language's vuln fixture, `detect_binding` must return a
/// function-kind binding with a non-empty adapter name when the summary
/// lists the fixture's parser call as a callee.
#[test]
fn framework_adapters_detect_xxe_sink() {
    // Each lang registers its J.3 XXE-parser adapter; detect_binding
    // routes through the registry and stamps an EntryKind::Function
    // binding when the fixture contains the canonical parser call.
    //
    // Columns: language, fixture path, bare callee recorded on the summary.
    let cases = [
        (
            Lang::Java,
            "tests/dynamic_fixtures/xxe/java/vuln.java",
            "parse",
        ),
        (
            Lang::Python,
            "tests/dynamic_fixtures/xxe/python/vuln.py",
            "fromstring",
        ),
        (
            Lang::Php,
            "tests/dynamic_fixtures/xxe/php/vuln.php",
            "simplexml_load_string",
        ),
        (
            Lang::Ruby,
            "tests/dynamic_fixtures/xxe/ruby/vuln.rb",
            "new",
        ),
        (
            Lang::Go,
            "tests/dynamic_fixtures/xxe/go/vuln.go",
            "NewDecoder",
        ),
    ];

    for (lang, fixture, sink_callee) in cases {
        // Parse the fixture source with the matching tree-sitter grammar.
        let bytes = std::fs::read(fixture).expect("fixture exists");
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&ts_language_for(lang)).unwrap();
        let tree = parser.parse(&bytes, None).unwrap();

        // Hand-build the function summary with the sink as its only callee.
        // NOTE(review): the name is "run" for every language although the
        // Go fixture's entry point is `Run` — confirm detect_binding does
        // not key on the summary name.
        let mut summary = FuncSummary {
            lang: slug(lang).into(),
            file_path: fixture.to_owned(),
            name: "run".into(),
            ..Default::default()
        };
        summary
            .callees
            .push(nyx_scanner::summary::CalleeSite::bare(sink_callee));

        assert!(
            !adapters_for(lang).is_empty(),
            "{lang:?} adapter slice empty"
        );

        let b = nyx_scanner::dynamic::framework::detect_binding(
            &summary,
            tree.root_node(),
            &bytes,
            lang,
        )
        .unwrap_or_else(|| panic!("{lang:?} adapter must detect the XXE fixture"));
        assert_eq!(b.kind, EntryKind::Function);
        assert!(!b.adapter.is_empty());
    }
}
/// Returns the tree-sitter grammar used to parse fixtures written in `lang`.
///
/// # Panics
///
/// Panics for any language other than Java, Python, PHP, Ruby, or Go.
fn ts_language_for(lang: Lang) -> tree_sitter::Language {
    match lang {
        Lang::Go => tree_sitter_go::LANGUAGE.into(),
        Lang::Java => tree_sitter_java::LANGUAGE.into(),
        Lang::Php => tree_sitter_php::LANGUAGE_PHP.into(),
        Lang::Python => tree_sitter_python::LANGUAGE.into(),
        Lang::Ruby => tree_sitter_ruby::LANGUAGE.into(),
        other => panic!("unsupported test lang {other:?}"),
    }
}
/// Returns the lowercase name used for `lang` in a `FuncSummary`'s `lang`
/// field, or `"other"` for languages this test file does not cover.
fn slug(lang: Lang) -> &'static str {
    match lang {
        Lang::Go => "go",
        Lang::Java => "java",
        Lang::Php => "php",
        Lang::Python => "python",
        Lang::Ruby => "ruby",
        // Everything outside the five XXE languages shares one bucket.
        _ => "other",
    }
}