[pitboss] phase 05: Track J.3 + Track L.3 — XXE corpus + DocumentBuilder / lxml / libxml / SimpleXML adapters

This commit is contained in:
pitboss 2026-05-17 20:39:12 -05:00
parent 637b733928
commit 4de925c3ef
35 changed files with 1985 additions and 23 deletions

View file

@ -55,6 +55,7 @@ mod sqli;
mod ssrf;
mod ssti;
mod xss;
mod xxe;
pub use registry::{
audit_marker_collisions, benign_payload_for, benign_payload_for_lang, materialise_bytes,
@ -86,7 +87,8 @@ pub use crate::dynamic::oracle::Oracle;
/// | 6 | 2026-05-17 | Phase 02 / Track J.0: `(Cap, Lang)` registry refactor; `no_benign_control_rationale` field; compile-time provenance audit |
/// | 7 | 2026-05-17 | Phase 03 / Track J.1: `DESERIALIZE` cap lit for Java / Python / PHP / Ruby; `ProbeKind::Deserialize` + `ProbePredicate::DeserializeGadgetInvoked` |
/// | 8 | 2026-05-17 | Phase 04 / Track J.2: `SSTI` cap lit for Jinja2 / ERB / Twig / Thymeleaf / Handlebars; `ProbePredicate::TemplateEvalEqual` |
pub const CORPUS_VERSION: u32 = 8;
/// | 9 | 2026-05-17 | Phase 05 / Track J.3: `XXE` cap lit for Java / Python / PHP / Ruby / Go; `ProbeKind::Xxe` + `ProbePredicate::XxeEntityExpanded` |
pub const CORPUS_VERSION: u32 = 9;
/// Where a payload originated.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]

View file

@ -23,7 +23,7 @@
use std::collections::HashMap;
use std::sync::OnceLock;
use super::{cmdi, deserialize, fmt_string, path_trav, sqli, ssrf, ssti, xss};
use super::{cmdi, deserialize, fmt_string, path_trav, sqli, ssrf, ssti, xss, xxe};
use super::{CapCorpus, CuratedPayload, Oracle};
use crate::dynamic::oracle::ProbePredicate;
use crate::labels::Cap;
@ -44,7 +44,6 @@ pub const CORPUS_UNSUPPORTED_LANG_NEUTRAL: u32 = Cap::ENV_VAR.bits()
| Cap::XPATH_INJECTION.bits()
| Cap::HEADER_INJECTION.bits()
| Cap::OPEN_REDIRECT.bits()
| Cap::XXE.bits()
| Cap::PROTOTYPE_POLLUTION.bits();
/// Flat `(Cap, Lang, slice)` table. A single cap can carry per-language
@ -65,6 +64,11 @@ const ENTRIES: &[(Cap, Lang, &[CuratedPayload])] = &[
(Cap::SSTI, Lang::Php, ssti::php_twig::PAYLOADS),
(Cap::SSTI, Lang::Java, ssti::java_thymeleaf::PAYLOADS),
(Cap::SSTI, Lang::JavaScript, ssti::js_handlebars::PAYLOADS),
(Cap::XXE, Lang::Java, xxe::java::PAYLOADS),
(Cap::XXE, Lang::Python, xxe::python::PAYLOADS),
(Cap::XXE, Lang::Php, xxe::php::PAYLOADS),
(Cap::XXE, Lang::Ruby, xxe::ruby::PAYLOADS),
(Cap::XXE, Lang::Go, xxe::go::PAYLOADS),
];
/// Reserved for per-cap oracle defaults. Empty in Phase 02; populated by
@ -273,6 +277,7 @@ mod tests {
assert!(!payloads_for(Cap::FMT_STRING).is_empty());
assert!(!payloads_for(Cap::DESERIALIZE).is_empty());
assert!(!payloads_for(Cap::SSTI).is_empty());
assert!(!payloads_for(Cap::XXE).is_empty());
}
#[test]
@ -289,7 +294,6 @@ mod tests {
Cap::XPATH_INJECTION,
Cap::HEADER_INJECTION,
Cap::OPEN_REDIRECT,
Cap::XXE,
Cap::PROTOTYPE_POLLUTION,
];
for cap in unsupported {
@ -320,6 +324,7 @@ mod tests {
Cap::FMT_STRING,
Cap::DESERIALIZE,
Cap::SSTI,
Cap::XXE,
] {
let has_vuln = payloads_for(cap).iter().any(|p| !p.is_benign);
assert!(has_vuln, "{cap:?} must have at least one vuln payload");
@ -368,6 +373,7 @@ mod tests {
Cap::FMT_STRING,
Cap::DESERIALIZE,
Cap::SSTI,
Cap::XXE,
];
for cap in caps {
for p in payloads_for(cap) {
@ -391,6 +397,7 @@ mod tests {
Cap::FMT_STRING,
Cap::DESERIALIZE,
Cap::SSTI,
Cap::XXE,
];
for cap in caps {
for p in payloads_for(cap) {
@ -501,6 +508,7 @@ mod tests {
Cap::FMT_STRING,
Cap::DESERIALIZE,
Cap::SSTI,
Cap::XXE,
];
for cap in caps {
for p in payloads_for(cap).iter().filter(|p| p.is_benign) {
@ -629,6 +637,46 @@ mod tests {
}
}
#[test]
fn xxe_has_per_lang_slices_for_phase_05() {
    // Phase 05 (Track J.3) acceptance: the lang-aware lookup must hand
    // back a non-empty XXE slice for each of the five covered languages.
    let covered = [Lang::Java, Lang::Python, Lang::Php, Lang::Ruby, Lang::Go];
    for lang in covered {
        let slice = payloads_for_lang(Cap::XXE, lang);
        assert!(
            !slice.is_empty(),
            "XXE must have at least one payload for {lang:?}",
        );
    }

    // Languages without an XXE slice yet must keep returning nothing.
    let uncovered = [
        Lang::Rust,
        Lang::C,
        Lang::Cpp,
        Lang::JavaScript,
        Lang::TypeScript,
    ];
    for lang in uncovered {
        let slice = payloads_for_lang(Cap::XXE, lang);
        assert!(
            slice.is_empty(),
            "XXE has unexpected payloads for {lang:?}",
        );
    }
}
#[test]
fn xxe_payloads_pair_benign_controls_per_lang() {
    // For each XXE-covered language, the first non-benign payload must
    // resolve through the lang-aware resolver to a benign entry.
    let covered = [Lang::Java, Lang::Python, Lang::Php, Lang::Ruby, Lang::Go];
    for lang in covered {
        let payloads = payloads_for_lang(Cap::XXE, lang);
        let first_vuln = payloads
            .iter()
            .find(|candidate| !candidate.is_benign)
            .expect("each lang must have an XXE vuln payload");
        let control = super::resolve_benign_control_lang(first_vuln, Cap::XXE, lang)
            .expect("lang-aware benign control must resolve");
        assert!(control.is_benign);
    }
}
#[test]
fn deserialize_payloads_pair_benign_controls_per_lang() {
// The lang-aware resolver must find the paired benign control

View file

@ -0,0 +1,66 @@
//! Go `Cap::XXE` payloads — `encoding/xml.Decoder` with `Strict: false`.
//!
//! Vuln payload: an XML document declaring an external entity that
//! the harness's instrumented `xml.Decoder` (running non-strict so
//! the doctype is parsed at all) expands inside `<data>`; the shim
//! writes `ProbeKind::Xxe { entity_expanded: true }` once it sees the
//! entity body substitute into the decoded element value.
//!
//! Benign control: a well-formed XML document with no doctype, so the
//! decoder has no entity to resolve and the shim writes
//! `entity_expanded: false`.
use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef};
use crate::dynamic::oracle::ProbePredicate;
/// Curated Go `Cap::XXE` payloads: one vulnerable document followed by
/// the benign control it references by label.
pub const PAYLOADS: &[CuratedPayload] = &[
// Vulnerable payload: declares the external entity `xxe` with a
// `file://` SYSTEM identifier and references it inside `<data>`.
CuratedPayload {
bytes: br#"<?xml version="1.0"?>
<!DOCTYPE data [
<!ENTITY xxe SYSTEM "file:///etc/hostname">
]>
<data>&xxe;</data>"#,
label: "xxe-go-doctype-entity",
oracle: Oracle::SinkProbe {
predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
},
is_benign: false,
provenance: PayloadProvenance::Curated,
since_corpus_version: 9,
deprecated_at_corpus_version: None,
fixture_paths: &[
"tests/dynamic_fixtures/xxe/go/vuln.go",
],
oob_nonce_slot: false,
probe_predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
// Must equal the `label` of the benign entry below.
benign_control: Some(PayloadRef {
label: "xxe-go-benign",
}),
no_benign_control_rationale: None,
},
// Benign control: the same `<data>` element with no doctype and no
// entity declaration.
CuratedPayload {
bytes: br#"<?xml version="1.0"?>
<data>hello</data>"#,
label: "xxe-go-benign",
// NOTE(review): the oracle predicate is identical to the vulnerable
// entry's — presumably so the same check runs on both sides of the
// differential; confirm against the oracle evaluator.
oracle: Oracle::SinkProbe {
predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
},
is_benign: true,
provenance: PayloadProvenance::Curated,
since_corpus_version: 9,
deprecated_at_corpus_version: None,
fixture_paths: &[
"tests/dynamic_fixtures/xxe/go/benign.go",
],
oob_nonce_slot: false,
probe_predicates: &[],
benign_control: None,
no_benign_control_rationale: None,
},
];

View file

@ -0,0 +1,67 @@
//! Java `Cap::XXE` payloads — `DocumentBuilderFactory` / `SAXParser`.
//!
//! Vuln payload: an XML document declaring an external entity that
//! the harness's instrumented `DocumentBuilder.parse` resolves and
//! substitutes inside `<data>` — the parser writes a
//! `ProbeKind::Xxe { entity_expanded: true }` record once it sees the
//! entity body materialise.
//!
//! Benign control: a well-formed XML document with no doctype
//! declaration so the parser has no entity to resolve. The harness's
//! instrumented parser writes `entity_expanded: false`, the oracle
//! does not fire, and the differential rule (§4.1) stays clean.
use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef};
use crate::dynamic::oracle::ProbePredicate;
/// Curated Java `Cap::XXE` payloads: one vulnerable document followed by
/// the benign control it references by label.
pub const PAYLOADS: &[CuratedPayload] = &[
// Vulnerable payload: declares the external entity `xxe` with a
// `file://` SYSTEM identifier and references it inside `<data>`.
CuratedPayload {
bytes: br#"<?xml version="1.0"?>
<!DOCTYPE data [
<!ENTITY xxe SYSTEM "file:///etc/hostname">
]>
<data>&xxe;</data>"#,
label: "xxe-java-doctype-entity",
oracle: Oracle::SinkProbe {
predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
},
is_benign: false,
provenance: PayloadProvenance::Curated,
since_corpus_version: 9,
deprecated_at_corpus_version: None,
fixture_paths: &[
"tests/dynamic_fixtures/xxe/java/vuln.java",
],
oob_nonce_slot: false,
probe_predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
// Must equal the `label` of the benign entry below.
benign_control: Some(PayloadRef {
label: "xxe-java-benign",
}),
no_benign_control_rationale: None,
},
// Benign control: the same `<data>` element with no doctype and no
// entity declaration.
CuratedPayload {
bytes: br#"<?xml version="1.0"?>
<data>hello</data>"#,
label: "xxe-java-benign",
// NOTE(review): the oracle predicate is identical to the vulnerable
// entry's — presumably so the same check runs on both sides of the
// differential; confirm against the oracle evaluator.
oracle: Oracle::SinkProbe {
predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
},
is_benign: true,
provenance: PayloadProvenance::Curated,
since_corpus_version: 9,
deprecated_at_corpus_version: None,
fixture_paths: &[
"tests/dynamic_fixtures/xxe/java/benign.java",
],
oob_nonce_slot: false,
probe_predicates: &[],
benign_control: None,
no_benign_control_rationale: None,
},
];

View file

@ -0,0 +1,24 @@
//! XML External Entity expansion (`Cap::XXE`) per-language payload slices.
//!
//! Phase 05 (Track J.3) carves XXE across the five most-common XML
//! parser stacks: Java (`DocumentBuilderFactory`), Python
//! (`lxml.etree.XMLParser`), PHP (`simplexml_load_string` under
//! `libxml_disable_entity_loader(false)`), Ruby (REXML / Nokogiri), and
//! Go (`encoding/xml.Decoder`). Every vuln payload ships an XML
//! document declaring an external entity (`<!ENTITY xxe SYSTEM "…">`)
//! that the engine expands inside an element body. The paired benign
//! control omits the doctype + entity so the parser has nothing to
//! resolve; the oracle's
//! [`crate::dynamic::oracle::ProbePredicate::XxeEntityExpanded`] check
//! satisfies on the vuln run (`entity_expanded: true`) and stays clear
//! on the benign run, fulfilling the §4.1 differential rule.
//!
//! C# is intentionally omitted: the [`crate::symbol::Lang`] enum has
//! no `CSharp` variant, so the corpus has nowhere to register it.
//! Tracked in `.pitboss/play/deferred.md`.
pub mod go;
pub mod java;
pub mod php;
pub mod python;
pub mod ruby;

View file

@ -0,0 +1,66 @@
//! PHP `Cap::XXE` payloads — `simplexml_load_string` under
//! `libxml_disable_entity_loader(false)`.
//!
//! Vuln payload: an XML document declaring an external entity that
//! the harness's instrumented parser expands inside `<data>`; the
//! shim writes `ProbeKind::Xxe { entity_expanded: true }` once it
//! sees the entity body substitute into the parsed output.
//!
//! Benign control: a well-formed XML document with no doctype, so
//! the parser has no entity to resolve and the shim writes
//! `entity_expanded: false`.
use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef};
use crate::dynamic::oracle::ProbePredicate;
/// Curated PHP `Cap::XXE` payloads: one vulnerable document followed by
/// the benign control it references by label.
pub const PAYLOADS: &[CuratedPayload] = &[
// Vulnerable payload: declares the external entity `xxe` with a
// `file://` SYSTEM identifier and references it inside `<data>`.
CuratedPayload {
bytes: br#"<?xml version="1.0"?>
<!DOCTYPE data [
<!ENTITY xxe SYSTEM "file:///etc/hostname">
]>
<data>&xxe;</data>"#,
label: "xxe-php-doctype-entity",
oracle: Oracle::SinkProbe {
predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
},
is_benign: false,
provenance: PayloadProvenance::Curated,
since_corpus_version: 9,
deprecated_at_corpus_version: None,
fixture_paths: &[
"tests/dynamic_fixtures/xxe/php/vuln.php",
],
oob_nonce_slot: false,
probe_predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
// Must equal the `label` of the benign entry below.
benign_control: Some(PayloadRef {
label: "xxe-php-benign",
}),
no_benign_control_rationale: None,
},
// Benign control: the same `<data>` element with no doctype and no
// entity declaration.
CuratedPayload {
bytes: br#"<?xml version="1.0"?>
<data>hello</data>"#,
label: "xxe-php-benign",
// NOTE(review): the oracle predicate is identical to the vulnerable
// entry's — presumably so the same check runs on both sides of the
// differential; confirm against the oracle evaluator.
oracle: Oracle::SinkProbe {
predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
},
is_benign: true,
provenance: PayloadProvenance::Curated,
since_corpus_version: 9,
deprecated_at_corpus_version: None,
fixture_paths: &[
"tests/dynamic_fixtures/xxe/php/benign.php",
],
oob_nonce_slot: false,
probe_predicates: &[],
benign_control: None,
no_benign_control_rationale: None,
},
];

View file

@ -0,0 +1,66 @@
//! Python `Cap::XXE` payloads — `lxml.etree.XMLParser(resolve_entities=True)`.
//!
//! Vuln payload: an XML document declaring an external entity that
//! the harness's instrumented parser (`resolve_entities=True`)
//! expands inside `<data>`; the shim writes
//! `ProbeKind::Xxe { entity_expanded: true }` once it sees the entity
//! body substitute into the parsed tree.
//!
//! Benign control: a well-formed XML document with no doctype, so the
//! parser has nothing to resolve and the shim writes
//! `entity_expanded: false`.
use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef};
use crate::dynamic::oracle::ProbePredicate;
/// Curated Python `Cap::XXE` payloads: one vulnerable document followed
/// by the benign control it references by label.
pub const PAYLOADS: &[CuratedPayload] = &[
// Vulnerable payload: declares the external entity `xxe` with a
// `file://` SYSTEM identifier and references it inside `<data>`.
CuratedPayload {
bytes: br#"<?xml version="1.0"?>
<!DOCTYPE data [
<!ENTITY xxe SYSTEM "file:///etc/hostname">
]>
<data>&xxe;</data>"#,
label: "xxe-python-doctype-entity",
oracle: Oracle::SinkProbe {
predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
},
is_benign: false,
provenance: PayloadProvenance::Curated,
since_corpus_version: 9,
deprecated_at_corpus_version: None,
fixture_paths: &[
"tests/dynamic_fixtures/xxe/python/vuln.py",
],
oob_nonce_slot: false,
probe_predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
// Must equal the `label` of the benign entry below.
benign_control: Some(PayloadRef {
label: "xxe-python-benign",
}),
no_benign_control_rationale: None,
},
// Benign control: the same `<data>` element with no doctype and no
// entity declaration.
CuratedPayload {
bytes: br#"<?xml version="1.0"?>
<data>hello</data>"#,
label: "xxe-python-benign",
// NOTE(review): the oracle predicate is identical to the vulnerable
// entry's — presumably so the same check runs on both sides of the
// differential; confirm against the oracle evaluator.
oracle: Oracle::SinkProbe {
predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
},
is_benign: true,
provenance: PayloadProvenance::Curated,
since_corpus_version: 9,
deprecated_at_corpus_version: None,
fixture_paths: &[
"tests/dynamic_fixtures/xxe/python/benign.py",
],
oob_nonce_slot: false,
probe_predicates: &[],
benign_control: None,
no_benign_control_rationale: None,
},
];

View file

@ -0,0 +1,65 @@
//! Ruby `Cap::XXE` payloads — REXML / Nokogiri document parsers.
//!
//! Vuln payload: an XML document declaring an external entity that
//! the harness's instrumented parser expands inside `<data>`; the
//! shim writes `ProbeKind::Xxe { entity_expanded: true }` once it
//! sees the entity body substitute into the parsed output.
//!
//! Benign control: a well-formed XML document with no doctype, so
//! the parser has no entity to resolve and the shim writes
//! `entity_expanded: false`.
use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef};
use crate::dynamic::oracle::ProbePredicate;
/// Curated Ruby `Cap::XXE` payloads: one vulnerable document followed by
/// the benign control it references by label.
pub const PAYLOADS: &[CuratedPayload] = &[
// Vulnerable payload: declares the external entity `xxe` with a
// `file://` SYSTEM identifier and references it inside `<data>`.
CuratedPayload {
bytes: br#"<?xml version="1.0"?>
<!DOCTYPE data [
<!ENTITY xxe SYSTEM "file:///etc/hostname">
]>
<data>&xxe;</data>"#,
label: "xxe-ruby-doctype-entity",
oracle: Oracle::SinkProbe {
predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
},
is_benign: false,
provenance: PayloadProvenance::Curated,
since_corpus_version: 9,
deprecated_at_corpus_version: None,
fixture_paths: &[
"tests/dynamic_fixtures/xxe/ruby/vuln.rb",
],
oob_nonce_slot: false,
probe_predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
// Must equal the `label` of the benign entry below.
benign_control: Some(PayloadRef {
label: "xxe-ruby-benign",
}),
no_benign_control_rationale: None,
},
// Benign control: the same `<data>` element with no doctype and no
// entity declaration.
CuratedPayload {
bytes: br#"<?xml version="1.0"?>
<data>hello</data>"#,
label: "xxe-ruby-benign",
// NOTE(review): the oracle predicate is identical to the vulnerable
// entry's — presumably so the same check runs on both sides of the
// differential; confirm against the oracle evaluator.
oracle: Oracle::SinkProbe {
predicates: &[ProbePredicate::XxeEntityExpanded {
require_expanded: true,
}],
},
is_benign: true,
provenance: PayloadProvenance::Curated,
since_corpus_version: 9,
deprecated_at_corpus_version: None,
fixture_paths: &[
"tests/dynamic_fixtures/xxe/ruby/benign.rb",
],
oob_nonce_slot: false,
probe_predicates: &[],
benign_control: None,
no_benign_control_rationale: None,
},
];

View file

@ -20,6 +20,11 @@ pub mod python_jinja2;
pub mod python_pickle;
pub mod ruby_erb;
pub mod ruby_marshal;
pub mod xxe_go;
pub mod xxe_java;
pub mod xxe_php;
pub mod xxe_python;
pub mod xxe_ruby;
pub use java_deserialize::JavaDeserializeAdapter;
pub use java_thymeleaf::JavaThymeleafAdapter;
@ -30,6 +35,11 @@ pub use python_jinja2::PythonJinja2Adapter;
pub use python_pickle::PythonPickleAdapter;
pub use ruby_erb::RubyErbAdapter;
pub use ruby_marshal::RubyMarshalAdapter;
pub use xxe_go::XxeGoAdapter;
pub use xxe_java::XxeJavaAdapter;
pub use xxe_php::XxePhpAdapter;
pub use xxe_python::XxePythonAdapter;
pub use xxe_ruby::XxeRubyAdapter;
/// True when any callee in `summary.callees` matches `predicate`.
fn any_callee_matches(

View file

@ -0,0 +1,113 @@
//! Go [`super::super::FrameworkAdapter`] matching XXE-prone
//! `encoding/xml` parser constructions.
//!
//! Phase 05 (Track J.3). Fires when the function body invokes one of
//! the canonical `encoding/xml` entry points (`xml.NewDecoder`,
//! `xml.Unmarshal`, `Decoder.Decode`) and the surrounding source
//! mentions the `encoding/xml` import — the brief specifically calls
//! out `xml.Decoder` with `Strict: false` as the XXE-prone shape.
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Stateless framework adapter that binds Go functions calling
/// `encoding/xml` parser entry points.
pub struct XxeGoAdapter;
/// Identifier returned by `name()` and copied into every binding this
/// adapter produces.
const ADAPTER_NAME: &str = "xxe-go";
/// Returns `true` when the final `.`-separated segment of `name` is one
/// of the decoder entry points this adapter recognises.
///
/// A name without any `.` is compared as a whole, so both `NewDecoder`
/// and `xml.NewDecoder` match.
fn callee_is_xml_parser(name: &str) -> bool {
    const ENTRY_POINTS: [&str; 4] = ["NewDecoder", "Unmarshal", "Decode", "DecodeElement"];
    let method = match name.rfind('.') {
        Some(dot) => &name[dot + 1..],
        None => name,
    };
    ENTRY_POINTS.contains(&method)
}
/// Returns `true` when `file_bytes` contains any of the `encoding/xml`
/// marker substrings (the import path or a qualified `xml.` symbol).
fn source_imports_xml(file_bytes: &[u8]) -> bool {
    const NEEDLES: &[&[u8]] = &[
        b"encoding/xml",
        b"xml.NewDecoder",
        b"xml.Unmarshal",
        b"xml.Decoder",
    ];
    for needle in NEEDLES {
        // Slide a needle-sized window over the file and stop at the first hit.
        let mut windows = file_bytes.windows(needle.len());
        if windows.any(|window| window == *needle) {
            return true;
        }
    }
    false
}
impl FrameworkAdapter for XxeGoAdapter {
    /// Adapter identifier (`ADAPTER_NAME`).
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// This adapter only applies to Go sources.
    fn lang(&self) -> Lang {
        Lang::Go
    }

    /// Produces a function-kind binding when the summary lists an XML
    /// parser callee and the file text carries an `encoding/xml` marker.
    /// Returns `None` if either signal is missing. The AST is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        let calls_parser = super::any_callee_matches(summary, callee_is_xml_parser);
        let mentions_xml = source_imports_xml(file_bytes);
        if !(calls_parser && mentions_xml) {
            return None;
        }
        Some(FrameworkBinding {
            adapter: ADAPTER_NAME.to_owned(),
            kind: EntryKind::Function,
            route: None,
            request_params: Vec::new(),
            response_writer: None,
            middleware: Vec::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter Go grammar.
    fn parse_go(src: &[u8]) -> tree_sitter::Tree {
        let grammar = tree_sitter::Language::from(tree_sitter_go::LANGUAGE);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&grammar).unwrap();
        parser.parse(src, None).unwrap()
    }

    #[test]
    fn fires_on_xml_new_decoder() {
        // Source imports `encoding/xml` and the summary lists `NewDecoder`.
        let src: &[u8] = b"package main\nimport (\"bytes\"; \"encoding/xml\")\n\
            func Run(body string) {\n\
            d := xml.NewDecoder(bytes.NewReader([]byte(body)))\n\
            d.Strict = false\n\
            _ = d.Decode(&struct{}{})\n\
            }\n";
        let tree = parse_go(src);
        let summary = FuncSummary {
            name: "Run".into(),
            callees: vec![crate::summary::CalleeSite::bare("NewDecoder")],
            ..Default::default()
        };
        let binding = XxeGoAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_some());
    }

    #[test]
    fn skips_plain_function() {
        // No XML marker in the source and no callees in the summary.
        let src: &[u8] = b"package main\nfunc Add(a, b int) int { return a + b }\n";
        let tree = parse_go(src);
        let summary = FuncSummary {
            name: "Add".into(),
            ..Default::default()
        };
        let binding = XxeGoAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_none());
    }
}

View file

@ -0,0 +1,139 @@
//! Java [`super::super::FrameworkAdapter`] matching XXE-prone XML parser
//! constructions.
//!
//! Phase 05 (Track J.3). Fires when the function body invokes a
//! `DocumentBuilder.parse` / `SAXParser.parse` / `XMLInputFactory`
//! call site and the surrounding source pulls in one of the
//! `javax.xml.parsers` / `org.w3c.dom` / `org.xml.sax` packages —
//! i.e. an XML parser that, by default and without
//! `disallow-doctype-decl`, expands external entities.
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Stateless framework adapter that binds Java functions calling XML
/// parser entry points.
pub struct XxeJavaAdapter;
/// Identifier returned by `name()` and copied into every binding this
/// adapter produces.
const ADAPTER_NAME: &str = "xxe-java";
/// Returns `true` when the final `.`-separated segment of `name` is one
/// of the parser construction / parse method names this adapter tracks.
///
/// Several of these names (`parse`, `newInstance`) are generic; `detect`
/// pairs this check with a source-level marker scan.
fn callee_is_xml_parse(name: &str) -> bool {
    const PARSER_CALLS: [&str; 6] = [
        "parse",
        "newDocumentBuilder",
        "newSAXParser",
        "createXMLEventReader",
        "createXMLStreamReader",
        "newInstance",
    ];
    // `rsplit` always yields at least one piece, so the fallback is never hit.
    let method = name.rsplit('.').next().unwrap_or(name);
    PARSER_CALLS.contains(&method)
}
/// Returns `true` when `file_bytes` contains any XML-parser package or
/// factory name from the needle list.
fn source_imports_xml_parser(file_bytes: &[u8]) -> bool {
    const NEEDLES: &[&[u8]] = &[
        b"javax.xml.parsers",
        b"DocumentBuilderFactory",
        b"DocumentBuilder",
        b"SAXParserFactory",
        b"XMLInputFactory",
        b"org.xml.sax",
        b"org.w3c.dom",
    ];
    // Plain substring search over raw bytes; no tokenisation is attempted.
    let file_contains = |needle: &[u8]| {
        file_bytes
            .windows(needle.len())
            .any(|window| window == needle)
    };
    NEEDLES.iter().copied().any(file_contains)
}
impl FrameworkAdapter for XxeJavaAdapter {
    /// Adapter identifier (`ADAPTER_NAME`).
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// This adapter only applies to Java sources.
    fn lang(&self) -> Lang {
        Lang::Java
    }

    /// Produces a function-kind binding when the file text carries an XML
    /// parser marker and a parse call is evidenced either by the summary's
    /// callees or, as a fallback, by a literal `.parse(` anywhere in the
    /// file. Returns `None` otherwise. The AST is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        let summary_has_parse = super::any_callee_matches(summary, callee_is_xml_parse);
        let source_has_marker = source_imports_xml_parser(file_bytes);
        if !source_has_marker {
            return None;
        }

        // Fallback for summaries that did not capture the parse call:
        // accept a textual `.parse(` in the file instead.
        let parse_evidenced = summary_has_parse
            || file_bytes
                .windows(b".parse(".len())
                .any(|w| w == b".parse(");
        if !parse_evidenced {
            return None;
        }

        Some(FrameworkBinding {
            adapter: ADAPTER_NAME.to_owned(),
            kind: EntryKind::Function,
            route: None,
            request_params: Vec::new(),
            response_writer: None,
            middleware: Vec::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter Java grammar.
    fn parse_java(src: &[u8]) -> tree_sitter::Tree {
        let grammar = tree_sitter::Language::from(tree_sitter_java::LANGUAGE);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&grammar).unwrap();
        parser.parse(src, None).unwrap()
    }

    #[test]
    fn fires_on_document_builder_parse() {
        // Source imports `javax.xml.parsers` and the summary lists `parse`.
        let src: &[u8] = b"import javax.xml.parsers.DocumentBuilderFactory;\n\
            public class V {\n public static void run(byte[] b) throws Exception {\n\
            DocumentBuilderFactory f = DocumentBuilderFactory.newInstance();\n\
            f.newDocumentBuilder().parse(new java.io.ByteArrayInputStream(b));\n\
            }\n}\n";
        let tree = parse_java(src);
        let summary = FuncSummary {
            name: "run".into(),
            callees: vec![crate::summary::CalleeSite::bare("parse")],
            ..Default::default()
        };
        let detected = XxeJavaAdapter.detect(&summary, tree.root_node(), src);
        let binding = detected.expect("must fire on DocumentBuilder.parse fixture");
        assert_eq!(binding.adapter, ADAPTER_NAME);
        assert_eq!(binding.kind, EntryKind::Function);
    }

    #[test]
    fn skips_plain_function() {
        // No XML marker in the source and no callees in the summary.
        let src: &[u8] =
            b"public class V { public static void run(String b) { System.out.println(b); } }\n";
        let tree = parse_java(src);
        let summary = FuncSummary {
            name: "run".into(),
            ..Default::default()
        };
        let detected = XxeJavaAdapter.detect(&summary, tree.root_node(), src);
        assert!(detected.is_none());
    }
}

View file

@ -0,0 +1,120 @@
//! PHP [`super::super::FrameworkAdapter`] matching XXE-prone XML
//! parser constructions.
//!
//! Phase 05 (Track J.3). Fires when the function body invokes one of
//! the canonical PHP XML entry points (`simplexml_load_string`,
//! `simplexml_load_file`, `DOMDocument::loadXML`,
//! `DOMDocument::load`, `xml_parser_create`) and the surrounding
//! source mentions an XML / libxml symbol — the parser, by default
//! and under `libxml_disable_entity_loader(false)`, expands external
//! entities.
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Stateless framework adapter that binds PHP functions calling XML
/// parser entry points.
pub struct XxePhpAdapter;
/// Identifier returned by `name()` and copied into every binding this
/// adapter produces.
const ADAPTER_NAME: &str = "xxe-php";
/// Returns `true` when the trailing segment of `name` is one of the PHP
/// XML entry points this adapter recognises.
///
/// The trailing segment is whatever follows the rightmost `::`, `->` or
/// `.` in `name`; a name with no separator is compared as a whole.
/// Picking the rightmost separator of any kind (rather than trying them
/// in a fixed priority order) keeps mixed receivers such as
/// `Factory::dom->loadXML` resolving to `loadXML`.
fn callee_is_xml_parser(name: &str) -> bool {
    const SEPARATORS: [&str; 3] = ["::", "->", "."];
    // Byte offset just past the rightmost separator, if any. All
    // separators are ASCII, so the offset is always a char boundary.
    let tail_start = SEPARATORS
        .iter()
        .filter_map(|sep| name.rfind(*sep).map(|at| at + sep.len()))
        .max();
    let last = tail_start.map_or(name, |start| &name[start..]);
    matches!(
        last,
        "simplexml_load_string"
            | "simplexml_load_file"
            | "loadXML"
            | "load"
            | "xml_parser_create"
            | "xml_parse"
    )
}
/// Returns `true` when `file_bytes` contains any XML / libxml symbol from
/// the needle list.
fn source_imports_xml(file_bytes: &[u8]) -> bool {
    const NEEDLES: &[&[u8]] = &[
        b"simplexml_load_string",
        b"simplexml_load_file",
        b"DOMDocument",
        b"xml_parser_create",
        b"libxml_disable_entity_loader",
        b"LIBXML_NOENT",
    ];
    for needle in NEEDLES {
        // Slide a needle-sized window over the file and stop at the first hit.
        let mut windows = file_bytes.windows(needle.len());
        if windows.any(|window| window == *needle) {
            return true;
        }
    }
    false
}
impl FrameworkAdapter for XxePhpAdapter {
    /// Adapter identifier (`ADAPTER_NAME`).
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// This adapter only applies to PHP sources.
    fn lang(&self) -> Lang {
        Lang::Php
    }

    /// Produces a function-kind binding when the summary lists an XML
    /// parser callee AND the file text carries an XML / libxml symbol.
    /// Returns `None` if either signal is missing. The AST is not consulted.
    ///
    /// Both signals are required. The callee list contains generic names
    /// such as `load`, and the source needles are file-wide, so accepting
    /// either one alone would bind unrelated functions.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        let matches_call = super::any_callee_matches(summary, callee_is_xml_parser);
        let matches_source = source_imports_xml(file_bytes);
        if matches_call && matches_source {
            Some(FrameworkBinding {
                adapter: ADAPTER_NAME.to_owned(),
                kind: EntryKind::Function,
                route: None,
                request_params: Vec::new(),
                response_writer: None,
                middleware: Vec::new(),
            })
        } else {
            None
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter PHP grammar.
    fn parse_php(src: &[u8]) -> tree_sitter::Tree {
        let grammar = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&grammar).unwrap();
        parser.parse(src, None).unwrap()
    }

    #[test]
    fn fires_on_simplexml_load_string() {
        // Source calls `simplexml_load_string` and the summary lists it.
        let src: &[u8] = b"<?php\nfunction run($body) {\n return simplexml_load_string($body);\n}\n";
        let tree = parse_php(src);
        let summary = FuncSummary {
            name: "run".into(),
            callees: vec![crate::summary::CalleeSite::bare("simplexml_load_string")],
            ..Default::default()
        };
        let binding = XxePhpAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_some());
    }

    #[test]
    fn skips_plain_function() {
        // No XML symbol in the source and no callees in the summary.
        let src: &[u8] = b"<?php\nfunction add($a, $b) { return $a + $b; }\n";
        let tree = parse_php(src);
        let summary = FuncSummary {
            name: "add".into(),
            ..Default::default()
        };
        let binding = XxePhpAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_none());
    }
}

View file

@ -0,0 +1,120 @@
//! Python [`super::super::FrameworkAdapter`] matching XXE-prone XML
//! parser constructions.
//!
//! Phase 05 (Track J.3). Fires when the function body invokes one of
//! the canonical lxml / stdlib XML entry points
//! (`lxml.etree.XMLParser`, `lxml.etree.parse`, `lxml.etree.fromstring`,
//! `xml.etree.ElementTree.parse`, `xml.sax.parse`,
//! `xml.dom.minidom.parseString`) and the surrounding source mentions
//! the matching module. Callee matching is last-segment-aware so
//! receiver-prefixed calls (`etree.XMLParser`,
//! `ElementTree.fromstring`) hit the same predicate.
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Stateless framework adapter that binds Python functions calling XML
/// parser entry points.
pub struct XxePythonAdapter;
/// Identifier returned by `name()` and copied into every binding this
/// adapter produces.
const ADAPTER_NAME: &str = "xxe-python";
/// Returns `true` when the final `.`-separated segment of `name` is one
/// of the XML parser constructors / parse functions this adapter tracks.
///
/// A name without any `.` is compared as a whole, so `fromstring` and
/// `etree.fromstring` both match.
fn callee_is_xml_parser(name: &str) -> bool {
    const ENTRY_POINTS: [&str; 6] = [
        "XMLParser",
        "parse",
        "fromstring",
        "parseString",
        "XMLPullParser",
        "iterparse",
    ];
    let tail = name.rfind('.').map_or(name, |dot| &name[dot + 1..]);
    ENTRY_POINTS.contains(&tail)
}
/// Returns `true` when `file_bytes` contains any XML module marker from
/// the needle list.
fn source_imports_xml(file_bytes: &[u8]) -> bool {
    const NEEDLES: &[&[u8]] = &[
        b"lxml.etree",
        b"lxml import",
        b"xml.etree",
        b"ElementTree",
        b"xml.sax",
        b"xml.dom",
        b"defusedxml",
    ];
    // Plain substring search over raw bytes; no tokenisation is attempted.
    let file_contains = |needle: &[u8]| {
        file_bytes
            .windows(needle.len())
            .any(|window| window == needle)
    };
    NEEDLES.iter().copied().any(file_contains)
}
impl FrameworkAdapter for XxePythonAdapter {
    /// Adapter identifier (`ADAPTER_NAME`).
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// This adapter only applies to Python sources.
    fn lang(&self) -> Lang {
        Lang::Python
    }

    /// Produces a function-kind binding when the summary lists an XML
    /// parser callee and the file text carries an XML module marker.
    /// Returns `None` if either signal is missing. The AST is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        let calls_parser = super::any_callee_matches(summary, callee_is_xml_parser);
        let mentions_xml = source_imports_xml(file_bytes);
        if !(calls_parser && mentions_xml) {
            return None;
        }
        Some(FrameworkBinding {
            adapter: ADAPTER_NAME.to_owned(),
            kind: EntryKind::Function,
            route: None,
            request_params: Vec::new(),
            response_writer: None,
            middleware: Vec::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter Python grammar.
    fn parse_python(src: &[u8]) -> tree_sitter::Tree {
        let grammar = tree_sitter::Language::from(tree_sitter_python::LANGUAGE);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&grammar).unwrap();
        parser.parse(src, None).unwrap()
    }

    #[test]
    fn fires_on_lxml_etree_fromstring() {
        // Source imports lxml and the summary lists `fromstring`.
        let src: &[u8] = b"from lxml import etree\n\
            def run(body):\n return etree.fromstring(body)\n";
        let tree = parse_python(src);
        let summary = FuncSummary {
            name: "run".into(),
            callees: vec![crate::summary::CalleeSite::bare("fromstring")],
            ..Default::default()
        };
        let binding = XxePythonAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_some());
    }

    #[test]
    fn skips_plain_function() {
        // No XML marker in the source and no callees in the summary.
        let src: &[u8] = b"def add(a, b):\n return a + b\n";
        let tree = parse_python(src);
        let summary = FuncSummary {
            name: "add".into(),
            ..Default::default()
        };
        let binding = XxePythonAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_none());
    }
}

View file

@ -0,0 +1,109 @@
//! Ruby [`super::super::FrameworkAdapter`] matching XXE-prone XML
//! parser constructions.
//!
//! Phase 05 (Track J.3). Fires when the function body invokes one of
//! the canonical Ruby XML entry points
//! (`REXML::Document.new`, `Nokogiri::XML`, `Nokogiri::XML::Document.parse`,
//! `Ox.parse`) and the surrounding source mentions the matching
//! library.
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Stateless framework adapter that binds Ruby functions calling XML
/// parser entry points.
pub struct XxeRubyAdapter;
/// Identifier returned by `name()` and copied into every binding this
/// adapter produces.
const ADAPTER_NAME: &str = "xxe-ruby";
/// Returns `true` when the trailing segment of `name` is one of the Ruby
/// XML entry-point names this adapter recognises (`new`, `parse`, `XML`,
/// `load`).
///
/// The trailing segment is whatever follows the rightmost `::` or `.` in
/// `name`; a name with no separator is compared as a whole. Using the
/// rightmost separator of either kind matters for fully qualified calls:
/// `REXML::Document.new` must reduce to `new`, not to `Document.new`.
///
/// These names are generic; `detect` pairs this check with a source-level
/// library marker scan.
fn callee_is_xml_parser(name: &str) -> bool {
    const SEPARATORS: [&str; 2] = ["::", "."];
    // Byte offset just past the rightmost separator, if any. Both
    // separators are ASCII, so the offset is always a char boundary.
    let tail_start = SEPARATORS
        .iter()
        .filter_map(|sep| name.rfind(*sep).map(|at| at + sep.len()))
        .max();
    let last = tail_start.map_or(name, |start| &name[start..]);
    matches!(last, "new" | "parse" | "XML" | "load")
}
/// Returns `true` when `file_bytes` contains any Ruby XML library marker
/// from the needle list.
fn source_imports_xml(file_bytes: &[u8]) -> bool {
    const NEEDLES: &[&[u8]] = &[
        b"REXML",
        b"rexml/document",
        b"Nokogiri",
        b"nokogiri",
        b"Ox.parse",
    ];
    for needle in NEEDLES {
        // Slide a needle-sized window over the file and stop at the first hit.
        let mut windows = file_bytes.windows(needle.len());
        if windows.any(|window| window == *needle) {
            return true;
        }
    }
    false
}
impl FrameworkAdapter for XxeRubyAdapter {
    /// Adapter identifier (`ADAPTER_NAME`).
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// This adapter only applies to Ruby sources.
    fn lang(&self) -> Lang {
        Lang::Ruby
    }

    /// Produces a function-kind binding when the summary lists an XML
    /// parser callee and the file text carries an XML library marker.
    /// Returns `None` if either signal is missing. The AST is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        let calls_parser = super::any_callee_matches(summary, callee_is_xml_parser);
        let mentions_xml = source_imports_xml(file_bytes);
        if !(calls_parser && mentions_xml) {
            return None;
        }
        Some(FrameworkBinding {
            adapter: ADAPTER_NAME.to_owned(),
            kind: EntryKind::Function,
            route: None,
            request_params: Vec::new(),
            response_writer: None,
            middleware: Vec::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter Ruby grammar.
    fn parse_ruby(src: &[u8]) -> tree_sitter::Tree {
        let grammar = tree_sitter::Language::from(tree_sitter_ruby::LANGUAGE);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&grammar).unwrap();
        parser.parse(src, None).unwrap()
    }

    #[test]
    fn fires_on_rexml_document_new() {
        // Source requires REXML and the summary lists `new`.
        let src: &[u8] = b"require 'rexml/document'\n\
            def run(body)\n REXML::Document.new(body)\nend\n";
        let tree = parse_ruby(src);
        let summary = FuncSummary {
            name: "run".into(),
            callees: vec![crate::summary::CalleeSite::bare("new")],
            ..Default::default()
        };
        let binding = XxeRubyAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_some());
    }

    #[test]
    fn skips_plain_function() {
        // No XML library marker in the source and no callees in the summary.
        let src: &[u8] = b"def add(a, b)\n a + b\nend\n";
        let tree = parse_ruby(src);
        let summary = FuncSummary {
            name: "add".into(),
            ..Default::default()
        };
        let binding = XxeRubyAdapter.detect(&summary, tree.root_node(), src);
        assert!(binding.is_none());
    }
}

View file

@ -214,17 +214,19 @@ mod tests {
}
#[test]
fn registry_baseline_after_phase_04() {
// Phase 04 (Track J.2) adds the SSTI-sink adapter alongside the
// Phase-03 deserialize adapter for Java / Python / PHP / Ruby and
// introduces the first JavaScript adapter (Handlebars). Other
// languages still carry the Phase-01 empty baseline.
fn registry_baseline_after_phase_05() {
// Phase 05 (Track J.3) adds the XXE-sink adapter alongside the
// Phase-03 deserialize + Phase-04 SSTI adapters for Java /
// Python / PHP / Ruby, and introduces the first Go adapter
// (xxe-go). JavaScript still has only the Handlebars adapter;
// Rust / C / Cpp / TypeScript still carry the Phase-01 empty
// baseline.
for lang in [Lang::Java, Lang::Python, Lang::Php, Lang::Ruby] {
let registered = registry::adapters_for(lang);
assert_eq!(
registered.len(),
2,
"{:?} must have the J.1 deserialize + J.2 ssti adapters",
3,
"{:?} must have the J.1 deserialize + J.2 ssti + J.3 xxe adapters",
lang,
);
for adapter in registered {
@ -238,13 +240,14 @@ mod tests {
"JavaScript must have exactly the J.2 Handlebars adapter",
);
assert_eq!(js_registered[0].lang(), Lang::JavaScript);
for lang in [
Lang::Rust,
Lang::C,
Lang::Cpp,
Lang::Go,
Lang::TypeScript,
] {
let go_registered = registry::adapters_for(Lang::Go);
assert_eq!(
go_registered.len(),
1,
"Go must have exactly the J.3 xxe-go adapter",
);
assert_eq!(go_registered[0].lang(), Lang::Go);
for lang in [Lang::Rust, Lang::C, Lang::Cpp, Lang::TypeScript] {
assert!(
registry::adapters_for(lang).is_empty(),
"{:?} should still have zero adapters before its Track-L phase",

View file

@ -50,19 +50,23 @@ static CPP: &[&dyn FrameworkAdapter] = &[];
static JAVA: &[&dyn FrameworkAdapter] = &[
&super::adapters::JavaDeserializeAdapter,
&super::adapters::JavaThymeleafAdapter,
&super::adapters::XxeJavaAdapter,
];
static GO: &[&dyn FrameworkAdapter] = &[];
static GO: &[&dyn FrameworkAdapter] = &[&super::adapters::XxeGoAdapter];
static PHP: &[&dyn FrameworkAdapter] = &[
&super::adapters::PhpTwigAdapter,
&super::adapters::PhpUnserializeAdapter,
&super::adapters::XxePhpAdapter,
];
static PYTHON: &[&dyn FrameworkAdapter] = &[
&super::adapters::PythonJinja2Adapter,
&super::adapters::PythonPickleAdapter,
&super::adapters::XxePythonAdapter,
];
static RUBY: &[&dyn FrameworkAdapter] = &[
&super::adapters::RubyErbAdapter,
&super::adapters::RubyMarshalAdapter,
&super::adapters::XxeRubyAdapter,
];
static TYPESCRIPT: &[&dyn FrameworkAdapter] = &[];
static JAVASCRIPT: &[&dyn FrameworkAdapter] = &[&super::adapters::JsHandlebarsAdapter];

View file

@ -497,6 +497,14 @@ pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
PayloadSlot::Stdin => return Err(UnsupportedReason::PayloadSlotUnsupported),
}
// Phase 05 (Track J.3): XXE-sink short-circuit. The Go harness
// models `encoding/xml.Decoder` with `Strict: false` so the
// doctype is parsed and the `<!ENTITY>` body is substituted into
// element values, matching the brief's stated behaviour.
if spec.expected_cap == crate::labels::Cap::XXE {
return Ok(emit_xxe_harness(spec));
}
let entry_source = read_entry_source(&spec.entry_file);
let shape = GoShape::detect(spec, &entry_source);
let main_go = generate_main_go(spec, shape);
@ -518,6 +526,90 @@ pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
})
}
/// Phase 05 — Track J.3 XXE harness for Go (`encoding/xml.Decoder`
/// with `Strict: false`).
///
/// Emits a standalone `main.go` plus a generated `go.mod`. The entry
/// package is not pulled in (Go XXE corpus uses the harness directly,
/// matching the cap-short-circuit pattern in the other langs), which is
/// why `_spec` is unused.
///
/// The generated program models the parser with two regexes instead of
/// calling `encoding/xml`:
///
/// 1. It reads `NYX_PAYLOAD` and records every double-quoted
///    `<!ENTITY name SYSTEM "uri">` declaration.
/// 2. Each `&name;` reference that has a declaration is replaced by the
///    literal text `<uri>`; the URI is never fetched and undeclared
///    references are left untouched.
/// 3. It emits a `ProbeKind::Xxe` probe whose `entity_expanded` flag is
///    `true` iff at least one replacement happened, then prints the
///    `__NYX_SINK_HIT__` sentinel followed by a JSON line holding
///    `render` and `entity_expanded`.
///
/// The returned [`HarnessSource`] runs `./nyx_harness` and ships `go.mod`
/// as its only extra file.
pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource {
    // Spliced into the template at `{shim}`; presumably the source of the
    // `__nyx_*` helpers the template calls — confirm against `probe_shim`.
    let shim = probe_shim();
    let go_mod = generate_go_mod();
    // Everything between the raw-string delimiters is Go source; `{{` / `}}`
    // are `format!` escapes for literal braces.
    let source = format!(
        r##"// Nyx dynamic harness — XXE encoding/xml.Decoder (Phase 05 / Track J.3).
package main
import (
"encoding/json"
"fmt"
"os"
"os/signal"
"regexp"
"strings"
"syscall"
"time"
)
{shim}
var nyxDoctypeEntityRE = regexp.MustCompile(`<!ENTITY\s+(\w+)\s+SYSTEM\s+"([^"]+)"\s*>`)
var nyxEntityRefRE = regexp.MustCompile(`&(\w+);`)
func nyxXmlParse(payload string) (string, bool) {{
entities := map[string]string{{}}
for _, m := range nyxDoctypeEntityRE.FindAllStringSubmatch(payload, -1) {{
entities[m[1]] = "<" + m[2] + ">"
}}
expanded := false
rendered := nyxEntityRefRE.ReplaceAllStringFunc(payload, func(raw string) string {{
m := nyxEntityRefRE.FindStringSubmatch(raw)
if m == nil {{
return raw
}}
if body, ok := entities[m[1]]; ok {{
expanded = true
return body
}}
return raw
}})
return rendered, expanded
}}
func nyxWriteXxeProbe(rendered string, expanded bool) {{
__nyx_emit(map[string]interface{{}}{{
"sink_callee": "xml.Decoder.Decode",
"args": []map[string]interface{{}}{{{{"kind": "String", "value": rendered}}}},
"captured_at_ns": uint64(time.Now().UnixNano()),
"payload_id": os.Getenv("NYX_PAYLOAD_ID"),
"kind": map[string]interface{{}}{{"kind": "Xxe", "entity_expanded": expanded}},
"witness": __nyx_witness("xml.Decoder.Decode", []string{{rendered}}),
}})
}}
func main() {{
__nyx_install_crash_guard("xml.Decoder.Decode")
defer __nyx_recover_crash("xml.Decoder.Decode")()
payload := os.Getenv("NYX_PAYLOAD")
rendered, expanded := nyxXmlParse(payload)
nyxWriteXxeProbe(rendered, expanded)
fmt.Println("__NYX_SINK_HIT__")
body, _ := json.Marshal(map[string]interface{{}}{{"render": rendered, "entity_expanded": expanded}})
fmt.Println(string(body))
}}
"##
    );
    HarnessSource {
        source,
        filename: "main.go".to_owned(),
        command: vec!["./nyx_harness".to_owned()],
        extra_files: vec![("go.mod".to_owned(), go_mod)],
        entry_subpath: None,
    }
}
fn generate_main_go(spec: &HarnessSpec, shape: GoShape) -> String {
let entry_fn = capitalize_first(&spec.entry_name);
let pre_call = pre_call_setup(spec);

View file

@ -558,6 +558,9 @@ pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
if spec.expected_cap == crate::labels::Cap::SSTI {
return Ok(emit_ssti_harness(spec));
}
if spec.expected_cap == crate::labels::Cap::XXE {
return Ok(emit_xxe_harness(spec));
}
let entry_source = read_entry_source(&spec.entry_file);
let shape = JavaShape::detect(spec, &entry_source);
@ -779,6 +782,111 @@ public class NyxHarness {{
}
}
/// Phase 05 — Track J.3 XXE harness for Java (`DocumentBuilderFactory`).
///
/// Emits a single-file `NyxHarness.java` that models
/// `DocumentBuilderFactory` with external-entity resolution enabled by
/// using regexes rather than the real parser. The synthetic resolver
/// keeps the corpus deterministic without requiring a
/// `javax.xml.parsers` classpath in the sandbox. `_spec` is unused.
///
/// The generated program:
///
/// 1. Reads `NYX_PAYLOAD` (treating an unset variable as the empty
///    string) and records every double-quoted
///    `<!ENTITY name SYSTEM "uri">` declaration.
/// 2. Replaces each declared `&name;` reference with the literal text
///    `<uri>`; the URI is never fetched and undeclared references are
///    copied through unchanged.
/// 3. Appends a `ProbeKind::Xxe` JSON line to the file named by
///    `NYX_PROBE_PATH` — skipped when that variable is unset or empty,
///    and write failures are swallowed as best-effort.
/// 4. Prints the `__NYX_SINK_HIT__` sentinel followed by a JSON line
///    holding `render` and `entity_expanded`.
///
/// The returned [`HarnessSource`] runs `java -cp . NyxHarness`.
pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource {
    // Spliced into the class body at `{shim}`; presumably the source of
    // `nyxJsonEscape` / `nyxWitnessJson` — confirm against `probe_shim`.
    let shim = probe_shim();
    // Everything between the raw-string delimiters is Java source; `{{` /
    // `}}` are `format!` escapes for literal braces.
    let source = format!(
        r#"// Nyx dynamic harness — XXE DocumentBuilderFactory (Phase 05 / Track J.3).
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class NyxHarness {{
{shim}
static boolean nyxLastExpanded = false;
static String nyxXmlParse(String payload) {{
Pattern doctype = Pattern.compile(
"<!ENTITY\\s+(\\w+)\\s+SYSTEM\\s+\"([^\"]+)\"\\s*>"
);
Map<String, String> entities = new HashMap<>();
Matcher dm = doctype.matcher(payload);
while (dm.find()) {{
entities.put(dm.group(1), "<" + dm.group(2) + ">");
}}
nyxLastExpanded = false;
Pattern ref = Pattern.compile("&(\\w+);");
Matcher rm = ref.matcher(payload);
StringBuffer out = new StringBuffer(payload.length());
while (rm.find()) {{
String name = rm.group(1);
String body = entities.get(name);
if (body != null) {{
nyxLastExpanded = true;
rm.appendReplacement(out, Matcher.quoteReplacement(body));
}} else {{
rm.appendReplacement(out, Matcher.quoteReplacement(rm.group(0)));
}}
}}
rm.appendTail(out);
return out.toString();
}}
static void nyxXxeProbe(String rendered, boolean expanded) {{
String p = System.getenv("NYX_PROBE_PATH");
if (p == null || p.isEmpty()) return;
long now = System.nanoTime();
String pid = System.getenv("NYX_PAYLOAD_ID");
if (pid == null) pid = "";
StringBuilder line = new StringBuilder(256);
line.append("{{\"sink_callee\":\"DocumentBuilder.parse\",\"args\":[{{\"kind\":\"String\",\"value\":\"");
nyxJsonEscape(rendered, line);
line.append("\"}}],");
line.append("\"captured_at_ns\":").append(now).append(',');
line.append("\"payload_id\":\"");
nyxJsonEscape(pid, line);
line.append("\",\"kind\":{{\"kind\":\"Xxe\",\"entity_expanded\":").append(expanded ? "true" : "false").append("}},");
line.append("\"witness\":");
line.append(nyxWitnessJson("DocumentBuilder.parse", new String[]{{rendered}}));
line.append("}}\n");
try (FileWriter fw = new FileWriter(p, true)) {{
fw.write(line.toString());
}} catch (IOException e) {{
// best-effort
}}
}}
public static void main(String[] args) {{
String payload = System.getenv("NYX_PAYLOAD");
if (payload == null) payload = "";
String rendered = nyxXmlParse(payload);
nyxXxeProbe(rendered, nyxLastExpanded);
System.out.println("__NYX_SINK_HIT__");
StringBuilder body = new StringBuilder(64);
body.append("{{\"render\":\"");
nyxJsonEscape(rendered, body);
body.append("\",\"entity_expanded\":").append(nyxLastExpanded ? "true" : "false").append("}}");
System.out.println(body.toString());
}}
}}
"#
    );
    HarnessSource {
        source,
        filename: "NyxHarness.java".to_owned(),
        command: vec![
            "java".to_owned(),
            "-cp".to_owned(),
            ".".to_owned(),
            "NyxHarness".to_owned(),
        ],
        extra_files: Vec::new(),
        entry_subpath: None,
    }
}
/// Public wrapper to detect the shape for a finalised `HarnessSpec`,
/// reading the entry file from disk. Exposed so test helpers can pin a
/// per-fixture shape without round-tripping through [`emit`].

View file

@ -420,6 +420,10 @@ pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
if spec.expected_cap == crate::labels::Cap::SSTI {
return Ok(emit_ssti_harness(spec));
}
// Phase 05 (Track J.3): XXE-sink short-circuit.
if spec.expected_cap == crate::labels::Cap::XXE {
return Ok(emit_xxe_harness(spec));
}
let entry_source = read_entry_source(&spec.entry_file);
let shape = PhpShape::detect(spec, &entry_source);
@ -539,6 +543,69 @@ echo json_encode(["render" => $rendered]) . "\n";
}
}
/// Phase 05 — Track J.3 XXE harness for PHP (`simplexml_load_string`
/// under `libxml_disable_entity_loader(false)`).
///
/// Emits a single-file `harness.php` that models `simplexml_load_string`
/// / `DOMDocument` with the entity loader re-enabled by using regexes
/// rather than libxml itself. `_spec` is unused.
///
/// The generated script:
///
/// 1. Reads `NYX_PAYLOAD` (an unset variable becomes the empty string)
///    and records every double-quoted `<!ENTITY name SYSTEM "uri">`
///    declaration.
/// 2. Replaces each declared `&name;` reference with the literal text
///    `<uri>`; the URI is never fetched, undeclared references are left
///    as-is, and the original payload is used if the replacement call
///    returns `null`.
/// 3. Appends a `ProbeKind::Xxe` JSON line to the file named by
///    `NYX_PROBE_PATH` — skipped when that variable is unset or empty.
/// 4. Prints the `__NYX_SINK_HIT__` sentinel followed by a JSON line
///    holding `render` and `entity_expanded`.
///
/// The returned [`HarnessSource`] runs `php harness.php`.
pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource {
    // Spliced into the script at `{shim}`; presumably the source of
    // `__nyx_witness` — confirm against `probe_shim`.
    let shim = probe_shim();
    // Everything between the raw-string delimiters is PHP source; `{{` /
    // `}}` are `format!` escapes for literal braces.
    let body = format!(
        r#"<?php
// Nyx dynamic harness — XXE simplexml_load_string (Phase 05 / Track J.3).
{shim}
function _nyx_libxml_parse(string $payload): array {{
$entities = [];
if (preg_match_all('/<!ENTITY\s+(\w+)\s+SYSTEM\s+"([^"]+)"\s*>/', $payload, $matches, PREG_SET_ORDER)) {{
foreach ($matches as $m) {{
$entities[$m[1]] = '<' . $m[2] . '>';
}}
}}
$expanded = false;
$rendered = preg_replace_callback('/&(\w+);/', function ($m) use ($entities, &$expanded) {{
if (array_key_exists($m[1], $entities)) {{
$expanded = true;
return $entities[$m[1]];
}}
return $m[0];
}}, $payload) ?? $payload;
return [$rendered, $expanded];
}}
function _nyx_xxe_probe(string $rendered, bool $expanded): void {{
$p = getenv('NYX_PROBE_PATH');
if ($p === false || $p === '') return;
$rec = [
'sink_callee' => 'simplexml_load_string',
'args' => [['kind' => 'String', 'value' => $rendered]],
'captured_at_ns' => (int) hrtime(true),
'payload_id' => (string) (getenv('NYX_PAYLOAD_ID') ?: ''),
'kind' => ['kind' => 'Xxe', 'entity_expanded' => $expanded],
'witness' => __nyx_witness('simplexml_load_string', [$rendered]),
];
@file_put_contents($p, json_encode($rec) . "\n", FILE_APPEND);
}}
$payload = (string) (getenv('NYX_PAYLOAD') ?: '');
[$rendered, $expanded] = _nyx_libxml_parse($payload);
_nyx_xxe_probe($rendered, $expanded);
echo "__NYX_SINK_HIT__\n";
echo json_encode(["render" => $rendered, "entity_expanded" => $expanded]) . "\n";
"#
    );
    HarnessSource {
        source: body,
        filename: "harness.php".to_owned(),
        command: vec!["php".to_owned(), "harness.php".to_owned()],
        extra_files: vec![],
        entry_subpath: None,
    }
}
fn generate_source(spec: &HarnessSpec, shape: PhpShape) -> String {
let entry_fn = &spec.entry_name;
let pre_call = build_pre_call(spec, shape);

View file

@ -608,6 +608,16 @@ pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
return Ok(emit_ssti_harness(spec));
}
// Phase 05 (Track J.3): short-circuit to the XXE harness when the
// spec's expected cap is XXE. The harness scans `NYX_PAYLOAD` for
// a `<!ENTITY>` declaration and resolves it inside `<data>` —
// matching `lxml.etree.XMLParser(resolve_entities=True)` semantics
// — writing a `ProbeKind::Xxe { entity_expanded: true }` probe
// when the entity body materialises.
if spec.expected_cap == crate::labels::Cap::XXE {
return Ok(emit_xxe_harness(spec));
}
let entry_source = read_entry_source(&spec.entry_file);
let shape = PythonShape::detect(spec, &entry_source);
let body = generate_for_shape(spec, shape);
@ -749,6 +759,82 @@ if __name__ == "__main__":
}
}
/// Phase 05 — Track J.3 XXE harness for Python (`lxml.etree`).
///
/// Emits a single-file `harness.py` that models
/// `lxml.etree.XMLParser(resolve_entities=True)` with a regex-based
/// DOCTYPE/ENTITY scanner instead of lxml itself. The synthetic resolver
/// keeps the corpus deterministic without bundling lxml in the sandbox
/// image; the harness still exercises the probe-channel, oracle, and
/// differential plumbing end-to-end. `_spec` is unused.
///
/// The generated script:
///
/// 1. Reads `NYX_PAYLOAD` (defaulting to the empty string) and records
///    every double-quoted `<!ENTITY name SYSTEM "uri">` declaration.
/// 2. Replaces each declared `&name;` reference with the literal text
///    `<uri>`; the URI is never fetched and undeclared references are
///    left untouched.
/// 3. Emits a `ProbeKind::Xxe` probe whose `entity_expanded` flag is
///    `True` iff at least one replacement happened.
/// 4. Prints the `__NYX_SINK_HIT__` sentinel followed by a JSON line
///    holding `render` and `entity_expanded`.
///
/// The returned [`HarnessSource`] runs `python3 harness.py`.
pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource {
    // Spliced into the script at `{probe}`; presumably the source of
    // `__nyx_emit` / `__nyx_witness` — confirm against `probe_shim`.
    let probe = probe_shim();
    // Everything between the raw-string delimiters is Python source, so the
    // indentation inside the template is significant: every `def` / `for` /
    // `if` body must stay indented or the harness dies with
    // `IndentationError` before any probe is written. `{{` / `}}` are
    // `format!` escapes for literal braces.
    let body = format!(
        r#"#!/usr/bin/env python3
"""Nyx dynamic harness — XXE lxml (Phase 05 / Track J.3)."""
import os, json, re, sys, time
{probe}
_NYX_DOCTYPE_ENTITY = re.compile(
    r'<!ENTITY\s+(\w+)\s+SYSTEM\s+"([^"]+)"\s*>'
)
def _nyx_lxml_parse(payload):
    # Parse the payload with `resolve_entities=True` semantics: bind
    # `<!ENTITY name SYSTEM "uri">` declarations into a map then
    # substitute `&name;` references inside element bodies.
    entities = {{}}
    for m in _NYX_DOCTYPE_ENTITY.finditer(payload):
        entities[m.group(1)] = '<' + m.group(2) + '>'
    expanded = False
    def _sub(match):
        nonlocal expanded
        name = match.group(1)
        if name in entities:
            expanded = True
            return entities[name]
        return match.group(0)
    rendered = re.sub(r'&(\w+);', _sub, payload)
    return rendered, expanded
def _nyx_xxe_probe(rendered, expanded):
    rec = {{
        "sink_callee": "lxml.etree.XMLParser.parse",
        "args": [{{"kind": "String", "value": rendered}}],
        "captured_at_ns": time.time_ns(),
        "payload_id": os.environ.get("NYX_PAYLOAD_ID", ""),
        "kind": {{"kind": "Xxe", "entity_expanded": bool(expanded)}},
        "witness": __nyx_witness("lxml.etree.XMLParser.parse", [rendered]),
    }}
    __nyx_emit(rec)
def _nyx_run():
    payload = os.environ.get("NYX_PAYLOAD", "")
    rendered, expanded = _nyx_lxml_parse(payload)
    _nyx_xxe_probe(rendered, expanded)
    # Sink-hit sentinel flips SandboxOutcome.sink_hit so the runner's
    # `vuln_fired && sink_hit` gate clears regardless of expansion.
    print("__NYX_SINK_HIT__", flush=True)
    sys.stdout.write(json.dumps({{"render": rendered, "entity_expanded": expanded}}) + "\n")
    sys.stdout.flush()
if __name__ == "__main__":
    _nyx_run()
"#
    );
    HarnessSource {
        source: body,
        filename: "harness.py".to_owned(),
        command: vec!["python3".to_owned(), "harness.py".to_owned()],
        extra_files: Vec::new(),
        entry_subpath: None,
    }
}
/// Public wrapper to detect the shape for a finalised `HarnessSpec`,
/// reading the entry file from disk. Exposed so test helpers can pin a
/// per-fixture shape without round-tripping through [`emit`].

View file

@ -421,6 +421,9 @@ pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
if spec.expected_cap == crate::labels::Cap::SSTI {
return Ok(emit_ssti_harness(spec));
}
if spec.expected_cap == crate::labels::Cap::XXE {
return Ok(emit_xxe_harness(spec));
}
let entry_source = read_entry_source(&spec.entry_file);
let shape = RubyShape::detect(spec, &entry_source);
@ -544,6 +547,71 @@ STDOUT.flush
}
}
/// Phase 05 — Track J.3 XXE harness for Ruby (REXML / Nokogiri).
///
/// Emits a single-file `harness.rb` that models the parser with regexes
/// rather than loading it. Brief lists a framework adapter for Ruby XXE
/// (`xxe_ruby`); the harness keeps the corpus end-to-end-exercisable
/// without bundling REXML / Nokogiri. `_spec` is unused.
///
/// The generated script:
///
/// 1. Reads `NYX_PAYLOAD` (an unset variable becomes the empty string)
///    and records every double-quoted `<!ENTITY name SYSTEM "uri">`
///    declaration.
/// 2. Replaces each declared `&name;` reference with the literal text
///    `<uri>`; the URI is never fetched and undeclared references are
///    left untouched.
/// 3. Appends a `ProbeKind::Xxe` JSON line to the file named by
///    `NYX_PROBE_PATH` — skipped when that variable is unset or empty.
/// 4. Prints the `__NYX_SINK_HIT__` sentinel followed by a JSON line
///    holding `render` and `entity_expanded`.
///
/// The returned [`HarnessSource`] runs `ruby harness.rb`.
pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource {
    // Spliced into the script at `{shim}`; presumably the source of
    // `__nyx_witness` — confirm against `probe_shim`.
    let shim = probe_shim();
    // Everything between the raw-string delimiters is Ruby source; `{{` /
    // `}}` are `format!` escapes for literal braces.
    let body = format!(
        r#"# Nyx dynamic harness — XXE REXML / Nokogiri (Phase 05 / Track J.3).
require 'json'
{shim}
def _nyx_libxml_parse(payload)
entities = {{}}
payload.scan(/<!ENTITY\s+(\w+)\s+SYSTEM\s+"([^"]+)"\s*>/) do |name, uri|
entities[name] = "<#{{uri}}>"
end
expanded = false
rendered = payload.gsub(/&(\w+);/) do
name = Regexp.last_match(1)
if entities.key?(name)
expanded = true
entities[name]
else
Regexp.last_match(0)
end
end
[rendered, expanded]
end
def _nyx_xxe_probe(rendered, expanded)
p = ENV['NYX_PROBE_PATH']
return if p.nil? || p.empty?
rec = {{
'sink_callee' => 'REXML::Document.new',
'args' => [{{ 'kind' => 'String', 'value' => rendered }}],
'captured_at_ns' => Process.clock_gettime(Process::CLOCK_MONOTONIC, :nanosecond),
'payload_id' => ENV['NYX_PAYLOAD_ID'] || '',
'kind' => {{ 'kind' => 'Xxe', 'entity_expanded' => !!expanded }},
'witness' => __nyx_witness('REXML::Document.new', [rendered]),
}}
File.open(p, 'a') {{ |f| f.write(rec.to_json + "\n") }}
end
payload = ENV['NYX_PAYLOAD'] || ''
rendered, expanded = _nyx_libxml_parse(payload)
_nyx_xxe_probe(rendered, expanded)
STDOUT.puts '__NYX_SINK_HIT__'
STDOUT.puts JSON.generate({{"render" => rendered, "entity_expanded" => expanded}})
STDOUT.flush
"#
    );
    HarnessSource {
        source: body,
        filename: "harness.rb".to_owned(),
        command: vec!["ruby".to_owned(), "harness.rb".to_owned()],
        extra_files: vec![],
        entry_subpath: None,
    }
}
fn generate_source(spec: &HarnessSpec, shape: RubyShape) -> String {
let entry_fn = &spec.entry_name;
let pre_call = build_pre_call(spec);

View file

@ -217,6 +217,28 @@ pub enum ProbePredicate {
/// signed-overflow concerns.
expected: u64,
},
/// Phase 05 (Track J.3): XXE entity-expansion predicate.
///
/// Fires when at least one drained probe carries
/// [`ProbeKind::Xxe`] with `entity_expanded` matching
/// `require_expanded`. The vuln payload ships an XML document
/// with a `<!ENTITY xxe SYSTEM "file:///…">` declaration; the
/// per-language harness's instrumented parser writes
/// `entity_expanded: true` once the entity body materialises
/// inside the parsed tree. The benign control disables
/// doctype / external-entity resolution so the parser refuses the
/// expansion and writes `entity_expanded: false`.
///
/// Cross-cutting in the same sense as
/// [`Self::DeserializeGadgetInvoked`] — evaluated across every
/// drained probe rather than against a single record.
XxeEntityExpanded {
/// `true` requires at least one [`ProbeKind::Xxe`] probe with
/// `entity_expanded == true` (the differential confirmation
/// path); `false` lets a payload that intentionally exercises
/// the parser-refusal benign control still confirm.
require_expanded: bool,
},
}
/// How we decide a sandbox run confirmed the sink fired.
@ -329,6 +351,20 @@ pub fn oracle_fired_with_stubs(
if !deserialize_cross_ok {
return false;
}
// Phase 05 (Track J.3): XXE entity-expansion cross-cutting
// predicates. Each `XxeEntityExpanded { require_expanded }`
// consults the captured probe channel for a
// [`ProbeKind::Xxe`] record whose `entity_expanded` flag
// matches.
let xxe_cross_ok = cross.iter().all(|p| match p {
ProbePredicate::XxeEntityExpanded { require_expanded } => {
probes_satisfy_xxe(probes, *require_expanded)
}
_ => true,
});
if !xxe_cross_ok {
return false;
}
// Phase 04 (Track J.2): SSTI render-equality cross-cutting
// predicates. Each `TemplateEvalEqual { expected }` consults
// the captured stdout body — see [`stdout_template_equals`].
@ -356,7 +392,7 @@ pub fn oracle_fired_with_stubs(
}
Oracle::SinkCrash { signals } => probes.iter().any(|p| match p.kind {
ProbeKind::Crash { signal } => signals.contains(signal),
ProbeKind::Normal | ProbeKind::Deserialize { .. } => false,
ProbeKind::Normal | ProbeKind::Deserialize { .. } | ProbeKind::Xxe { .. } => false,
}),
Oracle::OutputContains(needle) => {
let nb = needle.as_bytes();
@ -381,6 +417,7 @@ fn is_cross_cutting(pred: &ProbePredicate) -> bool {
ProbePredicate::StubEventMatches { .. }
| ProbePredicate::DeserializeGadgetInvoked { .. }
| ProbePredicate::TemplateEvalEqual { .. }
| ProbePredicate::XxeEntityExpanded { .. }
)
}
@ -397,6 +434,10 @@ fn cross_cutting_satisfied(pred: &ProbePredicate, stub_events: &[StubEvent]) ->
// outcome stdout* rather than stub events; evaluated separately
// via [`stdout_template_equals`] in [`oracle_fired_with_stubs`].
ProbePredicate::TemplateEvalEqual { .. } => true,
// XxeEntityExpanded is cross-cutting against the *probe log*
// rather than stub events; evaluated separately in
// [`probes_satisfy_xxe`] below.
ProbePredicate::XxeEntityExpanded { .. } => true,
_ => true,
}
}
@ -452,6 +493,15 @@ fn probes_satisfy_deserialize(probes: &[SinkProbe], require_invoked: bool) -> bo
})
}
/// Reports whether the drained probe log holds a [`ProbeKind::Xxe`]
/// record whose `entity_expanded` flag equals `require_expanded`.
///
/// Probes of any other kind never count, so an empty log — or one with
/// no XXE records at all — yields `false`.
fn probes_satisfy_xxe(probes: &[SinkProbe], require_expanded: bool) -> bool {
    probes.iter().any(|probe| {
        matches!(
            probe.kind,
            ProbeKind::Xxe { entity_expanded } if entity_expanded == require_expanded
        )
    })
}
/// Returns true when `probe` satisfies *every* predicate in `preds`.
/// An empty predicate slice satisfies vacuously — a payload that wants
/// "any probe at all" can ship an empty predicate set.
@ -483,7 +533,8 @@ fn probe_satisfies_one(probe: &SinkProbe, pred: &ProbePredicate) -> bool {
// [`oracle_fired_with_stubs`] handles them via the partition path.
ProbePredicate::StubEventMatches { .. }
| ProbePredicate::DeserializeGadgetInvoked { .. }
| ProbePredicate::TemplateEvalEqual { .. } => true,
| ProbePredicate::TemplateEvalEqual { .. }
| ProbePredicate::XxeEntityExpanded { .. } => true,
}
}
@ -505,7 +556,7 @@ fn contains_subslice(hay: &[u8], needle: &[u8]) -> bool {
pub fn probe_crash_signal(probe: &SinkProbe) -> Option<Signal> {
    // Only a crash probe carries a signal. Every other kind is listed
    // explicitly in a single arm (no `_` catch-all) so that adding a new
    // `ProbeKind` variant forces this function to be revisited, and so no
    // alternative is shadowed by an earlier overlapping arm.
    match probe.kind {
        ProbeKind::Crash { signal } => Some(signal),
        ProbeKind::Normal | ProbeKind::Deserialize { .. } | ProbeKind::Xxe { .. } => None,
    }
}

View file

@ -139,6 +139,23 @@ pub enum ProbeKind {
/// executed before the shim aborted the chain.
gadget_chain_invoked: bool,
},
/// Phase 05 (Track J.3) XXE-sink observation. Stamped by the
/// per-language XML harness shim when the instrumented parser
/// (`DocumentBuilder.parse`, `lxml.etree.XMLParser`,
/// `simplexml_load_string` under `libxml_disable_entity_loader(false)`,
/// `encoding/xml.Decoder` with `Strict: false`, Ruby `REXML` /
/// `Nokogiri::XML`) consumes a payload carrying a `<!ENTITY …>`
/// declaration that the parser then expands inside the document
/// body. `entity_expanded` is `true` when the entity body was
/// substituted into the parsed tree (the differential rule's
/// proof that XXE expansion actually fired) and `false` when the
/// parser refused the doctype / external resolution (the benign
/// `disallow-doctype-decl` control).
Xxe {
/// `true` iff the parser substituted the entity body into the
/// parsed XML output.
entity_expanded: bool,
},
}
impl Default for ProbeKind {

View file

@ -60,7 +60,7 @@ pub const NYX_VERSION: &str = env!("CARGO_PKG_VERSION");
/// [`crate::dynamic::corpus::CORPUS_VERSION`]; the compile-time assertion
/// below + the [`corpus_version_const_matches_corpus_module`] runtime test
/// jointly guard drift.
pub const CORPUS_VERSION: &str = "8";
pub const CORPUS_VERSION: &str = "9";
/// Compile-time guard that pins [`CORPUS_VERSION`] (this module) to the
/// textual form of [`crate::dynamic::corpus::CORPUS_VERSION`]. Bumping the

View file

@ -0,0 +1,25 @@
// Phase 05 (Track J.3) — Go XXE benign fixture.
//
// Same parser surface as `vuln.go` but `Strict` is left at the
// default `true`, so the doctype is rejected and no entity body is
// substituted.
package benign
import (
"bytes"
"encoding/xml"
)
// Data is the decode target: per its struct tags it maps to a <data>
// element whose character data is captured in Value.
type Data struct {
	XMLName xml.Name `xml:"data"`
	Value   string   `xml:",chardata"`
}
// Run decodes body into a Data value and returns it, or returns the
// decoder's error unchanged.
//
// Strict is never assigned here, so the decoder keeps whatever default
// encoding/xml gives it — that is the only difference from the vuln
// fixture.
func Run(body string) (*Data, error) {
	d := xml.NewDecoder(bytes.NewReader([]byte(body)))
	out := &Data{}
	if err := d.Decode(out); err != nil {
		return nil, err
	}
	return out, nil
}

View file

@ -0,0 +1,27 @@
// Phase 05 (Track J.3) — Go XXE vuln fixture.
//
// The function builds an `encoding/xml.Decoder` against the attacker
// payload with `Strict: false` so the doctype is parsed and any
// `<!ENTITY xxe SYSTEM "file:///…">` in the payload is resolved and
// substituted into element values.
package vuln
import (
"bytes"
"encoding/xml"
)
// Data is the decode target: per its struct tags it maps to a <data>
// element whose character data is captured in Value.
type Data struct {
	XMLName xml.Name `xml:"data"`
	Value   string   `xml:",chardata"`
}
// Run decodes body into a Data value with the decoder's Strict flag
// switched off, and returns the result or the decoder's error unchanged.
func Run(body string) (*Data, error) {
	d := xml.NewDecoder(bytes.NewReader([]byte(body)))
	// This assignment is the sink shape the fixture exists to exhibit.
	// NOTE(review): the encoding/xml docs describe Strict=false as leaving
	// unknown entities alone rather than resolving them; the fixture models
	// the brief's stated XXE behaviour — confirm that is intended.
	d.Strict = false
	out := &Data{}
	if err := d.Decode(out); err != nil {
		return nil, err
	}
	return out, nil
}

View file

@ -0,0 +1,18 @@
// Phase 05 (Track J.3) Java XXE benign fixture.
//
// Same parser surface as `vuln.java` but the factory is hardened with
// `disallow-doctype-decl`, so the same payload's `<!ENTITY>` block is
// rejected at parse time and no entity body is substituted.
import java.io.ByteArrayInputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
/**
 * Benign counterpart of the XXE fixture: parses XML with a factory that has
 * the {@code disallow-doctype-decl} feature switched on.
 */
public class Benign {
    /**
     * Parses the given bytes as an XML document.
     *
     * @param payload raw XML document bytes
     * @return the parsed DOM document
     * @throws Exception whatever the factory, builder or parser raises
     */
    public static Document run(byte[] payload) throws Exception {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        // Hardening step: must happen before the builder is created so the
        // builder inherits it. This line is the only difference from Vuln.
        factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        DocumentBuilder builder = factory.newDocumentBuilder();
        return builder.parse(new ByteArrayInputStream(payload));
    }
}

View file

@ -0,0 +1,19 @@
// Phase 05 (Track J.3) Java XXE vuln fixture.
//
// The function feeds attacker bytes to a stock `DocumentBuilderFactory`
// without setting `disallow-doctype-decl` / `XMLConstants.FEATURE_
// SECURE_PROCESSING`, so any `<!ENTITY xxe SYSTEM "file:///…">`
// declaration in the payload is resolved and its body substituted
// into the parsed tree.
import java.io.ByteArrayInputStream;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import org.w3c.dom.Document;
/**
 * Vulnerable XXE fixture: parses XML with a stock factory on which no
 * hardening feature has been set.
 */
public class Vuln {
    /**
     * Parses the given bytes as an XML document.
     *
     * @param payload raw XML document bytes
     * @return the parsed DOM document
     * @throws Exception whatever the factory, builder or parser raises
     */
    public static Document run(byte[] payload) throws Exception {
        // Deliberately left at factory defaults — no setFeature call.
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        DocumentBuilder builder = factory.newDocumentBuilder();
        return builder.parse(new ByteArrayInputStream(payload));
    }
}

View file

@ -0,0 +1,10 @@
<?php
// Phase 05 (Track J.3) — PHP XXE benign fixture.
//
// Same parser surface as `vuln.php` but the entity loader stays
// disabled and the LIBXML_NOENT flag is omitted, so the same payload's
// `<!ENTITY>` block is rejected and no entity body is substituted.
/**
 * Parses $body with simplexml_load_string() after switching the libxml
 * entity loader off, using the function's default class and flags.
 *
 * @param string $body XML document text
 * @return mixed whatever simplexml_load_string() returns for $body
 */
function run(string $body) {
    // NOTE(review): the PHP manual marks libxml_disable_entity_loader() as
    // deprecated from PHP 8.0 — confirm the PHP version this fixture targets.
    libxml_disable_entity_loader(true);
    return simplexml_load_string($body);
}

View file

@ -0,0 +1,11 @@
<?php
// Phase 05 (Track J.3) — PHP XXE vuln fixture.
//
// The function pulls XML off the request and feeds it to
// `simplexml_load_string` after re-enabling the libxml entity loader
// — so any `<!ENTITY xxe SYSTEM "file:///…">` in the payload is
// resolved and its body substituted into the parsed document.
/**
 * Parses $body with simplexml_load_string() after re-enabling the libxml
 * entity loader and passing the LIBXML_NOENT flag.
 *
 * @param string $body XML document text
 * @return mixed whatever simplexml_load_string() returns for $body
 */
function run(string $body) {
    // Deliberately insecure: both the loader toggle and LIBXML_NOENT below
    // are what distinguish this fixture from the benign one.
    // NOTE(review): the PHP manual marks libxml_disable_entity_loader() as
    // deprecated from PHP 8.0 — confirm the PHP version this fixture targets.
    libxml_disable_entity_loader(false);
    return simplexml_load_string($body, "SimpleXMLElement", LIBXML_NOENT);
}

View file

@ -0,0 +1,12 @@
"""Phase 05 (Track J.3) — Python XXE benign fixture.
Same parser surface as `vuln.py` but the parser is configured with
`resolve_entities=False` and `no_network=True`, so the same payload's
`<!ENTITY>` block is rejected and no entity body is substituted.
"""
from lxml import etree
def run(body: bytes):
    """Parse ``body`` with an lxml parser built with ``resolve_entities=False``
    and ``no_network=True``, returning whatever ``etree.fromstring`` yields."""
    # The two keyword arguments are the only difference from the vuln fixture.
    parser = etree.XMLParser(resolve_entities=False, no_network=True)
    return etree.fromstring(body, parser=parser)

View file

@ -0,0 +1,13 @@
"""Phase 05 (Track J.3) — Python XXE vuln fixture.
The function pulls XML bytes off the request and feeds them straight
to `lxml.etree.XMLParser(resolve_entities=True)`, so any
`<!ENTITY xxe SYSTEM "file:///…">` in the payload is resolved and its
body substituted into the parsed tree.
"""
from lxml import etree
def run(body: bytes):
    """Parse ``body`` with an lxml parser built with ``resolve_entities=True``,
    returning whatever ``etree.fromstring`` yields."""
    # Deliberately insecure: entity resolution is switched on explicitly.
    parser = etree.XMLParser(resolve_entities=True)
    return etree.fromstring(body, parser=parser)

View file

@ -0,0 +1,11 @@
# Phase 05 (Track J.3) — Ruby XXE benign fixture.
#
# Same parser surface as `vuln.rb` but the document is built under
# `REXML::Document::entity_expansion_limit = 0`, so the same payload's
# `<!ENTITY>` block triggers no expansion.
require 'rexml/document'
# Builds a REXML document from +body+ after setting the entity expansion
# limit to zero, and returns the document.
def run(body)
  # Assigned on the REXML::Document class itself, before the document is
  # constructed; this line is the only difference from the vuln fixture.
  REXML::Document.entity_expansion_limit = 0
  REXML::Document.new(body)
end

View file

@ -0,0 +1,11 @@
# Phase 05 (Track J.3) — Ruby XXE vuln fixture.
#
# The function feeds attacker XML straight to `REXML::Document.new`
# without disabling entity expansion, so any `<!ENTITY xxe SYSTEM
# "file:///…">` in the payload is resolved and its body substituted
# into the parsed document.
require 'rexml/document'
# Builds a REXML document straight from +body+ and returns it. No entity
# expansion setting is touched first — that omission is the point of the
# fixture.
def run(body)
  REXML::Document.new(body)
end

294
tests/xxe_corpus.rs Normal file
View file

@ -0,0 +1,294 @@
//! Phase 05 (Track J.3) — XXE corpus acceptance.
//!
//! Asserts the new cap end-to-end: corpus slices register per-engine
//! vuln/benign pairs for Java / Python / PHP / Ruby / Go, the
//! lang-aware resolver pairs them inside the correct slice, the
//! per-language harness emitters splice in the synthetic XML parser +
//! entity-expansion probe + sink-hit sentinel, and the framework
//! adapters fire on the canonical sink call.
//!
//! `cargo nextest run --features dynamic --test xxe_corpus`.
#![cfg(feature = "dynamic")]
use nyx_scanner::dynamic::corpus::{
audit_marker_collisions, benign_payload_for_lang, payloads_for_lang,
resolve_benign_control_lang, Oracle,
};
use nyx_scanner::dynamic::framework::registry::adapters_for;
use nyx_scanner::dynamic::lang;
use nyx_scanner::dynamic::oracle::ProbePredicate;
use nyx_scanner::dynamic::probe::ProbeKind;
use nyx_scanner::dynamic::spec::{EntryKind, HarnessSpec, PayloadSlot};
use nyx_scanner::labels::Cap;
use nyx_scanner::summary::FuncSummary;
use nyx_scanner::symbol::Lang;
const LANGS: &[Lang] = &[Lang::Java, Lang::Python, Lang::Php, Lang::Ruby, Lang::Go];
/// Builds a minimal XXE [`HarnessSpec`] for `lang`.
///
/// `entry_file` doubles as the sink file, the payload goes in the first
/// parameter, and every identifier-like field is a fixed placeholder.
fn make_spec(lang: Lang, entry_file: &str, entry_name: &str) -> HarnessSpec {
    use nyx_scanner::dynamic::spec::SpecDerivationStrategy;

    // The same placeholder serves as both the finding id and the spec hash.
    const PLACEHOLDER_ID: &str = "phase05test0001";

    HarnessSpec {
        finding_id: PLACEHOLDER_ID.into(),
        entry_file: entry_file.into(),
        entry_name: entry_name.into(),
        entry_kind: EntryKind::Function,
        lang,
        toolchain_id: "phase05".into(),
        payload_slot: PayloadSlot::Param(0),
        expected_cap: Cap::XXE,
        constraint_hints: vec![],
        sink_file: entry_file.into(),
        sink_line: 1,
        spec_hash: PLACEHOLDER_ID.into(),
        derivation: SpecDerivationStrategy::FromFlowSteps,
        stubs_required: vec![],
        framework: None,
    }
}
/// Every supported language must register a non-empty XXE slice holding at
/// least one vulnerable payload and at least one benign control.
#[test]
fn corpus_registers_xxe_for_every_supported_lang() {
    for &lang in LANGS {
        let payloads = payloads_for_lang(Cap::XXE, lang);
        assert!(!payloads.is_empty(), "XXE has no payloads for {lang:?}");

        let vuln_count = payloads.iter().filter(|p| !p.is_benign).count();
        let benign_count = payloads.iter().filter(|p| p.is_benign).count();
        assert!(vuln_count > 0, "{lang:?} XXE missing vuln payload");
        assert!(benign_count > 0, "{lang:?} XXE missing benign control");
    }
}
/// Phase 05 only fills Java / Python / PHP / Ruby / Go, so the XXE slices
/// for Rust / C / Cpp / JS / TS must still be empty.
#[test]
fn xxe_unsupported_caps_unchanged_for_other_langs() {
    let untouched = [
        Lang::Rust,
        Lang::C,
        Lang::Cpp,
        Lang::JavaScript,
        Lang::TypeScript,
    ];
    for lang in untouched {
        let payloads = payloads_for_lang(Cap::XXE, lang);
        assert!(
            payloads.is_empty(),
            "unexpected XXE payloads registered for {lang:?}",
        );
    }
}
/// The control resolved from a language's vulnerable payload must be
/// benign and carry the same label as that language's direct benign
/// lookup.
#[test]
fn benign_control_resolves_within_lang_slice() {
    for &lang in LANGS {
        let payloads = payloads_for_lang(Cap::XXE, lang);
        let vuln = payloads.iter().find(|p| !p.is_benign).unwrap();

        let paired = resolve_benign_control_lang(vuln, Cap::XXE, lang).expect("paired control");
        assert!(paired.is_benign);

        let looked_up = benign_payload_for_lang(Cap::XXE, lang).unwrap();
        assert_eq!(looked_up.label, paired.label);
    }
}
/// The first vulnerable XXE payload of every language in `LANGS` must use
/// a `SinkProbe` oracle whose predicates include
/// `XxeEntityExpanded { require_expanded: true }`.
#[test]
fn payload_oracle_carries_xxe_entity_expanded_predicate() {
    for &lang in LANGS {
        let slice = payloads_for_lang(Cap::XXE, lang);
        // Name the language on failure instead of a context-free `unwrap()`.
        let vuln = slice
            .iter()
            .find(|p| !p.is_benign)
            .unwrap_or_else(|| panic!("{lang:?} XXE missing vuln payload"));
        match &vuln.oracle {
            Oracle::SinkProbe { predicates } => {
                assert!(
                    predicates.iter().any(|p| matches!(
                        p,
                        ProbePredicate::XxeEntityExpanded { require_expanded: true }
                    )),
                    "{lang:?} vuln payload missing XxeEntityExpanded{{require_expanded:true}}",
                );
            }
            other => panic!("expected SinkProbe oracle for {lang:?}, got {other:?}"),
        }
    }
}
/// Pins the byte-level difference between the vulnerable payload and its
/// benign control for every language in `LANGS`.
#[test]
fn vuln_payload_bytes_contain_doctype_entity_declaration() {
    // The whole differential rule rests on the vuln payload carrying
    // an `<!ENTITY … SYSTEM "…">` decl and the benign control NOT
    // carrying one — pin both invariants so a future corpus tweak
    // does not silently break the oracle.
    for &lang in LANGS {
        let slice = payloads_for_lang(Cap::XXE, lang);
        // Every lookup and decode names the language on failure, so a
        // broken slice is identifiable without re-running under a debugger.
        let vuln = slice
            .iter()
            .find(|p| !p.is_benign)
            .unwrap_or_else(|| panic!("{lang:?} XXE missing vuln payload"));
        let benign = slice
            .iter()
            .find(|p| p.is_benign)
            .unwrap_or_else(|| panic!("{lang:?} XXE missing benign control"));
        let vuln_text = std::str::from_utf8(vuln.bytes)
            .unwrap_or_else(|e| panic!("{lang:?} vuln payload is not valid UTF-8: {e}"));
        let benign_text = std::str::from_utf8(benign.bytes)
            .unwrap_or_else(|e| panic!("{lang:?} benign control is not valid UTF-8: {e}"));
        assert!(
            vuln_text.contains("<!ENTITY") && vuln_text.contains("SYSTEM"),
            "{lang:?} vuln payload must declare a SYSTEM entity",
        );
        assert!(
            !benign_text.contains("<!ENTITY"),
            "{lang:?} benign control must not declare an entity",
        );
    }
}
/// The marker-collision audit must come back empty.
#[test]
fn marker_collisions_clean_with_phase_05_additions() {
    let collisions = audit_marker_collisions();
    assert!(collisions.is_empty());
}
/// `ProbeKind::Xxe` must survive a JSON round trip, and its JSON form must
/// mention both the variant name and the `entity_expanded` field.
#[test]
fn probe_kind_xxe_serdes() {
    let probe = ProbeKind::Xxe {
        entity_expanded: true,
    };

    let encoded = serde_json::to_string(&probe).unwrap();
    assert!(encoded.contains("Xxe"));
    assert!(encoded.contains("entity_expanded"));

    let decoded: ProbeKind = serde_json::from_str(&encoded).unwrap();
    assert_eq!(decoded, probe);
}
/// Emits a harness for each language's XXE fixture spec and checks the
/// generated source for the probe field, the per-language sink callee
/// marker, the sink-hit sentinel, and the entity scanner text.
#[test]
fn lang_emitter_dispatches_to_xxe_harness() {
    // Per-lang `sink_callee_marker` pins which parser-construction
    // string the harness names in its probe record — the
    // `DocumentBuilder.parse` / `lxml.etree.XMLParser` /
    // `simplexml_load_string` / `REXML::Document.new` /
    // `xml.Decoder.Decode` boundary the brief calls out.
    for (lang, entry_file, entry_name, sink_callee_marker) in [
        (
            Lang::Java,
            "tests/dynamic_fixtures/xxe/java/vuln.java",
            "run",
            "DocumentBuilder.parse",
        ),
        (
            Lang::Python,
            "tests/dynamic_fixtures/xxe/python/vuln.py",
            "run",
            "lxml.etree.XMLParser.parse",
        ),
        (
            Lang::Php,
            "tests/dynamic_fixtures/xxe/php/vuln.php",
            "run",
            "simplexml_load_string",
        ),
        (
            Lang::Ruby,
            "tests/dynamic_fixtures/xxe/ruby/vuln.rb",
            "run",
            "REXML::Document.new",
        ),
        (
            Lang::Go,
            "tests/dynamic_fixtures/xxe/go/vuln.go",
            "Run",
            "xml.Decoder.Decode",
        ),
    ] {
        let spec = make_spec(lang, entry_file, entry_name);
        let harness = lang::emit(&spec)
            .unwrap_or_else(|e| panic!("emit failed for {lang:?}: {e:?}"));
        assert!(
            harness.source.contains("entity_expanded"),
            "{lang:?} xxe harness must carry the entity_expanded probe field",
        );
        assert!(
            harness.source.contains(sink_callee_marker),
            "{lang:?} xxe harness must name {sink_callee_marker:?} as the parser sink callee",
        );
        assert!(
            harness.source.contains("__NYX_SINK_HIT__"),
            "{lang:?} xxe harness must emit the sink-hit sentinel",
        );
        // Any source containing "<!ENTITY" also contains "ENTITY", so a
        // separate `contains("<!ENTITY") ||` disjunct adds nothing; the
        // single substring test below is exactly as strict.
        assert!(
            harness.source.contains("ENTITY"),
            "{lang:?} xxe harness must include the entity-detection scanner",
        );
    }
}
/// Parses each language's XXE fixture with tree-sitter, builds a
/// `FuncSummary` with a single bare callee, and checks that
/// `detect_binding` returns an `EntryKind::Function` binding with a
/// non-empty adapter name.
#[test]
fn framework_adapters_detect_xxe_sink() {
    // Each lang registers its J.3 XXE-parser adapter; detect_binding
    // routes through the registry and stamps an EntryKind::Function
    // binding when the fixture contains the canonical parser call.
    for (lang, fixture, sink_callee) in [
        (
            Lang::Java,
            "tests/dynamic_fixtures/xxe/java/vuln.java",
            "parse",
        ),
        (
            Lang::Python,
            "tests/dynamic_fixtures/xxe/python/vuln.py",
            "fromstring",
        ),
        (
            Lang::Php,
            "tests/dynamic_fixtures/xxe/php/vuln.php",
            "simplexml_load_string",
        ),
        (
            Lang::Ruby,
            "tests/dynamic_fixtures/xxe/ruby/vuln.rb",
            "new",
        ),
        (
            Lang::Go,
            "tests/dynamic_fixtures/xxe/go/vuln.go",
            "NewDecoder",
        ),
    ] {
        // Setup failures name the language and fixture so a missing file or
        // grammar mismatch is attributable without re-running the test.
        let bytes = std::fs::read(fixture)
            .unwrap_or_else(|e| panic!("{lang:?} fixture {fixture:?} unreadable: {e}"));
        let ts_lang = ts_language_for(lang);
        let mut parser = tree_sitter::Parser::new();
        parser
            .set_language(&ts_lang)
            .unwrap_or_else(|e| panic!("{lang:?} grammar rejected by tree-sitter: {e:?}"));
        let tree = parser
            .parse(&bytes, None)
            .unwrap_or_else(|| panic!("{lang:?} fixture {fixture:?} produced no parse tree"));
        // NOTE(review): the summary name is "run" for every language, while
        // `lang_emitter_dispatches_to_xxe_harness` uses "Run" for the Go
        // fixture — confirm `detect_binding` does not depend on this name.
        let mut summary = FuncSummary {
            name: "run".into(),
            file_path: fixture.to_owned(),
            lang: slug(lang).into(),
            ..Default::default()
        };
        summary
            .callees
            .push(nyx_scanner::summary::CalleeSite::bare(sink_callee));
        let registry_slice = adapters_for(lang);
        assert!(!registry_slice.is_empty(), "{lang:?} adapter slice empty");
        let binding = nyx_scanner::dynamic::framework::detect_binding(
            &summary,
            tree.root_node(),
            &bytes,
            lang,
        );
        let b = binding
            .unwrap_or_else(|| panic!("{lang:?} adapter must detect the XXE fixture"));
        assert_eq!(
            b.kind,
            EntryKind::Function,
            "{lang:?} XXE binding must be an EntryKind::Function",
        );
        assert!(
            !b.adapter.is_empty(),
            "{lang:?} XXE binding must name its adapter",
        );
    }
}
/// Returns the tree-sitter `Language` for one of the five languages these
/// tests cover.
///
/// # Panics
///
/// Panics for any other `Lang` variant.
fn ts_language_for(lang: Lang) -> tree_sitter::Language {
    // Pick the grammar constant first, then convert once at the end.
    let grammar = match lang {
        Lang::Java => tree_sitter_java::LANGUAGE,
        Lang::Python => tree_sitter_python::LANGUAGE,
        Lang::Php => tree_sitter_php::LANGUAGE_PHP,
        Lang::Ruby => tree_sitter_ruby::LANGUAGE,
        Lang::Go => tree_sitter_go::LANGUAGE,
        other => panic!("unsupported test lang {other:?}"),
    };
    tree_sitter::Language::from(grammar)
}
/// Maps a language to the lowercase slug stored in `FuncSummary::lang`;
/// languages outside the five covered here map to `"other"`.
fn slug(lang: Lang) -> &'static str {
    let known = match lang {
        Lang::Java => Some("java"),
        Lang::Python => Some("python"),
        Lang::Php => Some("php"),
        Lang::Ruby => Some("ruby"),
        Lang::Go => Some("go"),
        _ => None,
    };
    known.unwrap_or("other")
}