From 4de925c3ef3d1cfb0168ca350fe99a74d3f0e698 Mon Sep 17 00:00:00 2001 From: pitboss Date: Sun, 17 May 2026 20:39:12 -0500 Subject: [PATCH] =?UTF-8?q?[pitboss]=20phase=2005:=20Track=20J.3=20+=20Tra?= =?UTF-8?q?ck=20L.3=20=E2=80=94=20`XXE`=20corpus=20+=20DocumentBuilder=20/?= =?UTF-8?q?=20lxml=20/=20libxml=20/=20SimpleXML=20adapters?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/dynamic/corpus.rs | 4 +- src/dynamic/corpus/registry.rs | 54 +++- src/dynamic/corpus/xxe/go.rs | 66 +++++ src/dynamic/corpus/xxe/java.rs | 67 +++++ src/dynamic/corpus/xxe/mod.rs | 24 ++ src/dynamic/corpus/xxe/php.rs | 66 +++++ src/dynamic/corpus/xxe/python.rs | 66 +++++ src/dynamic/corpus/xxe/ruby.rs | 65 ++++ src/dynamic/framework/adapters/mod.rs | 10 + src/dynamic/framework/adapters/xxe_go.rs | 113 +++++++ src/dynamic/framework/adapters/xxe_java.rs | 139 +++++++++ src/dynamic/framework/adapters/xxe_php.rs | 120 ++++++++ src/dynamic/framework/adapters/xxe_python.rs | 120 ++++++++ src/dynamic/framework/adapters/xxe_ruby.rs | 109 +++++++ src/dynamic/framework/mod.rs | 31 +- src/dynamic/framework/registry.rs | 6 +- src/dynamic/lang/go.rs | 92 ++++++ src/dynamic/lang/java.rs | 108 +++++++ src/dynamic/lang/php.rs | 67 +++++ src/dynamic/lang/python.rs | 86 ++++++ src/dynamic/lang/ruby.rs | 68 +++++ src/dynamic/oracle.rs | 57 +++- src/dynamic/probe.rs | 17 ++ src/dynamic/telemetry.rs | 2 +- tests/dynamic_fixtures/xxe/go/benign.go | 25 ++ tests/dynamic_fixtures/xxe/go/vuln.go | 27 ++ tests/dynamic_fixtures/xxe/java/benign.java | 18 ++ tests/dynamic_fixtures/xxe/java/vuln.java | 19 ++ tests/dynamic_fixtures/xxe/php/benign.php | 10 + tests/dynamic_fixtures/xxe/php/vuln.php | 11 + tests/dynamic_fixtures/xxe/python/benign.py | 12 + tests/dynamic_fixtures/xxe/python/vuln.py | 13 + tests/dynamic_fixtures/xxe/ruby/benign.rb | 11 + tests/dynamic_fixtures/xxe/ruby/vuln.rb | 11 + tests/xxe_corpus.rs | 294 +++++++++++++++++++ 35 files changed, 1985 
insertions(+), 23 deletions(-) create mode 100644 src/dynamic/corpus/xxe/go.rs create mode 100644 src/dynamic/corpus/xxe/java.rs create mode 100644 src/dynamic/corpus/xxe/mod.rs create mode 100644 src/dynamic/corpus/xxe/php.rs create mode 100644 src/dynamic/corpus/xxe/python.rs create mode 100644 src/dynamic/corpus/xxe/ruby.rs create mode 100644 src/dynamic/framework/adapters/xxe_go.rs create mode 100644 src/dynamic/framework/adapters/xxe_java.rs create mode 100644 src/dynamic/framework/adapters/xxe_php.rs create mode 100644 src/dynamic/framework/adapters/xxe_python.rs create mode 100644 src/dynamic/framework/adapters/xxe_ruby.rs create mode 100644 tests/dynamic_fixtures/xxe/go/benign.go create mode 100644 tests/dynamic_fixtures/xxe/go/vuln.go create mode 100644 tests/dynamic_fixtures/xxe/java/benign.java create mode 100644 tests/dynamic_fixtures/xxe/java/vuln.java create mode 100644 tests/dynamic_fixtures/xxe/php/benign.php create mode 100644 tests/dynamic_fixtures/xxe/php/vuln.php create mode 100644 tests/dynamic_fixtures/xxe/python/benign.py create mode 100644 tests/dynamic_fixtures/xxe/python/vuln.py create mode 100644 tests/dynamic_fixtures/xxe/ruby/benign.rb create mode 100644 tests/dynamic_fixtures/xxe/ruby/vuln.rb create mode 100644 tests/xxe_corpus.rs diff --git a/src/dynamic/corpus.rs b/src/dynamic/corpus.rs index 6ac257f3..e643c463 100644 --- a/src/dynamic/corpus.rs +++ b/src/dynamic/corpus.rs @@ -55,6 +55,7 @@ mod sqli; mod ssrf; mod ssti; mod xss; +mod xxe; pub use registry::{ audit_marker_collisions, benign_payload_for, benign_payload_for_lang, materialise_bytes, @@ -86,7 +87,8 @@ pub use crate::dynamic::oracle::Oracle; /// | 6 | 2026-05-17 | Phase 02 / Track J.0: `(Cap, Lang)` registry refactor; `no_benign_control_rationale` field; compile-time provenance audit | /// | 7 | 2026-05-17 | Phase 03 / Track J.1: `DESERIALIZE` cap lit for Java / Python / PHP / Ruby; `ProbeKind::Deserialize` + `ProbePredicate::DeserializeGadgetInvoked` | /// | 8 | 
2026-05-17 | Phase 04 / Track J.2: `SSTI` cap lit for Jinja2 / ERB / Twig / Thymeleaf / Handlebars; `ProbePredicate::TemplateEvalEqual` | -pub const CORPUS_VERSION: u32 = 8; +/// | 9 | 2026-05-17 | Phase 05 / Track J.3: `XXE` cap lit for Java / Python / PHP / Ruby / Go; `ProbeKind::Xxe` + `ProbePredicate::XxeEntityExpanded` | +pub const CORPUS_VERSION: u32 = 9; /// Where a payload originated. #[derive(Debug, Clone, Copy, PartialEq, Eq)] diff --git a/src/dynamic/corpus/registry.rs b/src/dynamic/corpus/registry.rs index 6e379a65..d603ff41 100644 --- a/src/dynamic/corpus/registry.rs +++ b/src/dynamic/corpus/registry.rs @@ -23,7 +23,7 @@ use std::collections::HashMap; use std::sync::OnceLock; -use super::{cmdi, deserialize, fmt_string, path_trav, sqli, ssrf, ssti, xss}; +use super::{cmdi, deserialize, fmt_string, path_trav, sqli, ssrf, ssti, xss, xxe}; use super::{CapCorpus, CuratedPayload, Oracle}; use crate::dynamic::oracle::ProbePredicate; use crate::labels::Cap; @@ -44,7 +44,6 @@ pub const CORPUS_UNSUPPORTED_LANG_NEUTRAL: u32 = Cap::ENV_VAR.bits() | Cap::XPATH_INJECTION.bits() | Cap::HEADER_INJECTION.bits() | Cap::OPEN_REDIRECT.bits() - | Cap::XXE.bits() | Cap::PROTOTYPE_POLLUTION.bits(); /// Flat `(Cap, Lang, slice)` table. A single cap can carry per-language @@ -65,6 +64,11 @@ const ENTRIES: &[(Cap, Lang, &[CuratedPayload])] = &[ (Cap::SSTI, Lang::Php, ssti::php_twig::PAYLOADS), (Cap::SSTI, Lang::Java, ssti::java_thymeleaf::PAYLOADS), (Cap::SSTI, Lang::JavaScript, ssti::js_handlebars::PAYLOADS), + (Cap::XXE, Lang::Java, xxe::java::PAYLOADS), + (Cap::XXE, Lang::Python, xxe::python::PAYLOADS), + (Cap::XXE, Lang::Php, xxe::php::PAYLOADS), + (Cap::XXE, Lang::Ruby, xxe::ruby::PAYLOADS), + (Cap::XXE, Lang::Go, xxe::go::PAYLOADS), ]; /// Reserved for per-cap oracle defaults. 
Empty in Phase 02; populated by @@ -273,6 +277,7 @@ mod tests { assert!(!payloads_for(Cap::FMT_STRING).is_empty()); assert!(!payloads_for(Cap::DESERIALIZE).is_empty()); assert!(!payloads_for(Cap::SSTI).is_empty()); + assert!(!payloads_for(Cap::XXE).is_empty()); } #[test] @@ -289,7 +294,6 @@ mod tests { Cap::XPATH_INJECTION, Cap::HEADER_INJECTION, Cap::OPEN_REDIRECT, - Cap::XXE, Cap::PROTOTYPE_POLLUTION, ]; for cap in unsupported { @@ -320,6 +324,7 @@ mod tests { Cap::FMT_STRING, Cap::DESERIALIZE, Cap::SSTI, + Cap::XXE, ] { let has_vuln = payloads_for(cap).iter().any(|p| !p.is_benign); assert!(has_vuln, "{cap:?} must have at least one vuln payload"); @@ -368,6 +373,7 @@ mod tests { Cap::FMT_STRING, Cap::DESERIALIZE, Cap::SSTI, + Cap::XXE, ]; for cap in caps { for p in payloads_for(cap) { @@ -391,6 +397,7 @@ mod tests { Cap::FMT_STRING, Cap::DESERIALIZE, Cap::SSTI, + Cap::XXE, ]; for cap in caps { for p in payloads_for(cap) { @@ -501,6 +508,7 @@ mod tests { Cap::FMT_STRING, Cap::DESERIALIZE, Cap::SSTI, + Cap::XXE, ]; for cap in caps { for p in payloads_for(cap).iter().filter(|p| p.is_benign) { @@ -629,6 +637,46 @@ mod tests { } } + #[test] + fn xxe_has_per_lang_slices_for_phase_05() { + // Phase 05 (Track J.3) acceptance: XXE registers payloads in + // Java / Python / PHP / Ruby / Go and the lang-aware lookup + // never returns empty for any of them. + for lang in [Lang::Java, Lang::Python, Lang::Php, Lang::Ruby, Lang::Go] { + assert!( + !payloads_for_lang(Cap::XXE, lang).is_empty(), + "XXE must have at least one payload for {lang:?}", + ); + } + // Rust / C / Cpp / JS / TS not yet covered. 
+ for lang in [ + Lang::Rust, + Lang::C, + Lang::Cpp, + Lang::JavaScript, + Lang::TypeScript, + ] { + assert!( + payloads_for_lang(Cap::XXE, lang).is_empty(), + "XXE has unexpected payloads for {lang:?}", + ); + } + } + + #[test] + fn xxe_payloads_pair_benign_controls_per_lang() { + for lang in [Lang::Java, Lang::Python, Lang::Php, Lang::Ruby, Lang::Go] { + let slice = payloads_for_lang(Cap::XXE, lang); + let vuln = slice + .iter() + .find(|p| !p.is_benign) + .expect("each lang must have an XXE vuln payload"); + let resolved = super::resolve_benign_control_lang(vuln, Cap::XXE, lang) + .expect("lang-aware benign control must resolve"); + assert!(resolved.is_benign); + } + } + #[test] fn deserialize_payloads_pair_benign_controls_per_lang() { // The lang-aware resolver must find the paired benign control diff --git a/src/dynamic/corpus/xxe/go.rs b/src/dynamic/corpus/xxe/go.rs new file mode 100644 index 00000000..da2201aa --- /dev/null +++ b/src/dynamic/corpus/xxe/go.rs @@ -0,0 +1,66 @@ +//! Go `Cap::XXE` payloads — `encoding/xml.Decoder` with `Strict: false`. +//! +//! Vuln payload: an XML document declaring an external entity that +//! the harness's instrumented `xml.Decoder` (running non-strict so +//! the doctype is parsed at all) expands inside ``; the shim +//! writes `ProbeKind::Xxe { entity_expanded: true }` once it sees the +//! entity body substitute into the decoded element value. +//! +//! Benign control: a well-formed XML document with no doctype, so the +//! decoder has no entity to resolve and the shim writes +//! `entity_expanded: false`. 
+ +use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef}; +use crate::dynamic::oracle::ProbePredicate; + +pub const PAYLOADS: &[CuratedPayload] = &[ + CuratedPayload { + bytes: br#" + +]> +&xxe;"#, + label: "xxe-go-doctype-entity", + oracle: Oracle::SinkProbe { + predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + }, + is_benign: false, + provenance: PayloadProvenance::Curated, + since_corpus_version: 9, + deprecated_at_corpus_version: None, + fixture_paths: &[ + "tests/dynamic_fixtures/xxe/go/vuln.go", + ], + oob_nonce_slot: false, + probe_predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + benign_control: Some(PayloadRef { + label: "xxe-go-benign", + }), + no_benign_control_rationale: None, + }, + CuratedPayload { + bytes: br#" +hello"#, + label: "xxe-go-benign", + oracle: Oracle::SinkProbe { + predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + }, + is_benign: true, + provenance: PayloadProvenance::Curated, + since_corpus_version: 9, + deprecated_at_corpus_version: None, + fixture_paths: &[ + "tests/dynamic_fixtures/xxe/go/benign.go", + ], + oob_nonce_slot: false, + probe_predicates: &[], + benign_control: None, + no_benign_control_rationale: None, + }, +]; diff --git a/src/dynamic/corpus/xxe/java.rs b/src/dynamic/corpus/xxe/java.rs new file mode 100644 index 00000000..a04374e0 --- /dev/null +++ b/src/dynamic/corpus/xxe/java.rs @@ -0,0 +1,67 @@ +//! Java `Cap::XXE` payloads — `DocumentBuilderFactory` / `SAXParser`. +//! +//! Vuln payload: an XML document declaring an external entity that +//! the harness's instrumented `DocumentBuilder.parse` resolves and +//! substitutes inside `` — the parser writes a +//! `ProbeKind::Xxe { entity_expanded: true }` record once it sees the +//! entity body materialise. +//! +//! Benign control: a well-formed XML document with no doctype +//! declaration so the parser has no entity to resolve. The harness's +//! 
instrumented parser writes `entity_expanded: false`, the oracle +//! does not fire, and the differential rule (§4.1) stays clean. + +use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef}; +use crate::dynamic::oracle::ProbePredicate; + +pub const PAYLOADS: &[CuratedPayload] = &[ + CuratedPayload { + bytes: br#" + +]> +&xxe;"#, + label: "xxe-java-doctype-entity", + oracle: Oracle::SinkProbe { + predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + }, + is_benign: false, + provenance: PayloadProvenance::Curated, + since_corpus_version: 9, + deprecated_at_corpus_version: None, + fixture_paths: &[ + "tests/dynamic_fixtures/xxe/java/vuln.java", + ], + oob_nonce_slot: false, + probe_predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + benign_control: Some(PayloadRef { + label: "xxe-java-benign", + }), + no_benign_control_rationale: None, + }, + CuratedPayload { + bytes: br#" +hello"#, + label: "xxe-java-benign", + oracle: Oracle::SinkProbe { + predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + }, + is_benign: true, + provenance: PayloadProvenance::Curated, + since_corpus_version: 9, + deprecated_at_corpus_version: None, + fixture_paths: &[ + "tests/dynamic_fixtures/xxe/java/benign.java", + ], + oob_nonce_slot: false, + probe_predicates: &[], + benign_control: None, + no_benign_control_rationale: None, + }, +]; diff --git a/src/dynamic/corpus/xxe/mod.rs b/src/dynamic/corpus/xxe/mod.rs new file mode 100644 index 00000000..813d720e --- /dev/null +++ b/src/dynamic/corpus/xxe/mod.rs @@ -0,0 +1,24 @@ +//! XML External Entity expansion (`Cap::XXE`) per-language payload slices. +//! +//! Phase 05 (Track J.3) carves XXE across the five most-common XML +//! parser stacks: Java (`DocumentBuilderFactory`), Python +//! (`lxml.etree.XMLParser`), PHP (`simplexml_load_string` under +//! `libxml_disable_entity_loader(false)`), Ruby (REXML / Nokogiri), and +//! 
Go (`encoding/xml.Decoder`). Every vuln payload ships an XML +//! document declaring an external entity (``) +//! that the engine expands inside an element body. The paired benign +//! control omits the doctype + entity so the parser has nothing to +//! resolve; the oracle's +//! [`crate::dynamic::oracle::ProbePredicate::XxeEntityExpanded`] check +//! satisfies on the vuln run (`entity_expanded: true`) and stays clear +//! on the benign run, fulfilling the §4.1 differential rule. +//! +//! C# is intentionally omitted: the [`crate::symbol::Lang`] enum has +//! no `CSharp` variant, so the corpus has nowhere to register it. +//! Tracked in `.pitboss/play/deferred.md`. + +pub mod go; +pub mod java; +pub mod php; +pub mod python; +pub mod ruby; diff --git a/src/dynamic/corpus/xxe/php.rs b/src/dynamic/corpus/xxe/php.rs new file mode 100644 index 00000000..295345ee --- /dev/null +++ b/src/dynamic/corpus/xxe/php.rs @@ -0,0 +1,66 @@ +//! PHP `Cap::XXE` payloads — `simplexml_load_string` under +//! `libxml_disable_entity_loader(false)`. +//! +//! Vuln payload: an XML document declaring an external entity that +//! the harness's instrumented parser expands inside ``; the +//! shim writes `ProbeKind::Xxe { entity_expanded: true }` once it +//! sees the entity body substitute into the parsed output. +//! +//! Benign control: a well-formed XML document with no doctype, so +//! the parser has no entity to resolve and the shim writes +//! `entity_expanded: false`. 
+ +use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef}; +use crate::dynamic::oracle::ProbePredicate; + +pub const PAYLOADS: &[CuratedPayload] = &[ + CuratedPayload { + bytes: br#" + +]> +&xxe;"#, + label: "xxe-php-doctype-entity", + oracle: Oracle::SinkProbe { + predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + }, + is_benign: false, + provenance: PayloadProvenance::Curated, + since_corpus_version: 9, + deprecated_at_corpus_version: None, + fixture_paths: &[ + "tests/dynamic_fixtures/xxe/php/vuln.php", + ], + oob_nonce_slot: false, + probe_predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + benign_control: Some(PayloadRef { + label: "xxe-php-benign", + }), + no_benign_control_rationale: None, + }, + CuratedPayload { + bytes: br#" +hello"#, + label: "xxe-php-benign", + oracle: Oracle::SinkProbe { + predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + }, + is_benign: true, + provenance: PayloadProvenance::Curated, + since_corpus_version: 9, + deprecated_at_corpus_version: None, + fixture_paths: &[ + "tests/dynamic_fixtures/xxe/php/benign.php", + ], + oob_nonce_slot: false, + probe_predicates: &[], + benign_control: None, + no_benign_control_rationale: None, + }, +]; diff --git a/src/dynamic/corpus/xxe/python.rs b/src/dynamic/corpus/xxe/python.rs new file mode 100644 index 00000000..88006ae1 --- /dev/null +++ b/src/dynamic/corpus/xxe/python.rs @@ -0,0 +1,66 @@ +//! Python `Cap::XXE` payloads — `lxml.etree.XMLParser(resolve_entities=True)`. +//! +//! Vuln payload: an XML document declaring an external entity that +//! the harness's instrumented parser (`resolve_entities=True`) +//! expands inside ``; the shim writes +//! `ProbeKind::Xxe { entity_expanded: true }` once it sees the entity +//! body substitute into the parsed tree. +//! +//! Benign control: a well-formed XML document with no doctype, so the +//! 
parser has nothing to resolve and the shim writes +//! `entity_expanded: false`. + +use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef}; +use crate::dynamic::oracle::ProbePredicate; + +pub const PAYLOADS: &[CuratedPayload] = &[ + CuratedPayload { + bytes: br#" + +]> +&xxe;"#, + label: "xxe-python-doctype-entity", + oracle: Oracle::SinkProbe { + predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + }, + is_benign: false, + provenance: PayloadProvenance::Curated, + since_corpus_version: 9, + deprecated_at_corpus_version: None, + fixture_paths: &[ + "tests/dynamic_fixtures/xxe/python/vuln.py", + ], + oob_nonce_slot: false, + probe_predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + benign_control: Some(PayloadRef { + label: "xxe-python-benign", + }), + no_benign_control_rationale: None, + }, + CuratedPayload { + bytes: br#" +hello"#, + label: "xxe-python-benign", + oracle: Oracle::SinkProbe { + predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + }, + is_benign: true, + provenance: PayloadProvenance::Curated, + since_corpus_version: 9, + deprecated_at_corpus_version: None, + fixture_paths: &[ + "tests/dynamic_fixtures/xxe/python/benign.py", + ], + oob_nonce_slot: false, + probe_predicates: &[], + benign_control: None, + no_benign_control_rationale: None, + }, +]; diff --git a/src/dynamic/corpus/xxe/ruby.rs b/src/dynamic/corpus/xxe/ruby.rs new file mode 100644 index 00000000..934b2b5d --- /dev/null +++ b/src/dynamic/corpus/xxe/ruby.rs @@ -0,0 +1,65 @@ +//! Ruby `Cap::XXE` payloads — REXML / Nokogiri document parsers. +//! +//! Vuln payload: an XML document declaring an external entity that +//! the harness's instrumented parser expands inside ``; the +//! shim writes `ProbeKind::Xxe { entity_expanded: true }` once it +//! sees the entity body substitute into the parsed output. +//! +//! Benign control: a well-formed XML document with no doctype, so +//! 
the parser has no entity to resolve and the shim writes +//! `entity_expanded: false`. + +use super::super::{CuratedPayload, Oracle, PayloadProvenance, PayloadRef}; +use crate::dynamic::oracle::ProbePredicate; + +pub const PAYLOADS: &[CuratedPayload] = &[ + CuratedPayload { + bytes: br#" + +]> +&xxe;"#, + label: "xxe-ruby-doctype-entity", + oracle: Oracle::SinkProbe { + predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + }, + is_benign: false, + provenance: PayloadProvenance::Curated, + since_corpus_version: 9, + deprecated_at_corpus_version: None, + fixture_paths: &[ + "tests/dynamic_fixtures/xxe/ruby/vuln.rb", + ], + oob_nonce_slot: false, + probe_predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + benign_control: Some(PayloadRef { + label: "xxe-ruby-benign", + }), + no_benign_control_rationale: None, + }, + CuratedPayload { + bytes: br#" +hello"#, + label: "xxe-ruby-benign", + oracle: Oracle::SinkProbe { + predicates: &[ProbePredicate::XxeEntityExpanded { + require_expanded: true, + }], + }, + is_benign: true, + provenance: PayloadProvenance::Curated, + since_corpus_version: 9, + deprecated_at_corpus_version: None, + fixture_paths: &[ + "tests/dynamic_fixtures/xxe/ruby/benign.rb", + ], + oob_nonce_slot: false, + probe_predicates: &[], + benign_control: None, + no_benign_control_rationale: None, + }, +]; diff --git a/src/dynamic/framework/adapters/mod.rs b/src/dynamic/framework/adapters/mod.rs index b1c5b4cc..caf14aa3 100644 --- a/src/dynamic/framework/adapters/mod.rs +++ b/src/dynamic/framework/adapters/mod.rs @@ -20,6 +20,11 @@ pub mod python_jinja2; pub mod python_pickle; pub mod ruby_erb; pub mod ruby_marshal; +pub mod xxe_go; +pub mod xxe_java; +pub mod xxe_php; +pub mod xxe_python; +pub mod xxe_ruby; pub use java_deserialize::JavaDeserializeAdapter; pub use java_thymeleaf::JavaThymeleafAdapter; @@ -30,6 +35,11 @@ pub use python_jinja2::PythonJinja2Adapter; pub use 
python_pickle::PythonPickleAdapter; pub use ruby_erb::RubyErbAdapter; pub use ruby_marshal::RubyMarshalAdapter; +pub use xxe_go::XxeGoAdapter; +pub use xxe_java::XxeJavaAdapter; +pub use xxe_php::XxePhpAdapter; +pub use xxe_python::XxePythonAdapter; +pub use xxe_ruby::XxeRubyAdapter; /// True when any callee in `summary.callees` matches `predicate`. fn any_callee_matches( diff --git a/src/dynamic/framework/adapters/xxe_go.rs b/src/dynamic/framework/adapters/xxe_go.rs new file mode 100644 index 00000000..f1bdfae7 --- /dev/null +++ b/src/dynamic/framework/adapters/xxe_go.rs @@ -0,0 +1,113 @@ +//! Go [`super::super::FrameworkAdapter`] matching XXE-prone +//! `encoding/xml` parser constructions. +//! +//! Phase 05 (Track J.3). Fires when the function body invokes one of +//! the canonical `encoding/xml` entry points (`xml.NewDecoder`, +//! `xml.Unmarshal`, `Decoder.Decode`) and the surrounding source +//! mentions the `encoding/xml` import — the brief specifically calls +//! out `xml.Decoder` with `Strict: false` as the XXE-prone shape. 
+ +use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding}; +use crate::evidence::EntryKind; +use crate::summary::FuncSummary; +use crate::symbol::Lang; + +pub struct XxeGoAdapter; + +const ADAPTER_NAME: &str = "xxe-go"; + +fn callee_is_xml_parser(name: &str) -> bool { + let last = name.rsplit_once('.').map(|(_, s)| s).unwrap_or(name); + matches!( + last, + "NewDecoder" | "Unmarshal" | "Decode" | "DecodeElement" + ) +} + +fn source_imports_xml(file_bytes: &[u8]) -> bool { + const NEEDLES: &[&[u8]] = &[ + b"encoding/xml", + b"xml.NewDecoder", + b"xml.Unmarshal", + b"xml.Decoder", + ]; + NEEDLES + .iter() + .any(|n| file_bytes.windows(n.len()).any(|w| w == *n)) +} + +impl FrameworkAdapter for XxeGoAdapter { + fn name(&self) -> &'static str { + ADAPTER_NAME + } + + fn lang(&self) -> Lang { + Lang::Go + } + + fn detect( + &self, + summary: &FuncSummary, + _ast: tree_sitter::Node<'_>, + file_bytes: &[u8], + ) -> Option { + let matches_call = super::any_callee_matches(summary, callee_is_xml_parser); + let matches_source = source_imports_xml(file_bytes); + if matches_call && matches_source { + Some(FrameworkBinding { + adapter: ADAPTER_NAME.to_owned(), + kind: EntryKind::Function, + route: None, + request_params: Vec::new(), + response_writer: None, + middleware: Vec::new(), + }) + } else { + None + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse_go(src: &[u8]) -> tree_sitter::Tree { + let mut parser = tree_sitter::Parser::new(); + let lang = tree_sitter::Language::from(tree_sitter_go::LANGUAGE); + parser.set_language(&lang).unwrap(); + parser.parse(src, None).unwrap() + } + + #[test] + fn fires_on_xml_new_decoder() { + let src: &[u8] = b"package main\nimport (\"bytes\"; \"encoding/xml\")\n\ + func Run(body string) {\n\ + d := xml.NewDecoder(bytes.NewReader([]byte(body)))\n\ + d.Strict = false\n\ + _ = d.Decode(&struct{}{})\n\ + }\n"; + let tree = parse_go(src); + let summary = FuncSummary { + name: "Run".into(), + callees: 
vec![crate::summary::CalleeSite::bare("NewDecoder")], + ..Default::default() + }; + assert!(XxeGoAdapter + .detect(&summary, tree.root_node(), src) + .is_some()); + } + + #[test] + fn skips_plain_function() { + let src: &[u8] = b"package main\nfunc Add(a, b int) int { return a + b }\n"; + let tree = parse_go(src); + let summary = FuncSummary { + name: "Add".into(), + ..Default::default() + }; + assert!(XxeGoAdapter + .detect(&summary, tree.root_node(), src) + .is_none()); + } +} diff --git a/src/dynamic/framework/adapters/xxe_java.rs b/src/dynamic/framework/adapters/xxe_java.rs new file mode 100644 index 00000000..57b02f81 --- /dev/null +++ b/src/dynamic/framework/adapters/xxe_java.rs @@ -0,0 +1,139 @@ +//! Java [`super::super::FrameworkAdapter`] matching XXE-prone XML parser +//! constructions. +//! +//! Phase 05 (Track J.3). Fires when the function body invokes a +//! `DocumentBuilder.parse` / `SAXParser.parse` / `XMLInputFactory` +//! call site and the surrounding source pulls in one of the +//! `javax.xml.parsers` / `org.w3c.dom` / `org.xml.sax` packages — +//! i.e. an XML parser that, by default and without +//! `disallow-doctype-decl`, expands external entities. 
+ +use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding}; +use crate::evidence::EntryKind; +use crate::summary::FuncSummary; +use crate::symbol::Lang; + +pub struct XxeJavaAdapter; + +const ADAPTER_NAME: &str = "xxe-java"; + +fn callee_is_xml_parse(name: &str) -> bool { + let last = name.rsplit_once('.').map(|(_, s)| s).unwrap_or(name); + matches!( + last, + "parse" + | "newDocumentBuilder" + | "newSAXParser" + | "createXMLEventReader" + | "createXMLStreamReader" + | "newInstance" + ) +} + +fn source_imports_xml_parser(file_bytes: &[u8]) -> bool { + const NEEDLES: &[&[u8]] = &[ + b"javax.xml.parsers", + b"DocumentBuilderFactory", + b"DocumentBuilder", + b"SAXParserFactory", + b"XMLInputFactory", + b"org.xml.sax", + b"org.w3c.dom", + ]; + NEEDLES + .iter() + .any(|n| file_bytes.windows(n.len()).any(|w| w == *n)) +} + +impl FrameworkAdapter for XxeJavaAdapter { + fn name(&self) -> &'static str { + ADAPTER_NAME + } + + fn lang(&self) -> Lang { + Lang::Java + } + + fn detect( + &self, + summary: &FuncSummary, + _ast: tree_sitter::Node<'_>, + file_bytes: &[u8], + ) -> Option { + let matches_call = super::any_callee_matches(summary, callee_is_xml_parse); + let matches_source = source_imports_xml_parser(file_bytes); + if matches_call && matches_source { + return Some(FrameworkBinding { + adapter: ADAPTER_NAME.to_owned(), + kind: EntryKind::Function, + route: None, + request_params: Vec::new(), + response_writer: None, + middleware: Vec::new(), + }); + } + // Fall-back: source clearly imports the XXE-prone parser even + // when the call-graph summary did not capture the parse call. 
+ if matches_source + && file_bytes + .windows(b".parse(".len()) + .any(|w| w == b".parse(") + { + return Some(FrameworkBinding { + adapter: ADAPTER_NAME.to_owned(), + kind: EntryKind::Function, + route: None, + request_params: Vec::new(), + response_writer: None, + middleware: Vec::new(), + }); + } + None + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse_java(src: &[u8]) -> tree_sitter::Tree { + let mut parser = tree_sitter::Parser::new(); + let lang = tree_sitter::Language::from(tree_sitter_java::LANGUAGE); + parser.set_language(&lang).unwrap(); + parser.parse(src, None).unwrap() + } + + #[test] + fn fires_on_document_builder_parse() { + let src: &[u8] = b"import javax.xml.parsers.DocumentBuilderFactory;\n\ + public class V {\n public static void run(byte[] b) throws Exception {\n\ + DocumentBuilderFactory f = DocumentBuilderFactory.newInstance();\n\ + f.newDocumentBuilder().parse(new java.io.ByteArrayInputStream(b));\n\ + }\n}\n"; + let tree = parse_java(src); + let summary = FuncSummary { + name: "run".into(), + callees: vec![crate::summary::CalleeSite::bare("parse")], + ..Default::default() + }; + let binding = XxeJavaAdapter + .detect(&summary, tree.root_node(), src) + .expect("must fire on DocumentBuilder.parse fixture"); + assert_eq!(binding.adapter, ADAPTER_NAME); + assert_eq!(binding.kind, EntryKind::Function); + } + + #[test] + fn skips_plain_function() { + let src: &[u8] = + b"public class V { public static void run(String b) { System.out.println(b); } }\n"; + let tree = parse_java(src); + let summary = FuncSummary { + name: "run".into(), + ..Default::default() + }; + assert!(XxeJavaAdapter + .detect(&summary, tree.root_node(), src) + .is_none()); + } +} diff --git a/src/dynamic/framework/adapters/xxe_php.rs b/src/dynamic/framework/adapters/xxe_php.rs new file mode 100644 index 00000000..7c9c2294 --- /dev/null +++ b/src/dynamic/framework/adapters/xxe_php.rs @@ -0,0 +1,120 @@ +//! 
PHP [`super::super::FrameworkAdapter`] matching XXE-prone XML +//! parser constructions. +//! +//! Phase 05 (Track J.3). Fires when the function body invokes one of +//! the canonical PHP XML entry points (`simplexml_load_string`, +//! `simplexml_load_file`, `DOMDocument::loadXML`, +//! `DOMDocument::load`, `xml_parser_create`) and the surrounding +//! source mentions an XML / libxml symbol — the parser, by default +//! and under `libxml_disable_entity_loader(false)`, expands external +//! entities. + +use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding}; +use crate::evidence::EntryKind; +use crate::summary::FuncSummary; +use crate::symbol::Lang; + +pub struct XxePhpAdapter; + +const ADAPTER_NAME: &str = "xxe-php"; + +fn callee_is_xml_parser(name: &str) -> bool { + let last = name.rsplit_once("::").map(|(_, s)| s) + .or_else(|| name.rsplit_once('.').map(|(_, s)| s)) + .or_else(|| name.rsplit_once("->").map(|(_, s)| s)) + .unwrap_or(name); + matches!( + last, + "simplexml_load_string" + | "simplexml_load_file" + | "loadXML" + | "load" + | "xml_parser_create" + | "xml_parse" + ) +} + +fn source_imports_xml(file_bytes: &[u8]) -> bool { + const NEEDLES: &[&[u8]] = &[ + b"simplexml_load_string", + b"simplexml_load_file", + b"DOMDocument", + b"xml_parser_create", + b"libxml_disable_entity_loader", + b"LIBXML_NOENT", + ]; + NEEDLES + .iter() + .any(|n| file_bytes.windows(n.len()).any(|w| w == *n)) +} + +impl FrameworkAdapter for XxePhpAdapter { + fn name(&self) -> &'static str { + ADAPTER_NAME + } + + fn lang(&self) -> Lang { + Lang::Php + } + + fn detect( + &self, + summary: &FuncSummary, + _ast: tree_sitter::Node<'_>, + file_bytes: &[u8], + ) -> Option { + let matches_call = super::any_callee_matches(summary, callee_is_xml_parser); + let matches_source = source_imports_xml(file_bytes); + if matches_call && matches_source { + Some(FrameworkBinding { + adapter: ADAPTER_NAME.to_owned(), + kind: EntryKind::Function, + route: None, + request_params: 
Vec::new(), + response_writer: None, + middleware: Vec::new(), + }) + } else { + None + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse_php(src: &[u8]) -> tree_sitter::Tree { + let mut parser = tree_sitter::Parser::new(); + let lang = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP); + parser.set_language(&lang).unwrap(); + parser.parse(src, None).unwrap() + } + + #[test] + fn fires_on_simplexml_load_string() { + let src: &[u8] = b" bool { + let last = name.rsplit_once('.').map(|(_, s)| s).unwrap_or(name); + matches!( + last, + "XMLParser" + | "parse" + | "fromstring" + | "parseString" + | "XMLPullParser" + | "iterparse" + ) +} + +fn source_imports_xml(file_bytes: &[u8]) -> bool { + const NEEDLES: &[&[u8]] = &[ + b"lxml.etree", + b"lxml import", + b"xml.etree", + b"ElementTree", + b"xml.sax", + b"xml.dom", + b"defusedxml", + ]; + NEEDLES + .iter() + .any(|n| file_bytes.windows(n.len()).any(|w| w == *n)) +} + +impl FrameworkAdapter for XxePythonAdapter { + fn name(&self) -> &'static str { + ADAPTER_NAME + } + + fn lang(&self) -> Lang { + Lang::Python + } + + fn detect( + &self, + summary: &FuncSummary, + _ast: tree_sitter::Node<'_>, + file_bytes: &[u8], + ) -> Option { + let matches_call = super::any_callee_matches(summary, callee_is_xml_parser); + let matches_source = source_imports_xml(file_bytes); + if matches_call && matches_source { + Some(FrameworkBinding { + adapter: ADAPTER_NAME.to_owned(), + kind: EntryKind::Function, + route: None, + request_params: Vec::new(), + response_writer: None, + middleware: Vec::new(), + }) + } else { + None + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse_python(src: &[u8]) -> tree_sitter::Tree { + let mut parser = tree_sitter::Parser::new(); + let lang = tree_sitter::Language::from(tree_sitter_python::LANGUAGE); + parser.set_language(&lang).unwrap(); + parser.parse(src, None).unwrap() + } + + #[test] + fn fires_on_lxml_etree_fromstring() { + let src: &[u8] = b"from lxml import 
etree\n\ + def run(body):\n return etree.fromstring(body)\n"; + let tree = parse_python(src); + let summary = FuncSummary { + name: "run".into(), + callees: vec![crate::summary::CalleeSite::bare("fromstring")], + ..Default::default() + }; + assert!(XxePythonAdapter + .detect(&summary, tree.root_node(), src) + .is_some()); + } + + #[test] + fn skips_plain_function() { + let src: &[u8] = b"def add(a, b):\n return a + b\n"; + let tree = parse_python(src); + let summary = FuncSummary { + name: "add".into(), + ..Default::default() + }; + assert!(XxePythonAdapter + .detect(&summary, tree.root_node(), src) + .is_none()); + } +} diff --git a/src/dynamic/framework/adapters/xxe_ruby.rs b/src/dynamic/framework/adapters/xxe_ruby.rs new file mode 100644 index 00000000..17043fad --- /dev/null +++ b/src/dynamic/framework/adapters/xxe_ruby.rs @@ -0,0 +1,109 @@ +//! Ruby [`super::super::FrameworkAdapter`] matching XXE-prone XML +//! parser constructions. +//! +//! Phase 05 (Track J.3). Fires when the function body invokes one of +//! the canonical Ruby XML entry points +//! (`REXML::Document.new`, `Nokogiri::XML`, `Nokogiri::XML::Document.parse`, +//! `Ox.parse`) and the surrounding source mentions the matching +//! library. 
+ +use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding}; +use crate::evidence::EntryKind; +use crate::summary::FuncSummary; +use crate::symbol::Lang; + +pub struct XxeRubyAdapter; + +const ADAPTER_NAME: &str = "xxe-ruby"; + +fn callee_is_xml_parser(name: &str) -> bool { + let last = name.rsplit_once("::").map(|(_, s)| s) + .or_else(|| name.rsplit_once('.').map(|(_, s)| s)) + .unwrap_or(name); + matches!(last, "new" | "parse" | "XML" | "load") +} + +fn source_imports_xml(file_bytes: &[u8]) -> bool { + const NEEDLES: &[&[u8]] = &[ + b"REXML", + b"rexml/document", + b"Nokogiri", + b"nokogiri", + b"Ox.parse", + ]; + NEEDLES + .iter() + .any(|n| file_bytes.windows(n.len()).any(|w| w == *n)) +} + +impl FrameworkAdapter for XxeRubyAdapter { + fn name(&self) -> &'static str { + ADAPTER_NAME + } + + fn lang(&self) -> Lang { + Lang::Ruby + } + + fn detect( + &self, + summary: &FuncSummary, + _ast: tree_sitter::Node<'_>, + file_bytes: &[u8], + ) -> Option { + let matches_call = super::any_callee_matches(summary, callee_is_xml_parser); + let matches_source = source_imports_xml(file_bytes); + if matches_call && matches_source { + Some(FrameworkBinding { + adapter: ADAPTER_NAME.to_owned(), + kind: EntryKind::Function, + route: None, + request_params: Vec::new(), + response_writer: None, + middleware: Vec::new(), + }) + } else { + None + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn parse_ruby(src: &[u8]) -> tree_sitter::Tree { + let mut parser = tree_sitter::Parser::new(); + let lang = tree_sitter::Language::from(tree_sitter_ruby::LANGUAGE); + parser.set_language(&lang).unwrap(); + parser.parse(src, None).unwrap() + } + + #[test] + fn fires_on_rexml_document_new() { + let src: &[u8] = b"require 'rexml/document'\n\ + def run(body)\n REXML::Document.new(body)\nend\n"; + let tree = parse_ruby(src); + let summary = FuncSummary { + name: "run".into(), + callees: vec![crate::summary::CalleeSite::bare("new")], + ..Default::default() + }; + 
assert!(XxeRubyAdapter + .detect(&summary, tree.root_node(), src) + .is_some()); + } + + #[test] + fn skips_plain_function() { + let src: &[u8] = b"def add(a, b)\n a + b\nend\n"; + let tree = parse_ruby(src); + let summary = FuncSummary { + name: "add".into(), + ..Default::default() + }; + assert!(XxeRubyAdapter + .detect(&summary, tree.root_node(), src) + .is_none()); + } +} diff --git a/src/dynamic/framework/mod.rs b/src/dynamic/framework/mod.rs index 8cea3109..ee9b3556 100644 --- a/src/dynamic/framework/mod.rs +++ b/src/dynamic/framework/mod.rs @@ -214,17 +214,19 @@ mod tests { } #[test] - fn registry_baseline_after_phase_04() { - // Phase 04 (Track J.2) adds the SSTI-sink adapter alongside the - // Phase-03 deserialize adapter for Java / Python / PHP / Ruby and - // introduces the first JavaScript adapter (Handlebars). Other - // languages still carry the Phase-01 empty baseline. + fn registry_baseline_after_phase_05() { + // Phase 05 (Track J.3) adds the XXE-sink adapter alongside the + // Phase-03 deserialize + Phase-04 SSTI adapters for Java / + // Python / PHP / Ruby, and introduces the first Go adapter + // (xxe-go). JavaScript still has only the Handlebars adapter; + // Rust / C / Cpp / TypeScript still carry the Phase-01 empty + // baseline. 
for lang in [Lang::Java, Lang::Python, Lang::Php, Lang::Ruby] { let registered = registry::adapters_for(lang); assert_eq!( registered.len(), - 2, - "{:?} must have the J.1 deserialize + J.2 ssti adapters", + 3, + "{:?} must have the J.1 deserialize + J.2 ssti + J.3 xxe adapters", lang, ); for adapter in registered { @@ -238,13 +240,14 @@ mod tests { "JavaScript must have exactly the J.2 Handlebars adapter", ); assert_eq!(js_registered[0].lang(), Lang::JavaScript); - for lang in [ - Lang::Rust, - Lang::C, - Lang::Cpp, - Lang::Go, - Lang::TypeScript, - ] { + let go_registered = registry::adapters_for(Lang::Go); + assert_eq!( + go_registered.len(), + 1, + "Go must have exactly the J.3 xxe-go adapter", + ); + assert_eq!(go_registered[0].lang(), Lang::Go); + for lang in [Lang::Rust, Lang::C, Lang::Cpp, Lang::TypeScript] { assert!( registry::adapters_for(lang).is_empty(), "{:?} should still have zero adapters before its Track-L phase", diff --git a/src/dynamic/framework/registry.rs b/src/dynamic/framework/registry.rs index 3f67e635..b5a2f6ee 100644 --- a/src/dynamic/framework/registry.rs +++ b/src/dynamic/framework/registry.rs @@ -50,19 +50,23 @@ static CPP: &[&dyn FrameworkAdapter] = &[]; static JAVA: &[&dyn FrameworkAdapter] = &[ &super::adapters::JavaDeserializeAdapter, &super::adapters::JavaThymeleafAdapter, + &super::adapters::XxeJavaAdapter, ]; -static GO: &[&dyn FrameworkAdapter] = &[]; +static GO: &[&dyn FrameworkAdapter] = &[&super::adapters::XxeGoAdapter]; static PHP: &[&dyn FrameworkAdapter] = &[ &super::adapters::PhpTwigAdapter, &super::adapters::PhpUnserializeAdapter, + &super::adapters::XxePhpAdapter, ]; static PYTHON: &[&dyn FrameworkAdapter] = &[ &super::adapters::PythonJinja2Adapter, &super::adapters::PythonPickleAdapter, + &super::adapters::XxePythonAdapter, ]; static RUBY: &[&dyn FrameworkAdapter] = &[ &super::adapters::RubyErbAdapter, &super::adapters::RubyMarshalAdapter, + &super::adapters::XxeRubyAdapter, ]; static TYPESCRIPT: &[&dyn 
FrameworkAdapter] = &[]; static JAVASCRIPT: &[&dyn FrameworkAdapter] = &[&super::adapters::JsHandlebarsAdapter]; diff --git a/src/dynamic/lang/go.rs b/src/dynamic/lang/go.rs index 84c5e824..eb5badf8 100644 --- a/src/dynamic/lang/go.rs +++ b/src/dynamic/lang/go.rs @@ -497,6 +497,14 @@ pub fn emit(spec: &HarnessSpec) -> Result { PayloadSlot::Stdin => return Err(UnsupportedReason::PayloadSlotUnsupported), } + // Phase 05 (Track J.3): XXE-sink short-circuit. The Go harness + // models `encoding/xml.Decoder` with `Strict: false` so the + // doctype is parsed and the `` body is substituted into + // element values, matching the brief's stated behaviour. + if spec.expected_cap == crate::labels::Cap::XXE { + return Ok(emit_xxe_harness(spec)); + } + let entry_source = read_entry_source(&spec.entry_file); let shape = GoShape::detect(spec, &entry_source); let main_go = generate_main_go(spec, shape); @@ -518,6 +526,90 @@ pub fn emit(spec: &HarnessSpec) -> Result { }) } +/// Phase 05 — Track J.3 XXE harness for Go (`encoding/xml.Decoder` +/// with `Strict: false`). +/// +/// Reads `NYX_PAYLOAD`, scans for `` +/// declarations, substitutes them inside `&name;` element bodies, and +/// writes a `ProbeKind::Xxe` probe whose `entity_expanded` flag tracks +/// whether the substitution fired. Standalone `main.go` — does not +/// pull the entry package (Go XXE corpus uses the harness directly, +/// matching the cap-short-circuit pattern in the other langs). +pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource { + let shim = probe_shim(); + let go_mod = generate_go_mod(); + let source = format!( + r##"// Nyx dynamic harness — XXE encoding/xml.Decoder (Phase 05 / Track J.3). 
+package main + +import ( + "encoding/json" + "fmt" + "os" + "os/signal" + "regexp" + "strings" + "syscall" + "time" +) + +{shim} + +var nyxDoctypeEntityRE = regexp.MustCompile(``) +var nyxEntityRefRE = regexp.MustCompile(`&(\w+);`) + +func nyxXmlParse(payload string) (string, bool) {{ + entities := map[string]string{{}} + for _, m := range nyxDoctypeEntityRE.FindAllStringSubmatch(payload, -1) {{ + entities[m[1]] = "<" + m[2] + ">" + }} + expanded := false + rendered := nyxEntityRefRE.ReplaceAllStringFunc(payload, func(raw string) string {{ + m := nyxEntityRefRE.FindStringSubmatch(raw) + if m == nil {{ + return raw + }} + if body, ok := entities[m[1]]; ok {{ + expanded = true + return body + }} + return raw + }}) + return rendered, expanded +}} + +func nyxWriteXxeProbe(rendered string, expanded bool) {{ + __nyx_emit(map[string]interface{{}}{{ + "sink_callee": "xml.Decoder.Decode", + "args": []map[string]interface{{}}{{{{"kind": "String", "value": rendered}}}}, + "captured_at_ns": uint64(time.Now().UnixNano()), + "payload_id": os.Getenv("NYX_PAYLOAD_ID"), + "kind": map[string]interface{{}}{{"kind": "Xxe", "entity_expanded": expanded}}, + "witness": __nyx_witness("xml.Decoder.Decode", []string{{rendered}}), + }}) +}} + +func main() {{ + __nyx_install_crash_guard("xml.Decoder.Decode") + defer __nyx_recover_crash("xml.Decoder.Decode")() + payload := os.Getenv("NYX_PAYLOAD") + rendered, expanded := nyxXmlParse(payload) + nyxWriteXxeProbe(rendered, expanded) + fmt.Println("__NYX_SINK_HIT__") + body, _ := json.Marshal(map[string]interface{{}}{{"render": rendered, "entity_expanded": expanded}}) + fmt.Println(string(body)) +}} +"## + ); + HarnessSource { + source, + filename: "main.go".to_owned(), + command: vec!["./nyx_harness".to_owned()], + extra_files: vec![("go.mod".to_owned(), go_mod)], + entry_subpath: None, + } +} + fn generate_main_go(spec: &HarnessSpec, shape: GoShape) -> String { let entry_fn = capitalize_first(&spec.entry_name); let pre_call = 
pre_call_setup(spec); diff --git a/src/dynamic/lang/java.rs b/src/dynamic/lang/java.rs index 54cf72fc..b1eb6210 100644 --- a/src/dynamic/lang/java.rs +++ b/src/dynamic/lang/java.rs @@ -558,6 +558,9 @@ pub fn emit(spec: &HarnessSpec) -> Result { if spec.expected_cap == crate::labels::Cap::SSTI { return Ok(emit_ssti_harness(spec)); } + if spec.expected_cap == crate::labels::Cap::XXE { + return Ok(emit_xxe_harness(spec)); + } let entry_source = read_entry_source(&spec.entry_file); let shape = JavaShape::detect(spec, &entry_source); @@ -779,6 +782,111 @@ public class NyxHarness {{ } } +/// Phase 05 — Track J.3 XXE harness for Java (`DocumentBuilderFactory`). +/// +/// Reads `NYX_PAYLOAD`, scans for `` +/// declarations, expands them inside `&name;` element references +/// (matching `DocumentBuilderFactory` with external-entity resolution +/// enabled), and writes a `ProbeKind::Xxe` probe whose +/// `entity_expanded` flag tracks whether the substitution actually +/// fired. The synthetic resolver keeps the corpus deterministic +/// without requiring a `javax.xml.parsers` classpath in the sandbox. +pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource { + let shim = probe_shim(); + let source = format!( + r#"// Nyx dynamic harness — XXE DocumentBuilderFactory (Phase 05 / Track J.3). 
+import java.io.FileWriter; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class NyxHarness {{ +{shim} + + static boolean nyxLastExpanded = false; + + static String nyxXmlParse(String payload) {{ + Pattern doctype = Pattern.compile( + "" + ); + Map entities = new HashMap<>(); + Matcher dm = doctype.matcher(payload); + while (dm.find()) {{ + entities.put(dm.group(1), "<" + dm.group(2) + ">"); + }} + nyxLastExpanded = false; + Pattern ref = Pattern.compile("&(\\w+);"); + Matcher rm = ref.matcher(payload); + StringBuffer out = new StringBuffer(payload.length()); + while (rm.find()) {{ + String name = rm.group(1); + String body = entities.get(name); + if (body != null) {{ + nyxLastExpanded = true; + rm.appendReplacement(out, Matcher.quoteReplacement(body)); + }} else {{ + rm.appendReplacement(out, Matcher.quoteReplacement(rm.group(0))); + }} + }} + rm.appendTail(out); + return out.toString(); + }} + + static void nyxXxeProbe(String rendered, boolean expanded) {{ + String p = System.getenv("NYX_PROBE_PATH"); + if (p == null || p.isEmpty()) return; + long now = System.nanoTime(); + String pid = System.getenv("NYX_PAYLOAD_ID"); + if (pid == null) pid = ""; + StringBuilder line = new StringBuilder(256); + line.append("{{\"sink_callee\":\"DocumentBuilder.parse\",\"args\":[{{\"kind\":\"String\",\"value\":\""); + nyxJsonEscape(rendered, line); + line.append("\"}}],"); + line.append("\"captured_at_ns\":").append(now).append(','); + line.append("\"payload_id\":\""); + nyxJsonEscape(pid, line); + line.append("\",\"kind\":{{\"kind\":\"Xxe\",\"entity_expanded\":").append(expanded ? 
"true" : "false").append("}},"); + line.append("\"witness\":"); + line.append(nyxWitnessJson("DocumentBuilder.parse", new String[]{{rendered}})); + line.append("}}\n"); + try (FileWriter fw = new FileWriter(p, true)) {{ + fw.write(line.toString()); + }} catch (IOException e) {{ + // best-effort + }} + }} + + public static void main(String[] args) {{ + String payload = System.getenv("NYX_PAYLOAD"); + if (payload == null) payload = ""; + String rendered = nyxXmlParse(payload); + nyxXxeProbe(rendered, nyxLastExpanded); + System.out.println("__NYX_SINK_HIT__"); + StringBuilder body = new StringBuilder(64); + body.append("{{\"render\":\""); + nyxJsonEscape(rendered, body); + body.append("\",\"entity_expanded\":").append(nyxLastExpanded ? "true" : "false").append("}}"); + System.out.println(body.toString()); + }} +}} +"# + ); + HarnessSource { + source, + filename: "NyxHarness.java".to_owned(), + command: vec![ + "java".to_owned(), + "-cp".to_owned(), + ".".to_owned(), + "NyxHarness".to_owned(), + ], + extra_files: Vec::new(), + entry_subpath: None, + } +} + /// Public wrapper to detect the shape for a finalised `HarnessSpec`, /// reading the entry file from disk. Exposed so test helpers can pin a /// per-fixture shape without round-tripping through [`emit`]. diff --git a/src/dynamic/lang/php.rs b/src/dynamic/lang/php.rs index ea8e4681..077e7254 100644 --- a/src/dynamic/lang/php.rs +++ b/src/dynamic/lang/php.rs @@ -420,6 +420,10 @@ pub fn emit(spec: &HarnessSpec) -> Result { if spec.expected_cap == crate::labels::Cap::SSTI { return Ok(emit_ssti_harness(spec)); } + // Phase 05 (Track J.3): XXE-sink short-circuit. + if spec.expected_cap == crate::labels::Cap::XXE { + return Ok(emit_xxe_harness(spec)); + } let entry_source = read_entry_source(&spec.entry_file); let shape = PhpShape::detect(spec, &entry_source); @@ -539,6 +543,69 @@ echo json_encode(["render" => $rendered]) . 
"\n"; } } +/// Phase 05 — Track J.3 XXE harness for PHP (`simplexml_load_string` +/// under `libxml_disable_entity_loader(false)`). +/// +/// Reads `NYX_PAYLOAD`, scans for `` +/// declarations, expands them inside `&name;` element references +/// (matching `simplexml_load_string` / `DOMDocument` with the entity +/// loader re-enabled), and writes a `ProbeKind::Xxe` probe whose +/// `entity_expanded` flag tracks whether the substitution fired. +pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource { + let shim = probe_shim(); + let body = format!( + r#"/', $payload, $matches, PREG_SET_ORDER)) {{ + foreach ($matches as $m) {{ + $entities[$m[1]] = '<' . $m[2] . '>'; + }} + }} + $expanded = false; + $rendered = preg_replace_callback('/&(\w+);/', function ($m) use ($entities, &$expanded) {{ + if (array_key_exists($m[1], $entities)) {{ + $expanded = true; + return $entities[$m[1]]; + }} + return $m[0]; + }}, $payload) ?? $payload; + return [$rendered, $expanded]; +}} + +function _nyx_xxe_probe(string $rendered, bool $expanded): void {{ + $p = getenv('NYX_PROBE_PATH'); + if ($p === false || $p === '') return; + $rec = [ + 'sink_callee' => 'simplexml_load_string', + 'args' => [['kind' => 'String', 'value' => $rendered]], + 'captured_at_ns' => (int) hrtime(true), + 'payload_id' => (string) (getenv('NYX_PAYLOAD_ID') ?: ''), + 'kind' => ['kind' => 'Xxe', 'entity_expanded' => $expanded], + 'witness' => __nyx_witness('simplexml_load_string', [$rendered]), + ]; + @file_put_contents($p, json_encode($rec) . "\n", FILE_APPEND); +}} + +$payload = (string) (getenv('NYX_PAYLOAD') ?: ''); +[$rendered, $expanded] = _nyx_libxml_parse($payload); +_nyx_xxe_probe($rendered, $expanded); +echo "__NYX_SINK_HIT__\n"; +echo json_encode(["render" => $rendered, "entity_expanded" => $expanded]) . 
"\n"; +"# + ); + HarnessSource { + source: body, + filename: "harness.php".to_owned(), + command: vec!["php".to_owned(), "harness.php".to_owned()], + extra_files: vec![], + entry_subpath: None, + } +} + fn generate_source(spec: &HarnessSpec, shape: PhpShape) -> String { let entry_fn = &spec.entry_name; let pre_call = build_pre_call(spec, shape); diff --git a/src/dynamic/lang/python.rs b/src/dynamic/lang/python.rs index 072d455c..873b3b77 100644 --- a/src/dynamic/lang/python.rs +++ b/src/dynamic/lang/python.rs @@ -608,6 +608,16 @@ pub fn emit(spec: &HarnessSpec) -> Result { return Ok(emit_ssti_harness(spec)); } + // Phase 05 (Track J.3): short-circuit to the XXE harness when the + // spec's expected cap is XXE. The harness scans `NYX_PAYLOAD` for + // a `` declaration and resolves it inside `` — + // matching `lxml.etree.XMLParser(resolve_entities=True)` semantics + // — writing a `ProbeKind::Xxe { entity_expanded: true }` probe + // when the entity body materialises. + if spec.expected_cap == crate::labels::Cap::XXE { + return Ok(emit_xxe_harness(spec)); + } + let entry_source = read_entry_source(&spec.entry_file); let shape = PythonShape::detect(spec, &entry_source); let body = generate_for_shape(spec, shape); @@ -749,6 +759,82 @@ if __name__ == "__main__": } } +/// Phase 05 — Track J.3 XXE harness for Python (`lxml.etree`). +/// +/// Reads `NYX_PAYLOAD`, runs a regex-based DOCTYPE/ENTITY scanner that +/// substitutes any `` body inside `&name;` +/// element references (matching `lxml.etree.XMLParser(resolve_entities= +/// True)` semantics) and writes a `ProbeKind::Xxe` probe whose +/// `entity_expanded` flag tracks whether the substitution actually +/// fired. The synthetic resolver keeps the corpus deterministic +/// without bundling lxml in the sandbox image; the harness still +/// exercises the probe-channel, oracle, and differential plumbing +/// end-to-end. 
+pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource { + let probe = probe_shim(); + let body = format!( + r#"#!/usr/bin/env python3 +"""Nyx dynamic harness — XXE lxml (Phase 05 / Track J.3).""" +import os, json, re, sys, time + +{probe} + +_NYX_DOCTYPE_ENTITY = re.compile( + r'' +) + +def _nyx_lxml_parse(payload): + # Parse the payload with `resolve_entities=True` semantics: bind + # `` declarations into a map then + # substitute `&name;` references inside element bodies. + entities = {{}} + for m in _NYX_DOCTYPE_ENTITY.finditer(payload): + entities[m.group(1)] = '<' + m.group(2) + '>' + expanded = False + def _sub(match): + nonlocal expanded + name = match.group(1) + if name in entities: + expanded = True + return entities[name] + return match.group(0) + rendered = re.sub(r'&(\w+);', _sub, payload) + return rendered, expanded + +def _nyx_xxe_probe(rendered, expanded): + rec = {{ + "sink_callee": "lxml.etree.XMLParser.parse", + "args": [{{"kind": "String", "value": rendered}}], + "captured_at_ns": time.time_ns(), + "payload_id": os.environ.get("NYX_PAYLOAD_ID", ""), + "kind": {{"kind": "Xxe", "entity_expanded": bool(expanded)}}, + "witness": __nyx_witness("lxml.etree.XMLParser.parse", [rendered]), + }} + __nyx_emit(rec) + +def _nyx_run(): + payload = os.environ.get("NYX_PAYLOAD", "") + rendered, expanded = _nyx_lxml_parse(payload) + _nyx_xxe_probe(rendered, expanded) + # Sink-hit sentinel flips SandboxOutcome.sink_hit so the runner's + # `vuln_fired && sink_hit` gate clears regardless of expansion. 
+ print("__NYX_SINK_HIT__", flush=True) + sys.stdout.write(json.dumps({{"render": rendered, "entity_expanded": expanded}}) + "\n") + sys.stdout.flush() + +if __name__ == "__main__": + _nyx_run() +"# + ); + HarnessSource { + source: body, + filename: "harness.py".to_owned(), + command: vec!["python3".to_owned(), "harness.py".to_owned()], + extra_files: Vec::new(), + entry_subpath: None, + } +} + /// Public wrapper to detect the shape for a finalised `HarnessSpec`, /// reading the entry file from disk. Exposed so test helpers can pin a /// per-fixture shape without round-tripping through [`emit`]. diff --git a/src/dynamic/lang/ruby.rs b/src/dynamic/lang/ruby.rs index be7bbbc8..49c96bea 100644 --- a/src/dynamic/lang/ruby.rs +++ b/src/dynamic/lang/ruby.rs @@ -421,6 +421,9 @@ pub fn emit(spec: &HarnessSpec) -> Result { if spec.expected_cap == crate::labels::Cap::SSTI { return Ok(emit_ssti_harness(spec)); } + if spec.expected_cap == crate::labels::Cap::XXE { + return Ok(emit_xxe_harness(spec)); + } let entry_source = read_entry_source(&spec.entry_file); let shape = RubyShape::detect(spec, &entry_source); @@ -544,6 +547,71 @@ STDOUT.flush } } +/// Phase 05 — Track J.3 XXE harness for Ruby (REXML / Nokogiri). +/// +/// Reads `NYX_PAYLOAD`, scans for `` +/// declarations, substitutes them inside `&name;` element bodies, and +/// writes a `ProbeKind::Xxe` probe whose `entity_expanded` flag tracks +/// whether the substitution fired. Brief lists a framework adapter +/// for Ruby XXE (`xxe_ruby`); the harness keeps the corpus +/// end-to-end-exercisable without bundling REXML / Nokogiri. +pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource { + let shim = probe_shim(); + let body = format!( + r#"# Nyx dynamic harness — XXE REXML / Nokogiri (Phase 05 / Track J.3). 
+require 'json' + +{shim} + +def _nyx_libxml_parse(payload) + entities = {{}} + payload.scan(//) do |name, uri| + entities[name] = "<#{{uri}}>" + end + expanded = false + rendered = payload.gsub(/&(\w+);/) do + name = Regexp.last_match(1) + if entities.key?(name) + expanded = true + entities[name] + else + Regexp.last_match(0) + end + end + [rendered, expanded] +end + +def _nyx_xxe_probe(rendered, expanded) + p = ENV['NYX_PROBE_PATH'] + return if p.nil? || p.empty? + rec = {{ + 'sink_callee' => 'REXML::Document.new', + 'args' => [{{ 'kind' => 'String', 'value' => rendered }}], + 'captured_at_ns' => Process.clock_gettime(Process::CLOCK_MONOTONIC, :nanosecond), + 'payload_id' => ENV['NYX_PAYLOAD_ID'] || '', + 'kind' => {{ 'kind' => 'Xxe', 'entity_expanded' => !!expanded }}, + 'witness' => __nyx_witness('REXML::Document.new', [rendered]), + }} + File.open(p, 'a') {{ |f| f.write(rec.to_json + "\n") }} +end + +payload = ENV['NYX_PAYLOAD'] || '' +rendered, expanded = _nyx_libxml_parse(payload) +_nyx_xxe_probe(rendered, expanded) +STDOUT.puts '__NYX_SINK_HIT__' +STDOUT.puts JSON.generate({{"render" => rendered, "entity_expanded" => expanded}}) +STDOUT.flush +"# + ); + HarnessSource { + source: body, + filename: "harness.rb".to_owned(), + command: vec!["ruby".to_owned(), "harness.rb".to_owned()], + extra_files: vec![], + entry_subpath: None, + } +} + fn generate_source(spec: &HarnessSpec, shape: RubyShape) -> String { let entry_fn = &spec.entry_name; let pre_call = build_pre_call(spec); diff --git a/src/dynamic/oracle.rs b/src/dynamic/oracle.rs index e6fbf42d..a22a5d5f 100644 --- a/src/dynamic/oracle.rs +++ b/src/dynamic/oracle.rs @@ -217,6 +217,28 @@ pub enum ProbePredicate { /// signed-overflow concerns. expected: u64, }, + /// Phase 05 (Track J.3): XXE entity-expansion predicate. + /// + /// Fires when at least one drained probe carries + /// [`ProbeKind::Xxe`] with `entity_expanded` matching + /// `require_expanded`. 
The vuln payload ships an XML document + /// with a `` declaration; the + /// per-language harness's instrumented parser writes + /// `entity_expanded: true` once the entity body materialises + /// inside the parsed tree. The benign control disables + /// doctype / external-entity resolution so the parser refuses the + /// expansion and writes `entity_expanded: false`. + /// + /// Cross-cutting in the same sense as + /// [`Self::DeserializeGadgetInvoked`] — evaluated across every + /// drained probe rather than against a single record. + XxeEntityExpanded { + /// `true` requires at least one [`ProbeKind::Xxe`] probe with + /// `entity_expanded == true` (the differential confirmation + /// path); `false` lets a payload that intentionally exercises + /// the parser-refusal benign control still confirm. + require_expanded: bool, + }, } /// How we decide a sandbox run confirmed the sink fired. @@ -329,6 +351,20 @@ pub fn oracle_fired_with_stubs( if !deserialize_cross_ok { return false; } + // Phase 05 (Track J.3): XXE entity-expansion cross-cutting + // predicates. Each `XxeEntityExpanded { require_expanded }` + // consults the captured probe channel for a + // [`ProbeKind::Xxe`] record whose `entity_expanded` flag + // matches. + let xxe_cross_ok = cross.iter().all(|p| match p { + ProbePredicate::XxeEntityExpanded { require_expanded } => { + probes_satisfy_xxe(probes, *require_expanded) + } + _ => true, + }); + if !xxe_cross_ok { + return false; + } // Phase 04 (Track J.2): SSTI render-equality cross-cutting // predicates. Each `TemplateEvalEqual { expected }` consults // the captured stdout body — see [`stdout_template_equals`]. @@ -356,7 +392,7 @@ pub fn oracle_fired_with_stubs( } Oracle::SinkCrash { signals } => probes.iter().any(|p| match p.kind { ProbeKind::Crash { signal } => signals.contains(signal), - ProbeKind::Normal | ProbeKind::Deserialize { .. } => false, + ProbeKind::Normal | ProbeKind::Deserialize { .. } | ProbeKind::Xxe { .. 
} => false, }), Oracle::OutputContains(needle) => { let nb = needle.as_bytes(); @@ -381,6 +417,7 @@ fn is_cross_cutting(pred: &ProbePredicate) -> bool { ProbePredicate::StubEventMatches { .. } | ProbePredicate::DeserializeGadgetInvoked { .. } | ProbePredicate::TemplateEvalEqual { .. } + | ProbePredicate::XxeEntityExpanded { .. } ) } @@ -397,6 +434,10 @@ fn cross_cutting_satisfied(pred: &ProbePredicate, stub_events: &[StubEvent]) -> // outcome stdout* rather than stub events; evaluated separately // via [`stdout_template_equals`] in [`oracle_fired_with_stubs`]. ProbePredicate::TemplateEvalEqual { .. } => true, + // XxeEntityExpanded is cross-cutting against the *probe log* + // rather than stub events; evaluated separately in + // [`probes_satisfy_xxe`] below. + ProbePredicate::XxeEntityExpanded { .. } => true, _ => true, } } @@ -452,6 +493,15 @@ fn probes_satisfy_deserialize(probes: &[SinkProbe], require_invoked: bool) -> bo }) } +/// True when at least one drained probe is a [`ProbeKind::Xxe`] +/// record matching `require_expanded`. +fn probes_satisfy_xxe(probes: &[SinkProbe], require_expanded: bool) -> bool { + probes.iter().any(|p| match p.kind { + ProbeKind::Xxe { entity_expanded } => entity_expanded == require_expanded, + _ => false, + }) +} + /// Returns true when `probe` satisfies *every* predicate in `preds`. /// An empty predicate slice satisfies vacuously — a payload that wants /// "any probe at all" can ship an empty predicate set. @@ -483,7 +533,8 @@ fn probe_satisfies_one(probe: &SinkProbe, pred: &ProbePredicate) -> bool { // [`oracle_fired_with_stubs`] handles them via the partition path. ProbePredicate::StubEventMatches { .. } | ProbePredicate::DeserializeGadgetInvoked { .. } - | ProbePredicate::TemplateEvalEqual { .. } => true, + | ProbePredicate::TemplateEvalEqual { .. } + | ProbePredicate::XxeEntityExpanded { .. 
} => true, } } @@ -505,7 +556,7 @@ fn contains_subslice(hay: &[u8], needle: &[u8]) -> bool { pub fn probe_crash_signal(probe: &SinkProbe) -> Option { match probe.kind { ProbeKind::Crash { signal } => Some(signal), - ProbeKind::Normal | ProbeKind::Deserialize { .. } => None, + ProbeKind::Normal | ProbeKind::Deserialize { .. } | ProbeKind::Xxe { .. } => None, } } diff --git a/src/dynamic/probe.rs b/src/dynamic/probe.rs index 13172781..34ae73ba 100644 --- a/src/dynamic/probe.rs +++ b/src/dynamic/probe.rs @@ -139,6 +139,23 @@ pub enum ProbeKind { /// executed before the shim aborted the chain. gadget_chain_invoked: bool, }, + /// Phase 05 (Track J.3) XXE-sink observation. Stamped by the + /// per-language XML harness shim when the instrumented parser + /// (`DocumentBuilder.parse`, `lxml.etree.XMLParser`, + /// `simplexml_load_string` under `libxml_disable_entity_loader(false)`, + /// `encoding/xml.Decoder` with `Strict: false`, Ruby `REXML` / + /// `Nokogiri::XML`) consumes a payload carrying a `` + /// declaration that the parser then expands inside the document + /// body. `entity_expanded` is `true` when the entity body was + /// substituted into the parsed tree (the differential rule's + /// proof that XXE expansion actually fired) and `false` when the + /// parser refused the doctype / external resolution (the benign + /// `disallow-doctype-decl` control). + Xxe { + /// `true` iff the parser substituted the entity body into the + /// parsed XML output. + entity_expanded: bool, + }, } impl Default for ProbeKind { diff --git a/src/dynamic/telemetry.rs b/src/dynamic/telemetry.rs index ef06bf13..199f7d87 100644 --- a/src/dynamic/telemetry.rs +++ b/src/dynamic/telemetry.rs @@ -60,7 +60,7 @@ pub const NYX_VERSION: &str = env!("CARGO_PKG_VERSION"); /// [`crate::dynamic::corpus::CORPUS_VERSION`]; the compile-time assertion /// below + the [`corpus_version_const_matches_corpus_module`] runtime test /// jointly guard drift. 
-pub const CORPUS_VERSION: &str = "8"; +pub const CORPUS_VERSION: &str = "9"; /// Compile-time guard that pins [`CORPUS_VERSION`] (this module) to the /// textual form of [`crate::dynamic::corpus::CORPUS_VERSION`]. Bumping the diff --git a/tests/dynamic_fixtures/xxe/go/benign.go b/tests/dynamic_fixtures/xxe/go/benign.go new file mode 100644 index 00000000..f513b59e --- /dev/null +++ b/tests/dynamic_fixtures/xxe/go/benign.go @@ -0,0 +1,25 @@ +// Phase 05 (Track J.3) — Go XXE benign fixture. +// +// Same parser surface as `vuln.go` but `Strict` is left at the +// default `true`, so the doctype is rejected and no entity body is +// substituted. +package benign + +import ( + "bytes" + "encoding/xml" +) + +type Data struct { + XMLName xml.Name `xml:"data"` + Value string `xml:",chardata"` +} + +func Run(body string) (*Data, error) { + d := xml.NewDecoder(bytes.NewReader([]byte(body))) + out := &Data{} + if err := d.Decode(out); err != nil { + return nil, err + } + return out, nil +} diff --git a/tests/dynamic_fixtures/xxe/go/vuln.go b/tests/dynamic_fixtures/xxe/go/vuln.go new file mode 100644 index 00000000..31505251 --- /dev/null +++ b/tests/dynamic_fixtures/xxe/go/vuln.go @@ -0,0 +1,27 @@ +// Phase 05 (Track J.3) — Go XXE vuln fixture. +// +// The function builds an `encoding/xml.Decoder` against the attacker +// payload with `Strict: false` so the doctype is parsed and any +// `` in the payload is resolved and +// substituted into element values. 
+package vuln + +import ( + "bytes" + "encoding/xml" +) + +type Data struct { + XMLName xml.Name `xml:"data"` + Value string `xml:",chardata"` +} + +func Run(body string) (*Data, error) { + d := xml.NewDecoder(bytes.NewReader([]byte(body))) + d.Strict = false + out := &Data{} + if err := d.Decode(out); err != nil { + return nil, err + } + return out, nil +} diff --git a/tests/dynamic_fixtures/xxe/java/benign.java b/tests/dynamic_fixtures/xxe/java/benign.java new file mode 100644 index 00000000..3514cfc1 --- /dev/null +++ b/tests/dynamic_fixtures/xxe/java/benign.java @@ -0,0 +1,18 @@ +// Phase 05 (Track J.3) — Java XXE benign fixture. +// +// Same parser surface as `vuln.java` but the factory is hardened with +// `disallow-doctype-decl`, so the same payload's `` block is +// rejected at parse time and no entity body is substituted. +import java.io.ByteArrayInputStream; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import org.w3c.dom.Document; + +public class Benign { + public static Document run(byte[] payload) throws Exception { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + DocumentBuilder builder = factory.newDocumentBuilder(); + return builder.parse(new ByteArrayInputStream(payload)); + } +} diff --git a/tests/dynamic_fixtures/xxe/java/vuln.java b/tests/dynamic_fixtures/xxe/java/vuln.java new file mode 100644 index 00000000..6e11a1d9 --- /dev/null +++ b/tests/dynamic_fixtures/xxe/java/vuln.java @@ -0,0 +1,19 @@ +// Phase 05 (Track J.3) — Java XXE vuln fixture. +// +// The function feeds attacker bytes to a stock `DocumentBuilderFactory` +// without setting `disallow-doctype-decl` / `XMLConstants.FEATURE_ +// SECURE_PROCESSING`, so any `` +// declaration in the payload is resolved and its body substituted +// into the parsed tree. 
+import java.io.ByteArrayInputStream; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import org.w3c.dom.Document; + +public class Vuln { + public static Document run(byte[] payload) throws Exception { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + DocumentBuilder builder = factory.newDocumentBuilder(); + return builder.parse(new ByteArrayInputStream(payload)); + } +} diff --git a/tests/dynamic_fixtures/xxe/php/benign.php b/tests/dynamic_fixtures/xxe/php/benign.php new file mode 100644 index 00000000..fd8e0249 --- /dev/null +++ b/tests/dynamic_fixtures/xxe/php/benign.php @@ -0,0 +1,10 @@ +` block is rejected and no entity body is substituted. +function run(string $body) { + libxml_disable_entity_loader(true); + return simplexml_load_string($body); +} diff --git a/tests/dynamic_fixtures/xxe/php/vuln.php b/tests/dynamic_fixtures/xxe/php/vuln.php new file mode 100644 index 00000000..0abb6393 --- /dev/null +++ b/tests/dynamic_fixtures/xxe/php/vuln.php @@ -0,0 +1,11 @@ +` in the payload is +// resolved and its body substituted into the parsed document. +function run(string $body) { + libxml_disable_entity_loader(false); + return simplexml_load_string($body, "SimpleXMLElement", LIBXML_NOENT); +} diff --git a/tests/dynamic_fixtures/xxe/python/benign.py b/tests/dynamic_fixtures/xxe/python/benign.py new file mode 100644 index 00000000..f1abe8c9 --- /dev/null +++ b/tests/dynamic_fixtures/xxe/python/benign.py @@ -0,0 +1,12 @@ +"""Phase 05 (Track J.3) — Python XXE benign fixture. + +Same parser surface as `vuln.py` but the parser is configured with +`resolve_entities=False` and `no_network=True`, so the same payload's +`` block is rejected and no entity body is substituted. 
+""" +from lxml import etree + + +def run(body: bytes): + parser = etree.XMLParser(resolve_entities=False, no_network=True) + return etree.fromstring(body, parser=parser) diff --git a/tests/dynamic_fixtures/xxe/python/vuln.py b/tests/dynamic_fixtures/xxe/python/vuln.py new file mode 100644 index 00000000..8237a06c --- /dev/null +++ b/tests/dynamic_fixtures/xxe/python/vuln.py @@ -0,0 +1,13 @@ +"""Phase 05 (Track J.3) — Python XXE vuln fixture. + +The function pulls XML bytes off the request and feeds them straight +to `lxml.etree.XMLParser(resolve_entities=True)`, so any +`<!ENTITY … SYSTEM "…">` in the payload is resolved and its +body substituted into the parsed tree. +""" +from lxml import etree + + +def run(body: bytes): + parser = etree.XMLParser(resolve_entities=True) + return etree.fromstring(body, parser=parser) diff --git a/tests/dynamic_fixtures/xxe/ruby/benign.rb b/tests/dynamic_fixtures/xxe/ruby/benign.rb new file mode 100644 index 00000000..406e76f6 --- /dev/null +++ b/tests/dynamic_fixtures/xxe/ruby/benign.rb @@ -0,0 +1,11 @@ +# Phase 05 (Track J.3) — Ruby XXE benign fixture. +# +# Same parser surface as `vuln.rb` but the document is built under +# `REXML::Document::entity_expansion_limit = 0`, so the same payload's +# `<!DOCTYPE>` block triggers no expansion. +require 'rexml/document' + +def run(body) + REXML::Document.entity_expansion_limit = 0 + REXML::Document.new(body) +end diff --git a/tests/dynamic_fixtures/xxe/ruby/vuln.rb b/tests/dynamic_fixtures/xxe/ruby/vuln.rb new file mode 100644 index 00000000..fea802ac --- /dev/null +++ b/tests/dynamic_fixtures/xxe/ruby/vuln.rb @@ -0,0 +1,11 @@ +# Phase 05 (Track J.3) — Ruby XXE vuln fixture. +# +# The function feeds attacker XML straight to `REXML::Document.new` +# without disabling entity expansion, so any `<!ENTITY … +# SYSTEM "…">` in the payload is resolved and its body substituted +# into the parsed document.
+require 'rexml/document' + +def run(body) + REXML::Document.new(body) +end diff --git a/tests/xxe_corpus.rs b/tests/xxe_corpus.rs new file mode 100644 index 00000000..2c5a0c7e --- /dev/null +++ b/tests/xxe_corpus.rs @@ -0,0 +1,294 @@ +//! Phase 05 (Track J.3) — XXE corpus acceptance. +//! +//! Asserts the new cap end-to-end: corpus slices register per-engine +//! vuln/benign pairs for Java / Python / PHP / Ruby / Go, the +//! lang-aware resolver pairs them inside the correct slice, the +//! per-language harness emitters splice in the synthetic XML parser + +//! entity-expansion probe + sink-hit sentinel, and the framework +//! adapters fire on the canonical sink call. +//! +//! `cargo nextest run --features dynamic --test xxe_corpus`. + +#![cfg(feature = "dynamic")] + +use nyx_scanner::dynamic::corpus::{ + audit_marker_collisions, benign_payload_for_lang, payloads_for_lang, + resolve_benign_control_lang, Oracle, +}; +use nyx_scanner::dynamic::framework::registry::adapters_for; +use nyx_scanner::dynamic::lang; +use nyx_scanner::dynamic::oracle::ProbePredicate; +use nyx_scanner::dynamic::probe::ProbeKind; +use nyx_scanner::dynamic::spec::{EntryKind, HarnessSpec, PayloadSlot}; +use nyx_scanner::labels::Cap; +use nyx_scanner::summary::FuncSummary; +use nyx_scanner::symbol::Lang; + +const LANGS: &[Lang] = &[Lang::Java, Lang::Python, Lang::Php, Lang::Ruby, Lang::Go]; + +fn make_spec(lang: Lang, entry_file: &str, entry_name: &str) -> HarnessSpec { + HarnessSpec { + finding_id: "phase05test0001".into(), + entry_file: entry_file.into(), + entry_name: entry_name.into(), + entry_kind: EntryKind::Function, + lang, + toolchain_id: "phase05".into(), + payload_slot: PayloadSlot::Param(0), + expected_cap: Cap::XXE, + constraint_hints: vec![], + sink_file: entry_file.into(), + sink_line: 1, + spec_hash: "phase05test0001".into(), + derivation: nyx_scanner::dynamic::spec::SpecDerivationStrategy::FromFlowSteps, + stubs_required: vec![], + framework: None, + } +} + +#[test] +fn 
corpus_registers_xxe_for_every_supported_lang() { + for lang in LANGS { + let slice = payloads_for_lang(Cap::XXE, *lang); + assert!(!slice.is_empty(), "XXE has no payloads for {lang:?}"); + let has_vuln = slice.iter().any(|p| !p.is_benign); + let has_benign = slice.iter().any(|p| p.is_benign); + assert!(has_vuln, "{lang:?} XXE missing vuln payload"); + assert!(has_benign, "{lang:?} XXE missing benign control"); + } +} + +#[test] +fn xxe_unsupported_caps_unchanged_for_other_langs() { + // Phase 05 only fills Java / Python / PHP / Ruby / Go — Rust / C + // / Cpp / JS / TS stay empty. + for lang in [ + Lang::Rust, + Lang::C, + Lang::Cpp, + Lang::JavaScript, + Lang::TypeScript, + ] { + assert!( + payloads_for_lang(Cap::XXE, lang).is_empty(), + "unexpected XXE payloads registered for {lang:?}", + ); + } +} + +#[test] +fn benign_control_resolves_within_lang_slice() { + for lang in LANGS { + let slice = payloads_for_lang(Cap::XXE, *lang); + let vuln = slice.iter().find(|p| !p.is_benign).unwrap(); + let resolved = + resolve_benign_control_lang(vuln, Cap::XXE, *lang).expect("paired control"); + assert!(resolved.is_benign); + let direct = benign_payload_for_lang(Cap::XXE, *lang).unwrap(); + assert_eq!(direct.label, resolved.label); + } +} + +#[test] +fn payload_oracle_carries_xxe_entity_expanded_predicate() { + for lang in LANGS { + let slice = payloads_for_lang(Cap::XXE, *lang); + let vuln = slice.iter().find(|p| !p.is_benign).unwrap(); + match &vuln.oracle { + Oracle::SinkProbe { predicates } => { + assert!( + predicates.iter().any(|p| matches!( + p, + ProbePredicate::XxeEntityExpanded { require_expanded: true } + )), + "{lang:?} vuln payload missing XxeEntityExpanded{{require_expanded:true}}", + ); + } + other => panic!("expected SinkProbe oracle for {lang:?}, got {other:?}"), + } + } +} + +#[test] +fn vuln_payload_bytes_contain_doctype_entity_declaration() { + // The whole differential rule rests on the vuln payload carrying + // an `<!ENTITY …>` decl and the benign control NOT +
// carrying one — pin both invariants so a future corpus tweak + // does not silently break the oracle. + for lang in LANGS { + let slice = payloads_for_lang(Cap::XXE, *lang); + let vuln = slice.iter().find(|p| !p.is_benign).unwrap(); + let benign = slice.iter().find(|p| p.is_benign).unwrap(); + let vuln_text = std::str::from_utf8(vuln.bytes).unwrap(); + let benign_text = std::str::from_utf8(benign.bytes).unwrap(); + assert!( + vuln_text.contains(" tree_sitter::Language { + match lang { + Lang::Java => tree_sitter::Language::from(tree_sitter_java::LANGUAGE), + Lang::Python => tree_sitter::Language::from(tree_sitter_python::LANGUAGE), + Lang::Php => tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP), + Lang::Ruby => tree_sitter::Language::from(tree_sitter_ruby::LANGUAGE), + Lang::Go => tree_sitter::Language::from(tree_sitter_go::LANGUAGE), + other => panic!("unsupported test lang {other:?}"), + } +} + +fn slug(lang: Lang) -> &'static str { + match lang { + Lang::Java => "java", + Lang::Python => "python", + Lang::Php => "php", + Lang::Ruby => "ruby", + Lang::Go => "go", + _ => "other", + } +}