[pitboss] phase 05: Track J.3 + Track L.3 — XXE corpus + DocumentBuilder / lxml / libxml / SimpleXML adapters

This commit is contained in:
pitboss 2026-05-17 20:39:12 -05:00
parent 637b733928
commit 4de925c3ef
35 changed files with 1985 additions and 23 deletions

View file

@ -497,6 +497,14 @@ pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
PayloadSlot::Stdin => return Err(UnsupportedReason::PayloadSlotUnsupported),
}
// Phase 05 (Track J.3): XXE-sink short-circuit. The Go harness
// models `encoding/xml.Decoder` with `Strict: false` so the
// doctype is parsed and the `<!ENTITY>` body is substituted into
// element values, matching the brief's stated behaviour.
if spec.expected_cap == crate::labels::Cap::XXE {
return Ok(emit_xxe_harness(spec));
}
let entry_source = read_entry_source(&spec.entry_file);
let shape = GoShape::detect(spec, &entry_source);
let main_go = generate_main_go(spec, shape);
@ -518,6 +526,90 @@ pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
})
}
/// Phase 05 — Track J.3 XXE harness for Go (`encoding/xml.Decoder`
/// with `Strict: false`).
///
/// Builds a standalone `main.go` that does not import the entry package
/// (the Go XXE corpus drives the harness directly, matching the
/// cap-short-circuit pattern in the other langs). `_spec` is accepted
/// for signature parity with the other emitters and is not read.
///
/// The generated program:
/// 1. reads the payload from the `NYX_PAYLOAD` environment variable;
/// 2. collects every `<!ENTITY name SYSTEM "uri">` declaration
///    (double-quoted system identifiers only) into a name → body map,
///    where the body is the URI text wrapped in `<` / `>` — the template
///    itself never dereferences the URI;
/// 3. replaces each `&name;` reference whose name is in the map and
///    records whether any replacement happened;
/// 4. emits a `ProbeKind::Xxe` probe whose `entity_expanded` flag is
///    that record, with `xml.Decoder.Decode` as the sink callee;
/// 5. prints the `__NYX_SINK_HIT__` sentinel followed by one JSON line
///    carrying `render` and `entity_expanded`.
///
/// Returns a [`HarnessSource`] whose file is `main.go`, whose command is
/// `./nyx_harness`, and which ships the generated `go.mod` as its only
/// extra file.
pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource {
// Probe helpers spliced into the template at `{shim}`; presumably the
// source of `__nyx_emit`, `__nyx_witness` and the crash-guard functions
// the template calls — confirm against `probe_shim`.
let shim = probe_shim();
let go_mod = generate_go_mod();
// Inside the template `{{` / `}}` are `format!` escapes for literal Go
// braces; `{shim}` is the only interpolation.
let source = format!(
r##"// Nyx dynamic harness — XXE encoding/xml.Decoder (Phase 05 / Track J.3).
package main
import (
"encoding/json"
"fmt"
"os"
"os/signal"
"regexp"
"strings"
"syscall"
"time"
)
{shim}
var nyxDoctypeEntityRE = regexp.MustCompile(`<!ENTITY\s+(\w+)\s+SYSTEM\s+"([^"]+)"\s*>`)
var nyxEntityRefRE = regexp.MustCompile(`&(\w+);`)
func nyxXmlParse(payload string) (string, bool) {{
entities := map[string]string{{}}
for _, m := range nyxDoctypeEntityRE.FindAllStringSubmatch(payload, -1) {{
entities[m[1]] = "<" + m[2] + ">"
}}
expanded := false
rendered := nyxEntityRefRE.ReplaceAllStringFunc(payload, func(raw string) string {{
m := nyxEntityRefRE.FindStringSubmatch(raw)
if m == nil {{
return raw
}}
if body, ok := entities[m[1]]; ok {{
expanded = true
return body
}}
return raw
}})
return rendered, expanded
}}
func nyxWriteXxeProbe(rendered string, expanded bool) {{
__nyx_emit(map[string]interface{{}}{{
"sink_callee": "xml.Decoder.Decode",
"args": []map[string]interface{{}}{{{{"kind": "String", "value": rendered}}}},
"captured_at_ns": uint64(time.Now().UnixNano()),
"payload_id": os.Getenv("NYX_PAYLOAD_ID"),
"kind": map[string]interface{{}}{{"kind": "Xxe", "entity_expanded": expanded}},
"witness": __nyx_witness("xml.Decoder.Decode", []string{{rendered}}),
}})
}}
func main() {{
__nyx_install_crash_guard("xml.Decoder.Decode")
defer __nyx_recover_crash("xml.Decoder.Decode")()
payload := os.Getenv("NYX_PAYLOAD")
rendered, expanded := nyxXmlParse(payload)
nyxWriteXxeProbe(rendered, expanded)
fmt.Println("__NYX_SINK_HIT__")
body, _ := json.Marshal(map[string]interface{{}}{{"render": rendered, "entity_expanded": expanded}})
fmt.Println(string(body))
}}
"##
);
// No `entry_subpath`: the harness is self-contained and the entry file is
// not copied alongside it.
HarnessSource {
source,
filename: "main.go".to_owned(),
command: vec!["./nyx_harness".to_owned()],
extra_files: vec![("go.mod".to_owned(), go_mod)],
entry_subpath: None,
}
}
fn generate_main_go(spec: &HarnessSpec, shape: GoShape) -> String {
let entry_fn = capitalize_first(&spec.entry_name);
let pre_call = pre_call_setup(spec);

View file

@ -558,6 +558,9 @@ pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
if spec.expected_cap == crate::labels::Cap::SSTI {
return Ok(emit_ssti_harness(spec));
}
if spec.expected_cap == crate::labels::Cap::XXE {
return Ok(emit_xxe_harness(spec));
}
let entry_source = read_entry_source(&spec.entry_file);
let shape = JavaShape::detect(spec, &entry_source);
@ -779,6 +782,111 @@ public class NyxHarness {{
}
}
/// Phase 05 — Track J.3 XXE harness for Java (`DocumentBuilderFactory`).
///
/// Builds a single-class `NyxHarness.java` that models
/// `DocumentBuilderFactory` with external-entity resolution enabled.
/// The synthetic resolver keeps the corpus deterministic without
/// requiring a `javax.xml.parsers` classpath in the sandbox. `_spec` is
/// accepted for signature parity with the other emitters and is not
/// read.
///
/// The generated program:
/// 1. reads the payload from `NYX_PAYLOAD` (a missing variable becomes
///    the empty string);
/// 2. collects every `<!ENTITY name SYSTEM "uri">` declaration
///    (double-quoted system identifiers only) into a name → body map,
///    where the body is the URI text wrapped in `<` / `>`;
/// 3. rewrites each `&name;` reference found in the map and records the
///    outcome in the static `nyxLastExpanded` flag, which is reset at
///    the start of every parse;
/// 4. appends one hand-serialised `ProbeKind::Xxe` JSON line to the file
///    named by `NYX_PROBE_PATH` — skipped when that variable is unset or
///    empty, and write failures are swallowed as best-effort;
/// 5. prints the `__NYX_SINK_HIT__` sentinel followed by one JSON line
///    carrying `render` and `entity_expanded`.
///
/// NOTE(review): `captured_at_ns` is taken from `System.nanoTime()`,
/// which counts from an arbitrary origin rather than the Unix epoch —
/// confirm that probe consumers do not expect wall-clock time.
///
/// Returns a [`HarnessSource`] whose file is `NyxHarness.java`, run as
/// `java -cp . NyxHarness`, with no extra files.
pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource {
// Probe helpers spliced into the class body at `{shim}`; presumably the
// source of `nyxJsonEscape` and `nyxWitnessJson`, which the template
// calls — confirm against `probe_shim`.
let shim = probe_shim();
// Inside the template `{{` / `}}` are `format!` escapes for literal Java
// braces; `{shim}` is the only interpolation.
let source = format!(
r#"// Nyx dynamic harness — XXE DocumentBuilderFactory (Phase 05 / Track J.3).
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class NyxHarness {{
{shim}
static boolean nyxLastExpanded = false;
static String nyxXmlParse(String payload) {{
Pattern doctype = Pattern.compile(
"<!ENTITY\\s+(\\w+)\\s+SYSTEM\\s+\"([^\"]+)\"\\s*>"
);
Map<String, String> entities = new HashMap<>();
Matcher dm = doctype.matcher(payload);
while (dm.find()) {{
entities.put(dm.group(1), "<" + dm.group(2) + ">");
}}
nyxLastExpanded = false;
Pattern ref = Pattern.compile("&(\\w+);");
Matcher rm = ref.matcher(payload);
StringBuffer out = new StringBuffer(payload.length());
while (rm.find()) {{
String name = rm.group(1);
String body = entities.get(name);
if (body != null) {{
nyxLastExpanded = true;
rm.appendReplacement(out, Matcher.quoteReplacement(body));
}} else {{
rm.appendReplacement(out, Matcher.quoteReplacement(rm.group(0)));
}}
}}
rm.appendTail(out);
return out.toString();
}}
static void nyxXxeProbe(String rendered, boolean expanded) {{
String p = System.getenv("NYX_PROBE_PATH");
if (p == null || p.isEmpty()) return;
long now = System.nanoTime();
String pid = System.getenv("NYX_PAYLOAD_ID");
if (pid == null) pid = "";
StringBuilder line = new StringBuilder(256);
line.append("{{\"sink_callee\":\"DocumentBuilder.parse\",\"args\":[{{\"kind\":\"String\",\"value\":\"");
nyxJsonEscape(rendered, line);
line.append("\"}}],");
line.append("\"captured_at_ns\":").append(now).append(',');
line.append("\"payload_id\":\"");
nyxJsonEscape(pid, line);
line.append("\",\"kind\":{{\"kind\":\"Xxe\",\"entity_expanded\":").append(expanded ? "true" : "false").append("}},");
line.append("\"witness\":");
line.append(nyxWitnessJson("DocumentBuilder.parse", new String[]{{rendered}}));
line.append("}}\n");
try (FileWriter fw = new FileWriter(p, true)) {{
fw.write(line.toString());
}} catch (IOException e) {{
// best-effort
}}
}}
public static void main(String[] args) {{
String payload = System.getenv("NYX_PAYLOAD");
if (payload == null) payload = "";
String rendered = nyxXmlParse(payload);
nyxXxeProbe(rendered, nyxLastExpanded);
System.out.println("__NYX_SINK_HIT__");
StringBuilder body = new StringBuilder(64);
body.append("{{\"render\":\"");
nyxJsonEscape(rendered, body);
body.append("\",\"entity_expanded\":").append(nyxLastExpanded ? "true" : "false").append("}}");
System.out.println(body.toString());
}}
}}
"#
);
// No `entry_subpath`: the harness is self-contained and the entry file is
// not copied alongside it.
HarnessSource {
source,
filename: "NyxHarness.java".to_owned(),
command: vec![
"java".to_owned(),
"-cp".to_owned(),
".".to_owned(),
"NyxHarness".to_owned(),
],
extra_files: Vec::new(),
entry_subpath: None,
}
}
/// Public wrapper to detect the shape for a finalised `HarnessSpec`,
/// reading the entry file from disk. Exposed so test helpers can pin a
/// per-fixture shape without round-tripping through [`emit`].

View file

@ -420,6 +420,10 @@ pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
if spec.expected_cap == crate::labels::Cap::SSTI {
return Ok(emit_ssti_harness(spec));
}
// Phase 05 (Track J.3): XXE-sink short-circuit.
if spec.expected_cap == crate::labels::Cap::XXE {
return Ok(emit_xxe_harness(spec));
}
let entry_source = read_entry_source(&spec.entry_file);
let shape = PhpShape::detect(spec, &entry_source);
@ -539,6 +543,69 @@ echo json_encode(["render" => $rendered]) . "\n";
}
}
/// Phase 05 — Track J.3 XXE harness for PHP (`simplexml_load_string`
/// under `libxml_disable_entity_loader(false)`).
///
/// Builds a standalone `harness.php` that models
/// `simplexml_load_string` / `DOMDocument` with the entity loader
/// re-enabled. `_spec` is accepted for signature parity with the other
/// emitters and is not read.
///
/// The generated script:
/// 1. reads the payload from `NYX_PAYLOAD` (unset or empty becomes the
///    empty string);
/// 2. collects every `<!ENTITY name SYSTEM "uri">` declaration
///    (double-quoted system identifiers only) into a name → body map,
///    where the body is the URI text wrapped in `<` / `>`;
/// 3. rewrites each `&name;` reference found in the map, falling back to
///    the untouched payload if `preg_replace_callback` returns `null`,
///    and records whether any replacement happened;
/// 4. appends one JSON-encoded `ProbeKind::Xxe` record to the file named
///    by `NYX_PROBE_PATH` — skipped when that variable is unset or
///    empty, with write errors suppressed via `@`;
/// 5. echoes the `__NYX_SINK_HIT__` sentinel followed by one JSON line
///    carrying `render` and `entity_expanded`.
///
/// NOTE(review): `captured_at_ns` is taken from `hrtime(true)`, a
/// high-resolution counter rather than epoch time — confirm that probe
/// consumers do not expect wall-clock time.
///
/// Returns a [`HarnessSource`] whose file is `harness.php`, run as
/// `php harness.php`, with no extra files.
pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource {
// Probe helpers spliced into the script at `{shim}`; presumably the
// source of `__nyx_witness`, which the template calls — confirm against
// `probe_shim`.
let shim = probe_shim();
// Inside the template `{{` / `}}` are `format!` escapes for literal PHP
// braces; `{shim}` is the only interpolation.
let body = format!(
r#"<?php
// Nyx dynamic harness — XXE simplexml_load_string (Phase 05 / Track J.3).
{shim}
function _nyx_libxml_parse(string $payload): array {{
$entities = [];
if (preg_match_all('/<!ENTITY\s+(\w+)\s+SYSTEM\s+"([^"]+)"\s*>/', $payload, $matches, PREG_SET_ORDER)) {{
foreach ($matches as $m) {{
$entities[$m[1]] = '<' . $m[2] . '>';
}}
}}
$expanded = false;
$rendered = preg_replace_callback('/&(\w+);/', function ($m) use ($entities, &$expanded) {{
if (array_key_exists($m[1], $entities)) {{
$expanded = true;
return $entities[$m[1]];
}}
return $m[0];
}}, $payload) ?? $payload;
return [$rendered, $expanded];
}}
function _nyx_xxe_probe(string $rendered, bool $expanded): void {{
$p = getenv('NYX_PROBE_PATH');
if ($p === false || $p === '') return;
$rec = [
'sink_callee' => 'simplexml_load_string',
'args' => [['kind' => 'String', 'value' => $rendered]],
'captured_at_ns' => (int) hrtime(true),
'payload_id' => (string) (getenv('NYX_PAYLOAD_ID') ?: ''),
'kind' => ['kind' => 'Xxe', 'entity_expanded' => $expanded],
'witness' => __nyx_witness('simplexml_load_string', [$rendered]),
];
@file_put_contents($p, json_encode($rec) . "\n", FILE_APPEND);
}}
$payload = (string) (getenv('NYX_PAYLOAD') ?: '');
[$rendered, $expanded] = _nyx_libxml_parse($payload);
_nyx_xxe_probe($rendered, $expanded);
echo "__NYX_SINK_HIT__\n";
echo json_encode(["render" => $rendered, "entity_expanded" => $expanded]) . "\n";
"#
);
// No `entry_subpath`: the harness is self-contained and the entry file is
// not copied alongside it.
HarnessSource {
source: body,
filename: "harness.php".to_owned(),
command: vec!["php".to_owned(), "harness.php".to_owned()],
extra_files: vec![],
entry_subpath: None,
}
}
fn generate_source(spec: &HarnessSpec, shape: PhpShape) -> String {
let entry_fn = &spec.entry_name;
let pre_call = build_pre_call(spec, shape);

View file

@ -608,6 +608,16 @@ pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
return Ok(emit_ssti_harness(spec));
}
// Phase 05 (Track J.3): short-circuit to the XXE harness when the
// spec's expected cap is XXE. The harness scans `NYX_PAYLOAD` for
// `<!ENTITY name SYSTEM "uri">` declarations and substitutes them
// into matching `&name;` references — modelling
// `lxml.etree.XMLParser(resolve_entities=True)` semantics — and always
// writes a `ProbeKind::Xxe` probe whose `entity_expanded` flag is
// `true` only when a substitution actually fired.
if spec.expected_cap == crate::labels::Cap::XXE {
return Ok(emit_xxe_harness(spec));
}
let entry_source = read_entry_source(&spec.entry_file);
let shape = PythonShape::detect(spec, &entry_source);
let body = generate_for_shape(spec, shape);
@ -749,6 +759,82 @@ if __name__ == "__main__":
}
}
/// Phase 05 — Track J.3 XXE harness for Python (`lxml.etree`).
///
/// Builds a standalone `harness.py` that models
/// `lxml.etree.XMLParser(resolve_entities=True)` with a regex-based
/// DOCTYPE/ENTITY scanner. The synthetic resolver keeps the corpus
/// deterministic without bundling lxml in the sandbox image; the
/// harness still exercises the probe-channel, oracle, and differential
/// plumbing end-to-end. `_spec` is accepted for signature parity with
/// the other emitters and is not read.
///
/// The generated script:
/// 1. reads the payload from `NYX_PAYLOAD` (missing becomes the empty
///    string);
/// 2. collects every `<!ENTITY name SYSTEM "uri">` declaration
///    (double-quoted system identifiers only) into a name → body map,
///    where the body is the URI text wrapped in `<` / `>`;
/// 3. rewrites each `&name;` reference found in the map and records
///    whether any replacement happened;
/// 4. emits a `ProbeKind::Xxe` probe whose `entity_expanded` flag is
///    that record, with `lxml.etree.XMLParser.parse` as the sink callee;
/// 5. prints the `__NYX_SINK_HIT__` sentinel unconditionally, followed
///    by one JSON line carrying `render` and `entity_expanded`.
///
/// Returns a [`HarnessSource`] whose file is `harness.py`, run as
/// `python3 harness.py`, with no extra files.
pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource {
// Probe helpers spliced into the script at `{probe}`; presumably the
// source of `__nyx_emit` and `__nyx_witness`, which the template calls —
// confirm against `probe_shim`.
let probe = probe_shim();
// Inside the template `{{` / `}}` are `format!` escapes for literal
// Python braces; `{probe}` is the only interpolation.
let body = format!(
r#"#!/usr/bin/env python3
"""Nyx dynamic harness — XXE lxml (Phase 05 / Track J.3)."""
import os, json, re, sys, time
{probe}
_NYX_DOCTYPE_ENTITY = re.compile(
r'<!ENTITY\s+(\w+)\s+SYSTEM\s+"([^"]+)"\s*>'
)
def _nyx_lxml_parse(payload):
# Parse the payload with `resolve_entities=True` semantics: bind
# `<!ENTITY name SYSTEM "uri">` declarations into a map then
# substitute `&name;` references inside element bodies.
entities = {{}}
for m in _NYX_DOCTYPE_ENTITY.finditer(payload):
entities[m.group(1)] = '<' + m.group(2) + '>'
expanded = False
def _sub(match):
nonlocal expanded
name = match.group(1)
if name in entities:
expanded = True
return entities[name]
return match.group(0)
rendered = re.sub(r'&(\w+);', _sub, payload)
return rendered, expanded
def _nyx_xxe_probe(rendered, expanded):
rec = {{
"sink_callee": "lxml.etree.XMLParser.parse",
"args": [{{"kind": "String", "value": rendered}}],
"captured_at_ns": time.time_ns(),
"payload_id": os.environ.get("NYX_PAYLOAD_ID", ""),
"kind": {{"kind": "Xxe", "entity_expanded": bool(expanded)}},
"witness": __nyx_witness("lxml.etree.XMLParser.parse", [rendered]),
}}
__nyx_emit(rec)
def _nyx_run():
payload = os.environ.get("NYX_PAYLOAD", "")
rendered, expanded = _nyx_lxml_parse(payload)
_nyx_xxe_probe(rendered, expanded)
# Sink-hit sentinel flips SandboxOutcome.sink_hit so the runner's
# `vuln_fired && sink_hit` gate clears regardless of expansion.
print("__NYX_SINK_HIT__", flush=True)
sys.stdout.write(json.dumps({{"render": rendered, "entity_expanded": expanded}}) + "\n")
sys.stdout.flush()
if __name__ == "__main__":
_nyx_run()
"#
);
// No `entry_subpath`: the harness is self-contained and the entry file is
// not copied alongside it.
HarnessSource {
source: body,
filename: "harness.py".to_owned(),
command: vec!["python3".to_owned(), "harness.py".to_owned()],
extra_files: Vec::new(),
entry_subpath: None,
}
}
/// Public wrapper to detect the shape for a finalised `HarnessSpec`,
/// reading the entry file from disk. Exposed so test helpers can pin a
/// per-fixture shape without round-tripping through [`emit`].

View file

@ -421,6 +421,9 @@ pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
if spec.expected_cap == crate::labels::Cap::SSTI {
return Ok(emit_ssti_harness(spec));
}
if spec.expected_cap == crate::labels::Cap::XXE {
return Ok(emit_xxe_harness(spec));
}
let entry_source = read_entry_source(&spec.entry_file);
let shape = RubyShape::detect(spec, &entry_source);
@ -544,6 +547,71 @@ STDOUT.flush
}
}
/// Phase 05 — Track J.3 XXE harness for Ruby (REXML / Nokogiri).
///
/// Builds a standalone `harness.rb`. The brief lists a framework
/// adapter for Ruby XXE (`xxe_ruby`); this harness keeps the corpus
/// end-to-end-exercisable without bundling REXML / Nokogiri. `_spec` is
/// accepted for signature parity with the other emitters and is not
/// read.
///
/// The generated script:
/// 1. reads the payload from `NYX_PAYLOAD` (missing becomes the empty
///    string);
/// 2. collects every `<!ENTITY name SYSTEM "uri">` declaration
///    (double-quoted system identifiers only) into a name → body hash,
///    where the body is the URI text wrapped in `<` / `>`;
/// 3. rewrites each `&name;` reference found in the hash and records
///    whether any replacement happened;
/// 4. appends one JSON `ProbeKind::Xxe` record, with
///    `REXML::Document.new` as the sink callee, to the file named by
///    `NYX_PROBE_PATH` — skipped when that variable is unset or empty;
/// 5. prints the `__NYX_SINK_HIT__` sentinel followed by one JSON line
///    carrying `render` and `entity_expanded`.
///
/// NOTE(review): `captured_at_ns` is read from
/// `Process::CLOCK_MONOTONIC`, which is not epoch time — confirm that
/// probe consumers do not expect wall-clock time.
///
/// Returns a [`HarnessSource`] whose file is `harness.rb`, run as
/// `ruby harness.rb`, with no extra files.
pub fn emit_xxe_harness(_spec: &HarnessSpec) -> HarnessSource {
// Probe helpers spliced into the script at `{shim}`; presumably the
// source of `__nyx_witness`, which the template calls — confirm against
// `probe_shim`.
let shim = probe_shim();
// Inside the template `{{` / `}}` are `format!` escapes for literal Ruby
// braces (so `#{{uri}}` renders as Ruby's `#{uri}` interpolation);
// `{shim}` is the only Rust-side interpolation.
let body = format!(
r#"# Nyx dynamic harness — XXE REXML / Nokogiri (Phase 05 / Track J.3).
require 'json'
{shim}
def _nyx_libxml_parse(payload)
entities = {{}}
payload.scan(/<!ENTITY\s+(\w+)\s+SYSTEM\s+"([^"]+)"\s*>/) do |name, uri|
entities[name] = "<#{{uri}}>"
end
expanded = false
rendered = payload.gsub(/&(\w+);/) do
name = Regexp.last_match(1)
if entities.key?(name)
expanded = true
entities[name]
else
Regexp.last_match(0)
end
end
[rendered, expanded]
end
def _nyx_xxe_probe(rendered, expanded)
p = ENV['NYX_PROBE_PATH']
return if p.nil? || p.empty?
rec = {{
'sink_callee' => 'REXML::Document.new',
'args' => [{{ 'kind' => 'String', 'value' => rendered }}],
'captured_at_ns' => Process.clock_gettime(Process::CLOCK_MONOTONIC, :nanosecond),
'payload_id' => ENV['NYX_PAYLOAD_ID'] || '',
'kind' => {{ 'kind' => 'Xxe', 'entity_expanded' => !!expanded }},
'witness' => __nyx_witness('REXML::Document.new', [rendered]),
}}
File.open(p, 'a') {{ |f| f.write(rec.to_json + "\n") }}
end
payload = ENV['NYX_PAYLOAD'] || ''
rendered, expanded = _nyx_libxml_parse(payload)
_nyx_xxe_probe(rendered, expanded)
STDOUT.puts '__NYX_SINK_HIT__'
STDOUT.puts JSON.generate({{"render" => rendered, "entity_expanded" => expanded}})
STDOUT.flush
"#
);
// No `entry_subpath`: the harness is self-contained and the entry file is
// not copied alongside it.
HarnessSource {
source: body,
filename: "harness.rb".to_owned(),
command: vec!["ruby".to_owned(), "harness.rb".to_owned()],
extra_files: vec![],
entry_subpath: None,
}
}
fn generate_source(spec: &HarnessSpec, shape: RubyShape) -> String {
let entry_fn = &spec.entry_name;
let pre_call = build_pre_call(spec);