[pitboss] phase 03: Track J.1 + Track L.1 — DESERIALIZE corpus + Java/Python/PHP/Ruby adapters

This commit is contained in:
pitboss 2026-05-17 16:37:20 -05:00
parent 01fcaab310
commit 9dc60b51c0
33 changed files with 1625 additions and 53 deletions

View file

@ -0,0 +1,97 @@
//! Java [`super::super::FrameworkAdapter`] matching deserialization sinks.
//!
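//! Fires when a callee's last name segment is `readObject`, `fromXML`,
//! or `deserialize` (the call graph normaliser drops the receiver, so
//! `ObjectInputStream.readObject` and `XMLDecoder.readObject` both
//! reduce to `readObject`), or when the file's bytes mention
//! `ObjectInputStream` or `XMLDecoder` anywhere.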
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Unit-struct adapter; its `FrameworkAdapter` impl in this file reports `Lang::Java`.
pub struct JavaDeserializeAdapter;
/// Adapter identifier: returned by `name()` and copied into each binding's `adapter` field.
const ADAPTER_NAME: &str = "java-deserialize";
/// Returns `true` when the final `.`-separated segment of `name` is one of
/// the recognised Java deserialization method names.
///
/// A name without any `.` is compared as a whole.
fn callee_is_java_deserialize(name: &str) -> bool {
    const SINK_METHODS: [&str; 3] = ["readObject", "fromXML", "deserialize"];
    // `rsplit` always yields at least one item, so the fallback is never hit.
    let method = name.rsplit('.').next().unwrap_or(name);
    SINK_METHODS.contains(&method)
}
impl FrameworkAdapter for JavaDeserializeAdapter {
    /// Stable identifier of this adapter.
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// Language whose functions this adapter inspects.
    fn lang(&self) -> Lang {
        Lang::Java
    }

    /// Produces a function-kind binding when either a callee of `summary`
    /// satisfies [`callee_is_java_deserialize`] or `file_bytes` contains
    /// `ObjectInputStream` / `XMLDecoder` anywhere; otherwise `None`.
    ///
    /// The AST node is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        // Class names whose mere presence in the file triggers the adapter.
        const SOURCE_MARKERS: [&[u8]; 2] = [b"ObjectInputStream", b"XMLDecoder"];

        let fires = super::any_callee_matches(summary, callee_is_java_deserialize)
            || SOURCE_MARKERS
                .iter()
                .any(|&marker| file_bytes.windows(marker.len()).any(|w| w == marker));
        if !fires {
            return None;
        }

        Some(FrameworkBinding {
            adapter: ADAPTER_NAME.to_owned(),
            kind: EntryKind::Function,
            route: None,
            request_params: Vec::new(),
            response_writer: None,
            middleware: Vec::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter Java grammar.
    fn parse_java(src: &[u8]) -> tree_sitter::Tree {
        let grammar = tree_sitter::Language::from(tree_sitter_java::LANGUAGE);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&grammar).unwrap();
        parser.parse(src, None).unwrap()
    }

    /// Summary for a function named `run` with every other field defaulted.
    fn run_summary() -> FuncSummary {
        FuncSummary {
            name: "run".into(),
            ..Default::default()
        }
    }

    #[test]
    fn fires_when_source_imports_object_input_stream() {
        let src: &[u8] = b"import java.io.ObjectInputStream;\npublic class V { public static void run(byte[] b) {} }\n";
        let parsed = parse_java(src);
        let func = run_summary();

        let detected = JavaDeserializeAdapter.detect(&func, parsed.root_node(), src);
        let binding = detected.expect("must fire on ObjectInputStream source");

        assert_eq!(binding.adapter, ADAPTER_NAME);
        assert_eq!(binding.kind, EntryKind::Function);
    }

    #[test]
    fn skips_plain_function() {
        let src: &[u8] =
            b"public class V { public static void run(String b) { System.out.println(b); } }\n";
        let parsed = parse_java(src);
        let func = run_summary();

        let detected = JavaDeserializeAdapter.detect(&func, parsed.root_node(), src);
        assert!(detected.is_none());
    }
}

View file

@ -0,0 +1,30 @@
//! Concrete [`super::FrameworkAdapter`] implementations.
//!
//! Phase 03 (Track J.1) lands the first four adapters — one per
//! language carrying the new `Cap::DESERIALIZE` corpus. Each adapter
//! detects the language's canonical deserialization sink inside a
//! function body and stamps a [`super::FrameworkBinding`] with
//! [`crate::evidence::EntryKind::Function`]. Track L.1+ will register
//! the route / framework adapters; the per-cap sink adapters live here
//! so the per-language verticals can ship independently.
pub mod java_deserialize;
pub mod php_unserialize;
pub mod python_pickle;
pub mod ruby_marshal;
pub use java_deserialize::JavaDeserializeAdapter;
pub use php_unserialize::PhpUnserializeAdapter;
pub use python_pickle::PythonPickleAdapter;
pub use ruby_marshal::RubyMarshalAdapter;
/// Reports whether `predicate` accepts the name of at least one entry in
/// `summary.callees`.
///
/// Returns `false` when the callee list is empty. Evaluation stops at the
/// first accepted name.
fn any_callee_matches(
    summary: &crate::summary::FuncSummary,
    predicate: impl Fn(&str) -> bool,
) -> bool {
    for callee in summary.callees.iter() {
        if predicate(callee.name.as_str()) {
            return true;
        }
    }
    false
}

View file

@ -0,0 +1,88 @@
//! PHP [`super::super::FrameworkAdapter`] matching `unserialize` sinks.
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Unit-struct adapter; its `FrameworkAdapter` impl in this file reports `Lang::Php`.
pub struct PhpUnserializeAdapter;
/// Adapter identifier: returned by `name()` and copied into each binding's `adapter` field.
const ADAPTER_NAME: &str = "php-unserialize";
/// Returns `true` when `name` refers to PHP's `unserialize`.
///
/// Any namespace prefix (`Foo\Bar\`) and any `Class::` qualifier are
/// stripped before comparing. The comparison ignores ASCII case because
/// PHP resolves function names case-insensitively, so `Unserialize($x)`
/// and `UNSERIALIZE($x)` reach the same sink as `unserialize($x)`.
fn callee_is_php_deserialize(name: &str) -> bool {
    // `rsplit` always yields at least one item, so the fallbacks are never hit.
    let unqualified = name.rsplit('\\').next().unwrap_or(name);
    let bare = unqualified.rsplit("::").next().unwrap_or(unqualified);
    bare.eq_ignore_ascii_case("unserialize")
}
impl FrameworkAdapter for PhpUnserializeAdapter {
    /// Stable identifier of this adapter.
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// Language whose functions this adapter inspects.
    fn lang(&self) -> Lang {
        Lang::Php
    }

    /// Produces a function-kind binding when either a callee of `summary`
    /// satisfies [`callee_is_php_deserialize`] or `file_bytes` contains
    /// `unserialize` anywhere (in any ASCII case); otherwise `None`.
    ///
    /// The AST node is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        const NEEDLE: &[u8] = b"unserialize";

        let matches_call = super::any_callee_matches(summary, callee_is_php_deserialize);
        // PHP function names are case-insensitive, so the raw byte scan must
        // not miss spellings such as `Unserialize(` or `UNSERIALIZE(`.
        let matches_source = file_bytes
            .windows(NEEDLE.len())
            .any(|window| window.eq_ignore_ascii_case(NEEDLE));

        (matches_call || matches_source).then(|| FrameworkBinding {
            adapter: ADAPTER_NAME.to_owned(),
            kind: EntryKind::Function,
            route: None,
            request_params: Vec::new(),
            response_writer: None,
            middleware: Vec::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter PHP grammar.
    fn parse_php(src: &[u8]) -> tree_sitter::Tree {
        let grammar = tree_sitter::Language::from(tree_sitter_php::LANGUAGE_PHP);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&grammar).unwrap();
        parser.parse(src, None).unwrap()
    }

    /// Runs the adapter over `src` using a defaulted summary named `run`.
    fn detect_in(src: &[u8]) -> Option<FrameworkBinding> {
        let parsed = parse_php(src);
        let func = FuncSummary {
            name: "run".into(),
            ..Default::default()
        };
        PhpUnserializeAdapter.detect(&func, parsed.root_node(), src)
    }

    #[test]
    fn fires_when_source_calls_unserialize() {
        let src: &[u8] = b"<?php\nfunction run($blob) { return unserialize($blob); }\n";
        assert!(detect_in(src).is_some());
    }

    #[test]
    fn skips_plain_function() {
        let src: &[u8] = b"<?php\nfunction run($x) { return strtoupper($x); }\n";
        assert!(detect_in(src).is_none());
    }
}

View file

@ -0,0 +1,97 @@
//! Python [`super::super::FrameworkAdapter`] matching pickle / yaml
//! deserialization sinks.
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Unit-struct adapter; its `FrameworkAdapter` impl in this file reports `Lang::Python`.
pub struct PythonPickleAdapter;
/// Adapter identifier: returned by `name()` and copied into each binding's `adapter` field.
const ADAPTER_NAME: &str = "python-pickle";
/// Returns `true` when the final `.`-separated segment of `name` is one of
/// the recognised Python deserialization entry points.
///
/// Only the last segment is compared, so the module or receiver in front of
/// it is ignored. A name without any `.` is compared as a whole.
fn callee_is_python_deserialize(name: &str) -> bool {
    const SINK_NAMES: [&str; 5] = ["loads", "load", "unsafe_load", "Unpickler", "find_class"];
    let attr = match name.rfind('.') {
        // `.` is a single byte, so `dot + 1` is always a char boundary.
        Some(dot) => &name[dot + 1..],
        None => name,
    };
    SINK_NAMES.contains(&attr)
}
impl FrameworkAdapter for PythonPickleAdapter {
    /// Stable identifier of this adapter.
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// Language whose functions this adapter inspects.
    fn lang(&self) -> Lang {
        Lang::Python
    }

    /// Produces a function-kind binding when either a callee of `summary`
    /// satisfies [`callee_is_python_deserialize`] or `file_bytes` contains
    /// `pickle`, `yaml.unsafe_load`, or `yaml.load` anywhere; otherwise
    /// `None`.
    ///
    /// The AST node is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        /// Plain byte-substring search; `needle` must be non-empty.
        fn contains(haystack: &[u8], needle: &[u8]) -> bool {
            haystack.windows(needle.len()).any(|w| w == needle)
        }

        let callee_hit = super::any_callee_matches(summary, callee_is_python_deserialize);
        let source_hit = contains(file_bytes, b"pickle")
            || contains(file_bytes, b"yaml.unsafe_load")
            || contains(file_bytes, b"yaml.load");

        if !callee_hit && !source_hit {
            return None;
        }
        Some(FrameworkBinding {
            adapter: ADAPTER_NAME.to_owned(),
            kind: EntryKind::Function,
            route: None,
            request_params: Vec::new(),
            response_writer: None,
            middleware: Vec::new(),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter Python grammar.
    fn parse_python(src: &[u8]) -> tree_sitter::Tree {
        let grammar = tree_sitter::Language::from(tree_sitter_python::LANGUAGE);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&grammar).unwrap();
        parser.parse(src, None).unwrap()
    }

    /// Runs the adapter over `src` using a defaulted summary named `run`.
    fn detect_in(src: &[u8]) -> Option<FrameworkBinding> {
        let parsed = parse_python(src);
        let func = FuncSummary {
            name: "run".into(),
            ..Default::default()
        };
        PythonPickleAdapter.detect(&func, parsed.root_node(), src)
    }

    #[test]
    fn fires_when_source_imports_pickle() {
        let src: &[u8] = b"import pickle\n\ndef run(blob):\n return pickle.loads(blob)\n";
        assert!(detect_in(src).is_some());
    }

    #[test]
    fn skips_plain_function() {
        let src: &[u8] = b"def run(x):\n return x + 1\n";
        assert!(detect_in(src).is_none());
    }
}

View file

@ -0,0 +1,99 @@
//! Ruby [`super::super::FrameworkAdapter`] matching `Marshal.load` /
//! `YAML.load` deserialization sinks.
use crate::dynamic::framework::{FrameworkAdapter, FrameworkBinding};
use crate::evidence::EntryKind;
use crate::summary::FuncSummary;
use crate::symbol::Lang;
/// Unit-struct adapter; its `FrameworkAdapter` impl in this file reports `Lang::Ruby`.
pub struct RubyMarshalAdapter;
/// Adapter identifier: returned by `name()` and copied into each binding's `adapter` field.
const ADAPTER_NAME: &str = "ruby-marshal";
/// Returns `true` when `name` ends in a recognised loader method and also
/// mentions `Marshal` or `YAML` somewhere in the full name.
///
/// The method is whatever follows the last `.`, further trimmed to whatever
/// follows the last `::` inside that segment.
fn callee_is_ruby_deserialize(name: &str) -> bool {
    const LOADERS: [&str; 4] = ["load", "restore", "unsafe_load", "load_documents"];
    const SINK_MODULES: [&str; 2] = ["Marshal", "YAML"];

    // Without a module mention, a bare `load` is too generic to flag.
    if !SINK_MODULES.iter().any(|module| name.contains(*module)) {
        return false;
    }
    let after_dot = name.rsplit('.').next().unwrap_or(name);
    let method = after_dot.rsplit("::").next().unwrap_or(after_dot);
    LOADERS.contains(&method)
}
impl FrameworkAdapter for RubyMarshalAdapter {
    /// Stable identifier of this adapter.
    fn name(&self) -> &'static str {
        ADAPTER_NAME
    }

    /// Language whose functions this adapter inspects.
    fn lang(&self) -> Lang {
        Lang::Ruby
    }

    /// Produces a function-kind binding when either a callee of `summary`
    /// satisfies [`callee_is_ruby_deserialize`] or `file_bytes` contains one
    /// of `Marshal.load`, `Marshal.restore`, `YAML.load`, `YAML.unsafe_load`
    /// anywhere; otherwise `None`.
    ///
    /// The AST node is not consulted.
    fn detect(
        &self,
        summary: &FuncSummary,
        _ast: tree_sitter::Node<'_>,
        file_bytes: &[u8],
    ) -> Option<FrameworkBinding> {
        // Call spellings whose presence in the file triggers the adapter.
        const SOURCE_MARKERS: [&[u8]; 4] = [
            b"Marshal.load",
            b"Marshal.restore",
            b"YAML.load",
            b"YAML.unsafe_load",
        ];

        let callee_hit = super::any_callee_matches(summary, callee_is_ruby_deserialize);
        let source_hit = SOURCE_MARKERS
            .iter()
            .any(|&marker| file_bytes.windows(marker.len()).any(|w| w == marker));

        match callee_hit || source_hit {
            true => Some(FrameworkBinding {
                adapter: ADAPTER_NAME.to_owned(),
                kind: EntryKind::Function,
                route: None,
                request_params: Vec::new(),
                response_writer: None,
                middleware: Vec::new(),
            }),
            false => None,
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` with the tree-sitter Ruby grammar.
    fn parse_ruby(src: &[u8]) -> tree_sitter::Tree {
        let grammar = tree_sitter::Language::from(tree_sitter_ruby::LANGUAGE);
        let mut parser = tree_sitter::Parser::new();
        parser.set_language(&grammar).unwrap();
        parser.parse(src, None).unwrap()
    }

    /// Runs the adapter over `src` using a defaulted summary named `run`.
    fn detect_in(src: &[u8]) -> Option<FrameworkBinding> {
        let parsed = parse_ruby(src);
        let func = FuncSummary {
            name: "run".into(),
            ..Default::default()
        };
        RubyMarshalAdapter.detect(&func, parsed.root_node(), src)
    }

    #[test]
    fn fires_when_source_calls_marshal_load() {
        let src: &[u8] = b"def run(blob)\n Marshal.load(blob)\nend\n";
        assert!(detect_in(src).is_some());
    }

    #[test]
    fn skips_plain_function() {
        let src: &[u8] = b"def run(x)\n x + 1\nend\n";
        assert!(detect_in(src).is_none());
    }
}

View file

@ -14,6 +14,7 @@
//! phase that adds a new adapter cannot silently re-order an existing
//! match.
pub mod adapters;
pub mod registry;
use crate::evidence::EntryKind;
@ -213,28 +214,32 @@ mod tests {
}
#[test]
fn registry_is_empty_for_every_lang_phase_01() {
// Regression guard: Phase 01 ships the trait + dispatch
// machinery but registers zero adapters. Subsequent Track-L
// phases register concrete adapters per language; this test
// documents the starting baseline so accidental re-ordering
// is caught by `tests/determinism_audit.rs`.
fn registry_baseline_after_phase_03() {
// Phase 03 (Track J.1) registers one deserialize-sink adapter
// per supported language: Java, Python, PHP, Ruby. The other
// languages still carry the Phase-01 empty baseline.
for lang in [Lang::Java, Lang::Python, Lang::Php, Lang::Ruby] {
let registered = registry::adapters_for(lang);
assert_eq!(
registered.len(),
1,
"{:?} must have exactly the J.1 deserialize adapter registered",
lang,
);
assert_eq!(registered[0].lang(), lang);
}
for lang in [
Lang::Rust,
Lang::C,
Lang::Cpp,
Lang::Java,
Lang::Go,
Lang::Php,
Lang::Python,
Lang::Ruby,
Lang::TypeScript,
Lang::JavaScript,
] {
assert!(
registry::adapters_for(lang).is_empty(),
"{:?} starts with zero registered adapters",
lang
"{:?} should still have zero adapters before its Track-L phase",
lang,
);
}
}

View file

@ -38,16 +38,19 @@ pub fn adapters_for(lang: Lang) -> &'static [&'static dyn FrameworkAdapter] {
}
}
// All slices intentionally empty in Phase 01. Later Track-L phases
// register concrete adapters (Flask, Spring, axum, Express, …) into
// the appropriate language slice.
// Phase 03 (Track J.1) registers per-language deserialize-sink
// adapters into the matching language slice. Other Track-L verticals
// add route / framework adapters as they land.
static RUST: &[&dyn FrameworkAdapter] = &[];
static C: &[&dyn FrameworkAdapter] = &[];
static CPP: &[&dyn FrameworkAdapter] = &[];
static JAVA: &[&dyn FrameworkAdapter] = &[];
static JAVA: &[&dyn FrameworkAdapter] =
&[&super::adapters::JavaDeserializeAdapter];
static GO: &[&dyn FrameworkAdapter] = &[];
static PHP: &[&dyn FrameworkAdapter] = &[];
static PYTHON: &[&dyn FrameworkAdapter] = &[];
static RUBY: &[&dyn FrameworkAdapter] = &[];
static PHP: &[&dyn FrameworkAdapter] = &[&super::adapters::PhpUnserializeAdapter];
static PYTHON: &[&dyn FrameworkAdapter] =
&[&super::adapters::PythonPickleAdapter];
static RUBY: &[&dyn FrameworkAdapter] =
&[&super::adapters::RubyMarshalAdapter];
static TYPESCRIPT: &[&dyn FrameworkAdapter] = &[];
static JAVASCRIPT: &[&dyn FrameworkAdapter] = &[];