[pitboss] phase 09: Track D.1 + D.2 — Project dependency capture + workdir staging

This commit is contained in:
pitboss 2026-05-14 13:40:47 -05:00
parent a7fbc37c21
commit 2f01894353
16 changed files with 2009 additions and 0 deletions

1112
src/dynamic/environment.rs Normal file

File diff suppressed because it is too large Load diff

View file

@ -24,6 +24,7 @@
//!
//! Build container: `nyx-build-go:{toolchain_id}` (deferred; §19.1).
use crate::dynamic::environment::{Environment, RuntimeArtifacts};
use crate::dynamic::lang::{HarnessSource, LangEmitter};
use crate::dynamic::spec::{EntryKind, HarnessSpec, PayloadSlot};
use crate::evidence::UnsupportedReason;
@ -51,6 +52,59 @@ impl LangEmitter for GoEmitter {
"go emitter supports {SUPPORTED:?}; this finding's enclosing context is `EntryKind::{attempted}` — Track B will add net/http, gin, flag.Parse shapes in phase 15"
)
}
/// Phase 09 — Track D.2: runtime-manifest hook for the Go emitter.
/// Delegates to [`materialize_go`], which builds the `go.mod` artifact.
fn materialize_runtime(&self, env: &Environment) -> RuntimeArtifacts {
materialize_go(env)
}
}
/// Phase 09 — Track D.2: build the `go.mod` artifact for the Go harness.
///
/// The module is named `nyx_harness`. The `go` directive carries the first
/// two dot-separated segments of the captured toolchain version string and
/// falls back to `1.22` when that yields nothing. Every direct dep that
/// [`is_go_stdlib`] does not classify as standard library is listed exactly
/// once, in sorted order, inside a `require` block; the block is omitted
/// when no such dep exists.
pub fn materialize_go(env: &Environment) -> RuntimeArtifacts {
    // `major.minor` only — patch and any further segments are dropped.
    let mut go_version = env
        .toolchain
        .version_string
        .split('.')
        .take(2)
        .collect::<Vec<_>>()
        .join(".");
    if go_version.is_empty() {
        go_version.push_str("1.22");
    }

    // An ordered set de-duplicates and sorts in a single pass.
    let third_party: std::collections::BTreeSet<&str> = env
        .direct_deps
        .iter()
        .map(|dep| dep.as_str())
        .filter(|dep| !is_go_stdlib(dep))
        .collect();

    let mut manifest = String::with_capacity(128);
    manifest.push_str("module nyx_harness\n\n");
    manifest.push_str("go ");
    manifest.push_str(&go_version);
    manifest.push('\n');
    if !third_party.is_empty() {
        manifest.push_str("\nrequire (\n");
        for module in &third_party {
            manifest.push('\t');
            manifest.push_str(module);
            manifest.push_str(" latest\n");
        }
        manifest.push_str(")\n");
    }

    let mut artifacts = RuntimeArtifacts::new();
    artifacts.push("go.mod", manifest);
    artifacts
}
/// Heuristic stdlib check for a Go import path: the path counts as standard
/// library when its first `/`-separated segment contains no `.` (i.e. it
/// does not look like a host name such as `github.com`).
fn is_go_stdlib(path: &str) -> bool {
    let head = match path.find('/') {
        Some(slash) => &path[..slash],
        None => path,
    };
    head.find('.').is_none()
}
/// Source of the `__nyx_probe` shim for the Go harness (Phase 06 —

View file

@ -26,6 +26,7 @@
//!
//! Build container: `nyx-build-java:{toolchain_id}` (deferred; §19.1).
use crate::dynamic::environment::{Environment, RuntimeArtifacts};
use crate::dynamic::lang::{HarnessSource, LangEmitter};
use crate::dynamic::spec::{EntryKind, HarnessSpec, PayloadSlot};
use crate::evidence::UnsupportedReason;
@ -53,6 +54,79 @@ impl LangEmitter for JavaEmitter {
"java emitter supports {SUPPORTED:?}; this finding's enclosing context is `EntryKind::{attempted}` — Track B will add servlet / Spring / Quarkus shapes in phase 14"
)
}
/// Phase 09 — Track D.2: runtime-manifest hook for the Java emitter.
/// Delegates to [`materialize_java`], which builds the `pom.xml` artifact.
fn materialize_runtime(&self, env: &Environment) -> RuntimeArtifacts {
materialize_java(env)
}
}
/// Phase 09 — Track D.2: synthesise a minimal `pom.xml` that records the
/// Java feature release and lists the direct dep top-level packages as
/// dependencies.
///
/// The compiler source/target level is derived from the captured toolchain
/// version string:
/// - modern scheme (`17.0.2`) → first segment (`17`);
/// - legacy scheme (`1.8.0_292`) → second segment (`8`);
/// - empty / unusable string → `21`.
///
/// Each direct dep that [`is_java_stdlib`] does not filter out maps to
/// `<groupId>{pkg}</groupId>` with an artifact id matching the package
/// name and version `LATEST`, emitted once and in sorted order. This is a
/// best-effort stub; Phase 10 corpus expansion will introduce a known-good
/// group→artifact registry.
pub fn materialize_java(env: &Environment) -> RuntimeArtifacts {
    let mut artifacts = RuntimeArtifacts::new();

    // `str::split` always yields at least one (possibly empty) segment, so
    // the fallback has to key off emptiness — `next()` is never `None`.
    let mut segments = env.toolchain.version_string.split('.');
    let java_version = match (segments.next(), segments.next()) {
        // Legacy `1.N` numbering: the feature release is the second segment.
        (Some("1"), Some(minor)) if !minor.is_empty() => minor,
        (Some(major), _) if !major.is_empty() => major,
        _ => "21",
    }
    .to_owned();

    // An ordered set de-duplicates and sorts in a single pass.
    let deps: std::collections::BTreeSet<&str> = env
        .direct_deps
        .iter()
        .map(|dep| dep.as_str())
        .filter(|dep| !is_java_stdlib(dep))
        .collect();

    let mut body = String::with_capacity(256);
    body.push_str("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
    body.push_str("<project xmlns=\"http://maven.apache.org/POM/4.0.0\">\n");
    body.push_str(" <modelVersion>4.0.0</modelVersion>\n");
    body.push_str(" <groupId>nyx</groupId>\n");
    body.push_str(" <artifactId>harness</artifactId>\n");
    body.push_str(" <version>0.0.1</version>\n");
    body.push_str(" <properties>\n");
    body.push_str(&format!(
        " <maven.compiler.source>{java_version}</maven.compiler.source>\n"
    ));
    body.push_str(&format!(
        " <maven.compiler.target>{java_version}</maven.compiler.target>\n"
    ));
    body.push_str(" </properties>\n");
    if !deps.is_empty() {
        body.push_str(" <dependencies>\n");
        for d in &deps {
            body.push_str(" <dependency>\n");
            body.push_str(&format!(" <groupId>{d}</groupId>\n"));
            body.push_str(&format!(" <artifactId>{d}</artifactId>\n"));
            body.push_str(" <version>LATEST</version>\n");
            body.push_str(" </dependency>\n");
        }
        body.push_str(" </dependencies>\n");
    }
    body.push_str("</project>\n");
    artifacts.push("pom.xml", body);
    artifacts
}
/// Returns `true` for top-level package segments that must not be turned
/// into a Maven dependency.
///
/// `java` / `javax` / `sun` are JDK namespaces. `com` / `org` / `jakarta`
/// are skipped as well: the import extractor only keeps the first segment,
/// and a bare `com` or `org` covers both JDK (`com.sun`) and third-party
/// (`com.google`, `org.springframework`) code, so no meaningful artifact
/// can be named from it until the Phase 10 registry lands.
fn is_java_stdlib(name: &str) -> bool {
    const SKIPPED_ROOTS: [&str; 6] = ["java", "javax", "sun", "com", "org", "jakarta"];
    SKIPPED_ROOTS.iter().any(|&root| root == name)
}
/// Source of the `__nyx_probe` shim for the Java harness (Phase 06 —

View file

@ -19,9 +19,11 @@
//! Build: no compilation step. Command is `node harness.js`.
//! Build container: `nyx-build-node:{toolchain_id}` (deferred; §19.1).
use crate::dynamic::environment::{Environment, RuntimeArtifacts};
use crate::dynamic::lang::{HarnessSource, LangEmitter};
use crate::dynamic::spec::{EntryKind, HarnessSpec, PayloadSlot};
use crate::evidence::UnsupportedReason;
use crate::utils::project::DetectedFramework;
/// Zero-sized [`LangEmitter`] handle for JavaScript / TypeScript (one
/// emitter, both langs share the same Node.js dispatch). Method bodies
@ -47,6 +49,96 @@ impl LangEmitter for JavaScriptEmitter {
"javascript / typescript emitter supports {SUPPORTED:?}; this finding's enclosing context is `EntryKind::{attempted}` — Track B will add Express / Koa / Next shapes in phase 13"
)
}
/// Phase 09 — Track D.2: runtime-manifest hook for the JavaScript emitter.
/// Delegates to [`materialize_node`], which builds the `package.json`
/// artifact.
fn materialize_runtime(&self, env: &Environment) -> RuntimeArtifacts {
materialize_node(env)
}
}
/// Phase 09 — Track D.2: build the `package.json` artifact for the Node.js
/// harness.
///
/// The dependency table holds every direct dep that [`is_node_builtin`]
/// does not filter out, plus the npm package for each detected framework
/// that [`node_framework_pkg_name`] knows about. Names are de-duplicated,
/// sorted, and all mapped to the version range `"*"`. Re-used by the
/// TypeScript emitter.
pub fn materialize_node(env: &Environment) -> RuntimeArtifacts {
    // Every entry carries the same wildcard range, so only the names need
    // tracking; the ordered set gives de-duplication and sorting at once.
    let mut names: std::collections::BTreeSet<String> = env
        .direct_deps
        .iter()
        .filter(|dep| !is_node_builtin(dep.as_str()))
        .cloned()
        .collect();
    names.extend(
        env.frameworks
            .iter()
            .filter_map(|fw| node_framework_pkg_name(*fw))
            .map(str::to_owned),
    );

    let entries: Vec<String> = names
        .iter()
        .map(|name| format!(" \"{name}\": \"*\""))
        .collect();

    let mut manifest = String::with_capacity(128);
    manifest.push_str("{\n");
    manifest.push_str(" \"name\": \"nyx-harness\",\n");
    manifest.push_str(" \"version\": \"0.0.0\",\n");
    manifest.push_str(" \"private\": true,\n");
    manifest.push_str(" \"dependencies\": {\n");
    if !entries.is_empty() {
        // Comma between entries, none after the last one.
        manifest.push_str(&entries.join(",\n"));
        manifest.push('\n');
    }
    manifest.push_str(" }\n");
    manifest.push_str("}\n");

    let mut artifacts = RuntimeArtifacts::new();
    artifacts.push("package.json", manifest);
    artifacts
}
/// Returns `true` when `name` refers to a module bundled with Node.js and
/// therefore must not be listed in `package.json`.
///
/// Three spellings are recognised:
/// - the `node:` scheme (`node:fs`, `node:test`), which only ever resolves
///   to a core module;
/// - a bare core module name (`fs`, `worker_threads`);
/// - a core module subpath (`fs/promises`), matched on its first path
///   segment.
fn is_node_builtin(name: &str) -> bool {
    const CORE_MODULES: &[&str] = &[
        "assert",
        "async_hooks",
        "buffer",
        "child_process",
        "cluster",
        "console",
        "constants",
        "crypto",
        "dgram",
        "diagnostics_channel",
        "dns",
        "domain",
        "events",
        "fs",
        "http",
        "http2",
        "https",
        "inspector",
        "module",
        "net",
        "os",
        "path",
        "perf_hooks",
        "process",
        "punycode",
        "querystring",
        "readline",
        "repl",
        "stream",
        "string_decoder",
        "sys",
        "timers",
        "tls",
        "trace_events",
        "tty",
        "url",
        "util",
        "v8",
        "vm",
        "wasi",
        "worker_threads",
        "zlib",
    ];

    // `node:`-prefixed specifiers never resolve to an npm package.
    if name.starts_with("node:") {
        return true;
    }
    // Scoped packages start with `@scope`, which never matches a core name.
    let head = name.split('/').next().unwrap_or(name);
    CORE_MODULES.iter().any(|&core| core == head)
}
/// Maps a detected framework to the npm package that provides it, or
/// `None` for frameworks this table has no Node.js package for.
fn node_framework_pkg_name(fw: DetectedFramework) -> Option<&'static str> {
    let pkg = match fw {
        DetectedFramework::Express => "express",
        DetectedFramework::Koa => "koa",
        DetectedFramework::Fastify => "fastify",
        _ => return None,
    };
    Some(pkg)
}
/// Source of the `__nyx_probe` shim for the Node.js harness.

View file

@ -23,6 +23,7 @@ pub mod ruby;
pub mod rust;
pub mod typescript;
use crate::dynamic::environment::{Environment, RuntimeArtifacts};
use crate::dynamic::spec::{EntryKind, HarnessSpec};
use crate::evidence::UnsupportedReason;
use crate::symbol::Lang;
@ -76,6 +77,34 @@ pub trait LangEmitter {
/// keep it specific (name the supported kinds, name the phase that will
/// extend support).
fn entry_kind_hint(&self, attempted: EntryKind) -> String;
/// Synthesise the language-specific manifest / lockfile contents that
/// record the [`Environment`]'s direct deps + toolchain in a file the
/// build sandbox can consume.
///
/// Default impl returns an empty bundle — every emitter that ships a
/// real build step overrides this (Python emits `requirements.txt`,
/// Rust emits a `Cargo.toml`, etc.). The harness builder
/// writes every returned `(rel_path, content)` pair into the workdir
/// alongside the generated source.
///
/// Phase 09 — Track D.2 deliverable. The default keeps the surface
/// area additive: emitters that have not yet been wired through the
/// capture path simply produce no manifest and the build cache key
/// degrades to the existing lockfile-hash path.
fn materialize_runtime(&self, _env: &Environment) -> RuntimeArtifacts {
RuntimeArtifacts::default()
}
}
/// Public free-fn dispatcher for [`LangEmitter::materialize_runtime`].
///
/// Returns an empty [`RuntimeArtifacts`] when `env.lang` has no
/// registered emitter so callers do not need to special-case that path.
/// Used by the harness builder to fold runtime manifest artifacts into
/// the staged workdir (Phase 09 — Track D.2).
pub fn materialize_runtime(env: &Environment) -> RuntimeArtifacts {
// Falls back to `RuntimeArtifacts::default()` whenever `dispatch` does
// not produce a value for `env.lang`.
dispatch(env.lang, |e| e.materialize_runtime(env)).unwrap_or_default()
}
/// Dispatch to the appropriate language emitter.

View file

@ -18,6 +18,7 @@
//! Build: no compilation step. Command is `php harness.php`.
//! Build container: `nyx-build-php:{toolchain_id}` (deferred; §19.1).
use crate::dynamic::environment::{Environment, RuntimeArtifacts};
use crate::dynamic::lang::{HarnessSource, LangEmitter};
use crate::dynamic::spec::{EntryKind, HarnessSpec, PayloadSlot};
use crate::evidence::UnsupportedReason;
@ -45,6 +46,40 @@ impl LangEmitter for PhpEmitter {
"php emitter supports {SUPPORTED:?}; this finding's enclosing context is `EntryKind::{attempted}` — Track B will add Slim / Laravel / Symfony route + CLI shapes in phase 15"
)
}
/// Phase 09 — Track D.2: runtime-manifest hook for the PHP emitter.
/// Delegates to [`materialize_php`], which builds the `composer.json`
/// artifact.
fn materialize_runtime(&self, env: &Environment) -> RuntimeArtifacts {
materialize_php(env)
}
}
/// Phase 09 — Track D.2: build the `composer.json` artifact for the PHP
/// harness.
///
/// The manifest is toolchain-only: its single `require` entry constrains
/// `php` to `>=` the first two dot-separated segments of the captured
/// version string (`8.1` when that yields nothing). Direct imports of
/// namespaced classes are too coarse to map to packages without a
/// vendor→package registry, so no package deps are listed; Phase 10 corpus
/// expansion will introduce the registry.
pub fn materialize_php(env: &Environment) -> RuntimeArtifacts {
    // `major.minor` only — patch and any further segments are dropped.
    let mut php_floor = env
        .toolchain
        .version_string
        .split('.')
        .take(2)
        .collect::<Vec<_>>()
        .join(".");
    if php_floor.is_empty() {
        php_floor.push_str("8.1");
    }

    let mut manifest = String::with_capacity(128);
    manifest.push_str("{\n");
    manifest.push_str(" \"name\": \"nyx/harness\",\n");
    manifest.push_str(" \"require\": {\n");
    manifest.push_str(" \"php\": \">=");
    manifest.push_str(&php_floor);
    manifest.push_str("\"\n");
    manifest.push_str(" }\n");
    manifest.push_str("}\n");

    let mut artifacts = RuntimeArtifacts::new();
    artifacts.push("composer.json", manifest);
    artifacts
}
/// Source of the `__nyx_probe` shim for the PHP harness (Phase 06 —

View file

@ -13,9 +13,11 @@
//! - `PayloadSlot::EnvVar(name)` — set env var before calling.
//! - Other slots produce `UnsupportedReason::PayloadSlotUnsupported`.
use crate::dynamic::environment::{Environment, RuntimeArtifacts};
use crate::dynamic::lang::{HarnessSource, LangEmitter};
use crate::dynamic::spec::{EntryKind, HarnessSpec, PayloadSlot};
use crate::evidence::UnsupportedReason;
use crate::utils::project::DetectedFramework;
/// Zero-sized [`LangEmitter`] handle for Python. Registered in the
/// `lang::dispatch` table; method bodies delegate to the existing free
@ -40,6 +42,14 @@ impl LangEmitter for PythonEmitter {
"python emitter supports {SUPPORTED:?}; this finding's enclosing context is `EntryKind::{attempted}` — Track B will add framework + CLI shapes in phase 12"
)
}
/// Phase 09 — Track D.2: runtime-manifest hook for the Python emitter.
/// Delegates to [`materialize_python`], which emits a single
/// `requirements.txt` (no version pins) covering every captured direct
/// dep plus the framework deps inferred from the project manifest.
fn materialize_runtime(&self, env: &Environment) -> RuntimeArtifacts {
materialize_python(env)
}
}
/// Source of the `__nyx_probe` shim for the Python harness.
@ -168,6 +178,163 @@ def __nyx_install_crash_guard(sink_callee):
"#
}
/// Phase 09 - Track D.2: synthesise a `requirements.txt` from the
/// captured deps in `env`.
///
/// The output is a deterministic, alphabetised listing of every
/// non-stdlib direct dep the entry file imported plus the framework deps
/// inferred from the manifest detector. Each entry is emitted as the
/// canonical pip-installable name; version pins are intentionally
/// omitted so the system pip resolves the latest compatible release
/// against the user's pinned Python interpreter (the spec's
/// `toolchain_id` field). A future phase can fold pinned versions in
/// once the capture pass learns to parse the project's own lockfile.
pub fn materialize_python(env: &Environment) -> RuntimeArtifacts {
let mut artifacts = RuntimeArtifacts::new();
let mut deps: Vec<String> = Vec::new();
let mut seen: std::collections::HashSet<String> = std::collections::HashSet::new();
// Direct imports first — these mirror the entry file faithfully.
for d in &env.direct_deps {
if is_python_stdlib(d) {
continue;
}
let canonical = canonical_python_pkg_name(d);
if seen.insert(canonical.clone()) {
deps.push(canonical);
}
}
// Framework deps next — these may not appear as direct imports in
// every entry file, but they have to be installed for the runtime
// to resolve framework decorators.
for fw in &env.frameworks {
if let Some(name) = python_framework_pkg_name(*fw) {
let canonical = canonical_python_pkg_name(name);
if seen.insert(canonical.clone()) {
deps.push(canonical);
}
}
}
deps.sort_unstable();
let mut body = String::with_capacity(64);
body.push_str("# Auto-generated by Nyx — Phase 09 (Track D.2).\n");
body.push_str(&format!("# spec_hash = {}\n", env.spec_hash));
body.push_str(&format!(
"# toolchain = {} (drift={})\n",
env.toolchain.toolchain_id, env.toolchain.toolchain_drift
));
for d in &deps {
body.push_str(d);
body.push('\n');
}
artifacts.push("requirements.txt", body);
artifacts
}
/// Returns `true` when `name` is a Python standard-library top-level
/// module or package, i.e. a name that must never be written to
/// `requirements.txt`.
///
/// The match is exact and case-sensitive (`cProfile`). The table also keeps
/// modules that recent CPython releases removed (`cgi`, `imp`,
/// `distutils`, …) since older interpreters still ship them.
fn is_python_stdlib(name: &str) -> bool {
    const STDLIB_TOP_LEVEL: &[&str] = &[
        "__future__", "__main__", "_thread", "abc", "aifc", "argparse", "array", "ast",
        "asynchat", "asyncio", "asyncore", "atexit", "audioop", "base64", "bdb", "binascii",
        "bisect", "builtins", "bz2", "cProfile", "calendar", "cgi", "cgitb", "chunk", "cmath",
        "cmd", "code", "codecs", "codeop", "collections", "colorsys", "compileall",
        "concurrent", "configparser", "contextlib", "contextvars", "copy", "copyreg", "crypt",
        "csv", "ctypes", "curses", "dataclasses", "datetime", "dbm", "decimal", "difflib",
        "dis", "distutils", "doctest", "email", "encodings", "ensurepip", "enum", "errno",
        "faulthandler", "fcntl", "filecmp", "fileinput", "fnmatch", "fractions", "ftplib",
        "functools", "gc", "genericpath", "getopt", "getpass", "gettext", "glob", "graphlib",
        "grp", "gzip", "hashlib", "heapq", "hmac", "html", "http", "imaplib", "imghdr", "imp",
        "importlib", "inspect", "io", "ipaddress", "itertools", "json", "keyword", "lib2to3",
        "linecache", "locale", "logging", "lzma", "mailbox", "mailcap", "marshal", "math",
        "mimetypes", "mmap", "modulefinder", "msilib", "msvcrt", "multiprocessing", "netrc",
        "nis", "nntplib", "ntpath", "numbers", "opcode", "operator", "optparse", "os",
        "ossaudiodev", "pathlib", "pdb", "pickle", "pickletools", "pipes", "pkgutil",
        "platform", "plistlib", "poplib", "posix", "posixpath", "pprint", "profile", "pstats",
        "pty", "pwd", "py_compile", "pyclbr", "pydoc", "queue", "quopri", "random", "re",
        "readline", "reprlib", "resource", "rlcompleter", "runpy", "sched", "secrets",
        "select", "selectors", "shelve", "shlex", "shutil", "signal", "site", "smtpd",
        "smtplib", "sndhdr", "socket", "socketserver", "spwd", "sqlite3", "ssl", "stat",
        "statistics", "string", "stringprep", "struct", "subprocess", "sunau", "symtable",
        "sys", "sysconfig", "syslog", "tabnanny", "tarfile", "telnetlib", "tempfile",
        "termios", "textwrap", "threading", "time", "timeit", "tkinter", "token", "tokenize",
        "tomllib", "trace", "traceback", "tracemalloc", "tty", "turtle", "types", "typing",
        "unicodedata", "unittest", "urllib", "uu", "uuid", "venv", "warnings", "wave",
        "weakref", "webbrowser", "winreg", "winsound", "wsgiref", "xdrlib", "xml", "xmlrpc",
        "zipapp", "zipfile", "zipimport", "zlib", "zoneinfo",
    ];
    STDLIB_TOP_LEVEL.iter().any(|&module| module == name)
}
/// Canonicalise a Python import name to the PyPI distribution that
/// provides it (e.g. `cv2` → `opencv-python`, `PIL` → `Pillow`).
///
/// Lookup is ASCII-case-insensitive. Names without a known alias are
/// returned lower-cased, which is safe because PyPI compares project names
/// case-insensitively.
fn canonical_python_pkg_name(name: &str) -> String {
    let lower = name.to_ascii_lowercase();
    let distribution = match lower.as_str() {
        "flask" => "Flask",
        "cv2" => "opencv-python",
        "sqlalchemy" => "SQLAlchemy",
        "yaml" => "PyYAML",
        "psycopg2" => "psycopg2-binary",
        // Import names that differ from the distribution that ships them.
        "pil" => "Pillow",
        "bs4" => "beautifulsoup4",
        "sklearn" => "scikit-learn",
        "skimage" => "scikit-image",
        "dateutil" => "python-dateutil",
        "dotenv" => "python-dotenv",
        "jwt" => "PyJWT",
        "crypto" => "pycryptodome",
        "mysqldb" => "mysqlclient",
        "openssl" => "pyOpenSSL",
        _ => return lower,
    };
    distribution.to_owned()
}
/// Maps a detected framework to the Python package that provides it, or
/// `None` for frameworks this table has no Python package for.
fn python_framework_pkg_name(fw: DetectedFramework) -> Option<&'static str> {
    let pkg = match fw {
        DetectedFramework::Flask => "flask",
        DetectedFramework::Django => "django",
        _ => return None,
    };
    Some(pkg)
}
/// Emit a Python harness for `spec`.
pub fn emit(spec: &HarnessSpec) -> Result<HarnessSource, UnsupportedReason> {
// Validate payload slot.

View file

@ -8,6 +8,7 @@
//! a structured `Inconclusive(EntryKindUnsupported { … })` instead of
//! silently dropping Ruby findings.
use crate::dynamic::environment::{Environment, RuntimeArtifacts};
use crate::dynamic::lang::{HarnessSource, LangEmitter};
use crate::dynamic::spec::{EntryKind, HarnessSpec};
use crate::evidence::UnsupportedReason;
@ -125,6 +126,61 @@ impl LangEmitter for RubyEmitter {
"ruby emitter is a stub; once Phase 15 (Track B Ruby vertical) lands it will support {SUPPORTED:?} plus Sinatra / Rails / Rack route shapes — attempted `EntryKind::{attempted}`"
)
}
/// Phase 09 — Track D.2: runtime-manifest hook for the Ruby emitter.
/// Delegates to [`materialize_ruby`], which builds the `Gemfile` artifact.
fn materialize_runtime(&self, env: &Environment) -> RuntimeArtifacts {
materialize_ruby(env)
}
}
/// Phase 09 — Track D.2: build the `Gemfile` artifact for the Ruby harness.
///
/// The file declares `https://rubygems.org` as its source and then lists
/// one unversioned `gem` line per direct dep that [`is_ruby_stdlib`] does
/// not filter out, de-duplicated and sorted. Ruby `require` statements give
/// us first-segment package names directly, so the manifest can name real
/// gems.
pub fn materialize_ruby(env: &Environment) -> RuntimeArtifacts {
    // An ordered set de-duplicates and sorts in a single pass.
    let gems: std::collections::BTreeSet<&str> = env
        .direct_deps
        .iter()
        .map(|dep| dep.as_str())
        .filter(|dep| !is_ruby_stdlib(dep))
        .collect();

    let mut gemfile = String::with_capacity(64);
    gemfile.push_str("source 'https://rubygems.org'\n");
    for gem in &gems {
        gemfile.push_str("gem '");
        gemfile.push_str(gem);
        gemfile.push_str("'\n");
    }

    let mut artifacts = RuntimeArtifacts::new();
    artifacts.push("Gemfile", gemfile);
    artifacts
}
/// Returns `true` when `name` is one of the Ruby library names this emitter
/// treats as bundled with the interpreter and therefore leaves out of the
/// `Gemfile`. The match is exact and case-sensitive.
fn is_ruby_stdlib(name: &str) -> bool {
    const BUNDLED: [&str; 19] = [
        "json",
        "yaml",
        "uri",
        "net",
        "time",
        "date",
        "csv",
        "logger",
        "fileutils",
        "tempfile",
        "open",
        "stringio",
        "set",
        "open3",
        "ostruct",
        "digest",
        "base64",
        "securerandom",
        "etc",
    ];
    BUNDLED.iter().any(|&lib| lib == name)
}
#[cfg(test)]

View file

@ -21,6 +21,7 @@
//!
//! HTML_ESCAPE is n/a for Rust (§15.4).
use crate::dynamic::environment::{Environment, RuntimeArtifacts};
use crate::dynamic::lang::{HarnessSource, LangEmitter};
use crate::dynamic::spec::{EntryKind, HarnessSpec, PayloadSlot};
use crate::evidence::UnsupportedReason;
@ -49,6 +50,53 @@ impl LangEmitter for RustEmitter {
"rust emitter supports {SUPPORTED:?}; this finding's enclosing context is `EntryKind::{attempted}` — Track B will add actix / axum / clap / libfuzzer shapes in phase 16"
)
}
/// Phase 09 — Track D.2: runtime-manifest hook for the Rust emitter.
/// Delegates to [`materialize_rust`], which builds the `Cargo.toml`
/// artifact.
fn materialize_runtime(&self, env: &Environment) -> RuntimeArtifacts {
materialize_rust(env)
}
}
/// Phase 09 — Track D.2: build the `Cargo.toml` artifact for the Rust
/// harness.
///
/// The manifest declares the `nyx-harness` package (edition 2021) with a
/// single `nyx_harness` binary at `src/main.rs`, then lists every direct
/// crate import that [`is_rust_stdlib`] does not filter out under
/// `[dependencies]` with the wildcard requirement `"*"`, de-duplicated and
/// sorted, so the harness build can resolve symbols from crates the entry
/// actually uses.
pub fn materialize_rust(env: &Environment) -> RuntimeArtifacts {
    // An ordered set de-duplicates and sorts in a single pass.
    let crates: std::collections::BTreeSet<&str> = env
        .direct_deps
        .iter()
        .map(|dep| dep.as_str())
        .filter(|dep| !is_rust_stdlib(dep))
        .collect();

    let mut manifest = String::with_capacity(256);
    for line in [
        "[package]\n",
        "name = \"nyx-harness\"\n",
        "version = \"0.1.0\"\n",
        "edition = \"2021\"\n\n",
        "[[bin]]\n",
        "name = \"nyx_harness\"\n",
        "path = \"src/main.rs\"\n\n",
        "[dependencies]\n",
    ]
    .iter()
    {
        manifest.push_str(line);
    }
    for krate in &crates {
        manifest.push_str(krate);
        manifest.push_str(" = \"*\"\n");
    }

    let mut artifacts = RuntimeArtifacts::new();
    artifacts.push("Cargo.toml", manifest);
    artifacts
}
/// Returns `true` when `name` is a path root that never maps to an external
/// crate: the sysroot crates (`std`, `core`, `alloc`, `proc_macro`, `test`)
/// and the path keywords `self`, `super`, and `crate`.
fn is_rust_stdlib(name: &str) -> bool {
    const NON_EXTERNAL_ROOTS: [&str; 8] = [
        "std",
        "core",
        "alloc",
        "proc_macro",
        "test",
        "self",
        "super",
        "crate",
    ];
    NON_EXTERNAL_ROOTS.iter().any(|&root| root == name)
}
/// Source of the `__nyx_probe` shim for the Rust harness (Phase 06 —

View file

@ -15,6 +15,7 @@
//! land, the supported list / hint shift here without affecting the JS
//! emitter.
use crate::dynamic::environment::{Environment, RuntimeArtifacts};
use crate::dynamic::lang::{javascript, HarnessSource, LangEmitter};
use crate::dynamic::spec::{EntryKind, HarnessSpec};
use crate::evidence::UnsupportedReason;
@ -50,6 +51,10 @@ impl LangEmitter for TypeScriptEmitter {
"typescript emitter supports {SUPPORTED:?} (delegates to the JavaScript emitter); this finding's enclosing context is `EntryKind::{attempted}` — Track B will add Next.js / jsdom shapes in phase 13"
)
}
/// Phase 09 — Track D.2: runtime-manifest hook for the TypeScript emitter.
/// Shares the JavaScript emitter's `package.json` synthesis by delegating
/// to [`javascript::materialize_node`].
fn materialize_runtime(&self, env: &Environment) -> RuntimeArtifacts {
javascript::materialize_node(env)
}
}
#[cfg(test)]

View file

@ -68,6 +68,7 @@
pub mod build_sandbox;
pub mod corpus;
pub mod differential;
pub mod environment;
pub mod harness;
pub mod lang;
pub mod mount_filter;