diff --git a/build.rs b/build.rs index 34f4a9b1..66f99fad 100644 --- a/build.rs +++ b/build.rs @@ -1,8 +1,15 @@ +use std::collections::BTreeMap; use std::path::Path; use std::process::Command; fn main() { - // Only relevant when the serve feature is active + // Phase 17 (Track E.1): always emit the seccomp policy table to + // OUT_DIR. Gated runtime via `#[cfg(target_os = "linux")]`, but the + // codegen runs on every host so `cargo check` on macOS still emits + // the file (the include never actually compiles on non-Linux). + emit_seccomp_policy(); + + // Only relevant when the serve feature is active. if std::env::var("CARGO_FEATURE_SERVE").is_err() { return; } @@ -70,3 +77,209 @@ fn emit_placeholder_and_warn(dist_dir: &Path) { "cargo:warning=Node.js/npm not available — wrote placeholder frontend assets. Run 'cd frontend && npm install && npm run build' for the real UI." ); } + +// ── Phase 17 (Track E.1) — seccomp policy codegen ──────────────────────────── + +const SECCOMP_POLICY_PATH: &str = "src/dynamic/sandbox/seccomp/seccomp_policy.toml"; + +/// Cap-name → Cap bit value table. Mirrors the `bitflags!` block in +/// `src/labels/mod.rs`. Keep in sync when adding/removing `Cap` +/// constants. 
+const CAP_BIT_FOR_NAME: &[(&str, u32)] = &[ + ("ENV_VAR", 1 << 0), + ("HTML_ESCAPE", 1 << 1), + ("SHELL_ESCAPE", 1 << 2), + ("URL_ENCODE", 1 << 3), + ("JSON_PARSE", 1 << 4), + ("FILE_IO", 1 << 5), + ("FMT_STRING", 1 << 6), + ("SQL_QUERY", 1 << 7), + ("DESERIALIZE", 1 << 8), + ("SSRF", 1 << 9), + ("CODE_EXEC", 1 << 10), + ("CRYPTO", 1 << 11), + ("UNAUTHORIZED_ID", 1 << 12), + ("DATA_EXFIL", 1 << 13), + ("LDAP_INJECTION", 1 << 14), + ("XPATH_INJECTION", 1 << 15), + ("HEADER_INJECTION", 1 << 16), + ("OPEN_REDIRECT", 1 << 17), + ("SSTI", 1 << 18), + ("XXE", 1 << 19), + ("PROTOTYPE_POLLUTION", 1 << 20), +]; + +fn emit_seccomp_policy() { + println!("cargo:rerun-if-changed={}", SECCOMP_POLICY_PATH); + + let out_dir = std::env::var("OUT_DIR").expect("OUT_DIR must be set by cargo"); + let out_path = Path::new(&out_dir).join("seccomp_policy.rs"); + + // Read the policy file; on missing file (e.g. fresh checkout on a + // foreign target), emit empty tables so compilation still succeeds. + let toml_text = match std::fs::read_to_string(SECCOMP_POLICY_PATH) { + Ok(s) => s, + Err(_) => { + std::fs::write( + &out_path, + "pub static BASE: &[&str] = &[];\npub static CAP: &[(u32, &[&str])] = &[];\n", + ) + .expect("write empty seccomp policy stub"); + return; + } + }; + + let parsed = parse_seccomp_toml(&toml_text); + + let mut out = String::new(); + out.push_str("// generated by build.rs from seccomp_policy.toml — do not edit\n\n"); + + // Base allowlist. + out.push_str("pub static BASE: &[&str] = &[\n"); + for name in &parsed.base { + out.push_str(&format!(" \"{}\",\n", escape(name))); + } + out.push_str("];\n\n"); + + // Per-cap allowlists. 
+ out.push_str("pub static CAP: &[(u32, &[&str])] = &[\n"); + for (cap_name, allow) in &parsed.caps { + let bit = CAP_BIT_FOR_NAME + .iter() + .find(|(n, _)| *n == cap_name.as_str()) + .map(|(_, b)| *b) + .unwrap_or_else(|| panic!( + "seccomp_policy.toml references unknown Cap '{cap_name}' — \ + add it to CAP_BIT_FOR_NAME in build.rs first" + )); + out.push_str(&format!(" (0x{bit:08x}_u32, &[\n")); + for name in allow { + out.push_str(&format!(" \"{}\",\n", escape(name))); + } + out.push_str(" ]),\n"); + } + out.push_str("];\n"); + + std::fs::write(&out_path, out).expect("write seccomp policy table"); +} + +#[derive(Default)] +struct SeccompPolicy { + base: Vec, + caps: BTreeMap>, +} + +/// Tiny line-oriented TOML parser scoped to the shape used by +/// `seccomp_policy.toml`: +/// +/// [base] +/// allow = ["read", "write", ...] +/// +/// [cap.SQL_QUERY] +/// allow = [ +/// "fdatasync", +/// ... +/// ] +/// +/// Comments (`#`) and blank lines are skipped. Multi-line array bodies +/// are accumulated until the closing `]`. 
+fn parse_seccomp_toml(src: &str) -> SeccompPolicy { + let mut policy = SeccompPolicy::default(); + let mut current_section: Option = None; + let mut accumulating_array: Option = None; + let mut array_buf = String::new(); + + for raw_line in src.lines() { + let line = strip_comment(raw_line).trim(); + if line.is_empty() { + continue; + } + + if let Some(_key) = accumulating_array.as_ref() { + array_buf.push_str(line); + array_buf.push('\n'); + if line.contains(']') { + let key = accumulating_array.take().unwrap(); + let values = parse_string_array(&array_buf); + store_allow(&mut policy, current_section.as_deref(), &key, values); + array_buf.clear(); + } + continue; + } + + if let Some(section) = line.strip_prefix('[').and_then(|s| s.strip_suffix(']')) { + current_section = Some(section.to_string()); + continue; + } + + if let Some((key, rest)) = line.split_once('=') { + let key = key.trim().to_string(); + let rest = rest.trim(); + if rest.starts_with('[') && rest.contains(']') { + let values = parse_string_array(rest); + store_allow(&mut policy, current_section.as_deref(), &key, values); + } else if rest.starts_with('[') { + accumulating_array = Some(key); + array_buf.push_str(rest); + array_buf.push('\n'); + } + continue; + } + } + + policy +} + +fn strip_comment(line: &str) -> &str { + let mut in_string = false; + let bytes = line.as_bytes(); + for (i, &b) in bytes.iter().enumerate() { + match b { + b'"' => in_string = !in_string, + b'#' if !in_string => return &line[..i], + _ => {} + } + } + line +} + +fn parse_string_array(src: &str) -> Vec { + // Find every "..." run between the first `[` and the last `]`. 
+ let start = src.find('[').map(|i| i + 1).unwrap_or(0); + let end = src.rfind(']').unwrap_or(src.len()); + let body = &src[start..end]; + let mut out = Vec::new(); + let mut chars = body.chars().peekable(); + while let Some(c) = chars.next() { + if c == '"' { + let mut s = String::new(); + for c2 in chars.by_ref() { + if c2 == '"' { + break; + } + s.push(c2); + } + out.push(s); + } + } + out +} + +fn store_allow(policy: &mut SeccompPolicy, section: Option<&str>, key: &str, values: Vec) { + if key != "allow" { + return; + } + match section { + Some("base") => policy.base = values, + Some(other) => { + if let Some(cap_name) = other.strip_prefix("cap.") { + policy.caps.insert(cap_name.to_string(), values); + } + } + None => {} + } +} + +fn escape(s: &str) -> String { + s.replace('\\', "\\\\").replace('"', "\\\"") +} diff --git a/src/dynamic/sandbox.rs b/src/dynamic/sandbox/mod.rs similarity index 95% rename from src/dynamic/sandbox.rs rename to src/dynamic/sandbox/mod.rs index b2cd479a..72bd3c98 100644 --- a/src/dynamic/sandbox.rs +++ b/src/dynamic/sandbox/mod.rs @@ -29,6 +29,14 @@ use std::path::Path; use std::sync::{Arc, OnceLock}; use std::time::{Duration, Instant}; +#[cfg(target_os = "linux")] +pub mod process_linux; +#[cfg(target_os = "linux")] +pub mod seccomp; + +#[cfg(target_os = "linux")] +pub use process_linux::{HardeningLevel, HardeningOutcome}; + // ── Harness interpretation probe ────────────────────────────────────────────── /// Returns true when the harness is driven by an interpreter (Python, Node, …) @@ -159,6 +167,40 @@ pub struct SandboxOptions { /// into [`crate::dynamic::oracle::oracle_fired_with_stubs`]. /// `None` when the spec's `stubs_required` is empty. pub stub_harness: Option>, + /// Phase 17 (Track E.1): cap bits used to minimise the seccomp-bpf + /// allowlist applied to the Linux process backend. 
When `0`, the + /// process backend installs only the cap-independent `base` allowlist + /// from [`seccomp::seccomp_policy.toml`]; when non-zero, every cap bit + /// set adds its allowlisted syscalls on top. Other backends ignore + /// this field. + pub seccomp_caps: u32, + /// Phase 17 (Track E.1): hardening profile applied by the Linux + /// process backend. See [`ProcessHardeningProfile`] for the per- + /// variant primitive matrix. + pub process_hardening: ProcessHardeningProfile, +} + +/// Phase 17 (Track E.1): selects which subset of the Linux process- +/// backend hardening primitives is applied. +/// +/// - [`ProcessHardeningProfile::Standard`] — the historical baseline: +/// `prctl(PR_SET_NO_NEW_PRIVS)` + `setrlimit(RLIMIT_AS)` only. No +/// namespaces, no chroot, no seccomp. Default for back-compat. +/// - [`ProcessHardeningProfile::Strict`] — full Phase 17 sequence: +/// no-new-privs, all rlimits, namespace unshare, chroot to workdir, +/// default-deny seccomp filter scoped to [`SandboxOptions::seccomp_caps`]. +/// Each primitive is best-effort; failures degrade to +/// [`HardeningLevel::Partial`] without aborting the run. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ProcessHardeningProfile { + Standard, + Strict, +} + +impl Default for ProcessHardeningProfile { + fn default() -> Self { + ProcessHardeningProfile::Standard + } } impl SandboxOptions { @@ -186,6 +228,8 @@ impl Default for SandboxOptions { probe_channel: None, extra_env: Vec::new(), stub_harness: None, + seccomp_caps: 0, + process_hardening: ProcessHardeningProfile::Standard, } } } @@ -1207,25 +1251,35 @@ fn run_process( cmd.env("NYX_PAYLOAD", std::ffi::OsStr::from_bytes(payload_bytes)); } - // Enforce memory cap before exec on Linux via RLIMIT_AS + PR_SET_NO_NEW_PRIVS. - // RLIMIT_AS limits total virtual address space. Python uses significantly - // more virtual AS than RSS (shared libs, mmap arenas), so the enforced - // limit is memory_mib * 8 with a floor of 4 GiB. 
+ // Phase 17 (Track E.1): install the Linux process-backend hardening + // sequence — `prctl(PR_SET_NO_NEW_PRIVS)`, `setrlimit` (CPU/NOFILE/AS), + // `unshare(CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUSER)`, `chroot` to the + // workdir, and a default-deny seccomp-bpf filter scoped to + // `opts.seccomp_caps`. Each primitive is best-effort: failures + // downgrade to `HardeningLevel::Partial` instead of aborting the run. #[cfg(target_os = "linux")] - { - use std::os::unix::process::CommandExt; - let memory_mib = opts.memory_mib; - // Safety: called in the child after fork but before exec; no allocator use. - unsafe { - cmd.pre_exec(move || { - rlimit_as_linux(memory_mib)?; - prctl_no_new_privs() - }); - } - } + let collector = process_linux::install_pre_exec(&mut cmd, opts, &harness.workdir); let start = Instant::now(); - let mut child = cmd.spawn().map_err(SandboxError::Spawn)?; + let child_result = cmd.spawn(); + #[cfg(target_os = "linux")] + let outcome_joiner; + let mut child = match child_result { + Ok(c) => { + #[cfg(target_os = "linux")] + { + outcome_joiner = collector.map(|c| c.after_spawn()); + } + c + } + Err(e) => { + #[cfg(target_os = "linux")] + if let Some(c) = collector { + c.forget(); + } + return Err(SandboxError::Spawn(e)); + } + }; let timeout = opts.timeout; let timed_out = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); @@ -1270,6 +1324,14 @@ fn run_process( let status = child.wait().map_err(SandboxError::Io)?; + // Phase 17 (Track E.1): wait for the per-primitive HardeningOutcome + // drain thread before returning so callers (tests + telemetry) read + // a settled value via `process_linux::last_hardening_outcome()`. 
+ #[cfg(target_os = "linux")] + if let Some(joiner) = outcome_joiner { + joiner.await_outcome(); + } + let stdout_buf = stdout_handle .and_then(|h| h.join().ok()) .and_then(|r| r.ok()) @@ -1337,52 +1399,9 @@ fn base64_encode(data: &[u8]) -> String { // ── Linux-specific syscall wrappers ────────────────────────────────────────── -/// Set RLIMIT_AS (virtual address space) in a `pre_exec` context on Linux. -/// -/// `memory_mib` is the configured cap; we enforce `max(memory_mib * 8, 4096)` -/// MiB of virtual AS to give Python's mmap-heavy runtime adequate headroom -/// while still capping runaway memory bombs. -/// -/// RLIMIT_AS = 9 on x86_64, aarch64, arm, ppc64, s390x, and all other major -/// Linux architectures (kernel source: include/uapi/asm-generic/resource.h). -#[cfg(target_os = "linux")] -fn rlimit_as_linux(memory_mib: u64) -> std::io::Result<()> { - #[repr(C)] - struct Rlimit { - cur: u64, - max: u64, - } - unsafe extern "C" { - fn setrlimit(resource: i32, rlim: *const Rlimit) -> i32; - } - const RLIMIT_AS: i32 = 9; - let cap_mib = memory_mib.saturating_mul(8).max(4096); - let bytes = cap_mib.saturating_mul(1024 * 1024); - let rl = Rlimit { cur: bytes, max: bytes }; - let ret = unsafe { setrlimit(RLIMIT_AS, &rl) }; - if ret == 0 { - Ok(()) - } else { - Err(std::io::Error::last_os_error()) - } -} - -/// Set PR_SET_NO_NEW_PRIVS to 1 in a `pre_exec` context on Linux. -/// -/// This prevents the child process from acquiring new privileges via setuid -/// binaries, file capabilities, or ptrace. Best-effort: silently succeeds -/// even if the prctl call fails (e.g., in restricted environments). -#[cfg(target_os = "linux")] -fn prctl_no_new_privs() -> std::io::Result<()> { - unsafe extern "C" { - fn prctl(option: i32, arg2: u64, arg3: u64, arg4: u64, arg5: u64) -> i32; - } - const PR_SET_NO_NEW_PRIVS: i32 = 38; - // Failure is non-fatal: some container runtimes block prctl but are - // themselves already sandboxed. Don't abort the child for this. 
- unsafe { prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) }; - Ok(()) -} +// `rlimit_as_linux`, `prctl_no_new_privs`, and the rest of the Linux process +// backend hardening sequence now live in [`process_linux`]. See +// [`process_linux::install_pre_exec`] for the call-site. #[cfg(unix)] fn libc_kill(pid: i32, sig: i32) -> i32 { diff --git a/src/dynamic/sandbox/process_linux.rs b/src/dynamic/sandbox/process_linux.rs new file mode 100644 index 00000000..9d2b5a88 --- /dev/null +++ b/src/dynamic/sandbox/process_linux.rs @@ -0,0 +1,657 @@ +//! Phase 17 (Track E.1) — Linux process backend hardening. +//! +//! Owns the `pre_exec` sequence applied to every harness child started by +//! [`super::run_process`] on Linux: +//! +//! 1. `prctl(PR_SET_NO_NEW_PRIVS)` — block setuid / file-cap escalation. +//! 2. `setrlimit(RLIMIT_CPU)` — cap CPU time so a runaway payload exits. +//! 3. `setrlimit(RLIMIT_NOFILE)` — cap open fds; the harness receives only +//! a small number of stdio + probe fds from the parent. +//! 4. `setrlimit(RLIMIT_AS)` — cap virtual address space; multiplied by 8 +//! with a 4 GiB floor so interpreted runtimes still start. +//! 5. `unshare(CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS)` — drop the +//! host PID, mount, and user namespace views. +//! 6. `chroot(workdir)` + `chdir("/")` — isolate filesystem reach to the +//! harness workdir; payloads that try to read `/etc/passwd` see the +//! harness root, not the host one. +//! 7. seccomp-bpf default-deny filter scoped to the cap bits the spec +//! actually exercises (see [`super::seccomp`]). +//! +//! Each primitive is best-effort: failures are recorded into the per- +//! child [`HardeningOutcome`] file the parent reads back after exec, so +//! the verifier can downgrade to [`HardeningLevel::Partial`] without +//! aborting the harness run. +//! +//! The pre_exec callback runs in the child between fork(2) and execve(2) +//! — no Rust allocator use, no heap-borrowing closures. Anything the +//! 
parent needs to know is shipped through an `O_CLOEXEC` pipe the +//! parent owns the read end of: the child writes one [`HardeningOutcome`] +//! record into it, execve(2) drops the write end, and the parent's +//! drain thread sees EOF and records the outcome. + +use crate::dynamic::sandbox::seccomp; +use crate::dynamic::sandbox::seccomp::bpf::SockFilter; +use crate::dynamic::sandbox::{ProcessHardeningProfile, SandboxOptions}; +use std::io::Read; +use std::os::unix::io::{FromRawFd, RawFd}; +use std::os::unix::process::CommandExt; +use std::path::{Path, PathBuf}; +use std::process::Command; +use std::sync::{Arc, Mutex, OnceLock}; + +// ── HardeningLevel reporting ───────────────────────────────────────────────── + +/// Coarse summary of which Phase 17 primitives applied successfully. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum HardeningLevel { + /// Standard profile selected — only no-new-privs + RLIMIT_AS were + /// installed (no Phase 17 hardening attempted). + Baseline, + /// All requested primitives applied successfully. + Full, + /// At least one primitive failed (typically because the process is + /// already inside a sandbox that disallows e.g. `unshare`). + Partial, + /// Every primitive failed; the harness ran with no Phase 17 + /// hardening at all. + None, +} + +/// Per-primitive outcome captured by the child and read back by the +/// parent after `wait`. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct HardeningOutcome { + pub no_new_privs: PrimitiveStatus, + pub rlimit_cpu: PrimitiveStatus, + pub rlimit_nofile: PrimitiveStatus, + pub rlimit_as: PrimitiveStatus, + pub unshare: PrimitiveStatus, + pub chroot: PrimitiveStatus, + pub seccomp: PrimitiveStatus, + pub profile: ProcessHardeningProfileTag, +} + +impl Default for HardeningOutcome { + fn default() -> Self { + Self { + no_new_privs: PrimitiveStatus::Skipped, + rlimit_cpu: PrimitiveStatus::Skipped, + rlimit_nofile: PrimitiveStatus::Skipped, + rlimit_as: PrimitiveStatus::Skipped, + unshare: PrimitiveStatus::Skipped, + chroot: PrimitiveStatus::Skipped, + seccomp: PrimitiveStatus::Skipped, + profile: ProcessHardeningProfileTag::Standard, + } + } +} + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub enum PrimitiveStatus { + /// Primitive was not requested by the active profile. + #[default] + Skipped, + /// Primitive applied successfully. + Applied, + /// Primitive call returned an error; raw errno is captured below. + Failed(i32), +} + +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub enum ProcessHardeningProfileTag { + #[default] + Standard, + Strict, +} + +impl HardeningOutcome { + /// Coarse summary used for the `HardeningLevel` column. 
+ pub fn level(&self) -> HardeningLevel { + if matches!(self.profile, ProcessHardeningProfileTag::Standard) { + return HardeningLevel::Baseline; + } + let primitives = [ + self.no_new_privs, + self.rlimit_cpu, + self.rlimit_nofile, + self.rlimit_as, + self.unshare, + self.chroot, + self.seccomp, + ]; + let applied = primitives.iter().filter(|s| matches!(s, PrimitiveStatus::Applied)).count(); + let failed = primitives.iter().filter(|s| matches!(s, PrimitiveStatus::Failed(_))).count(); + match (applied, failed) { + (_, 0) => HardeningLevel::Full, + (0, _) => HardeningLevel::None, + _ => HardeningLevel::Partial, + } + } +} + +// ── Last outcome registry (read back by tests + telemetry) ─────────────────── + +static LAST_OUTCOME: OnceLock>> = OnceLock::new(); + +fn outcome_cell() -> &'static Mutex> { + LAST_OUTCOME.get_or_init(|| Mutex::new(None)) +} + +fn record_outcome(outcome: HardeningOutcome) { + if let Ok(mut g) = outcome_cell().lock() { + *g = Some(outcome); + } +} + +/// Snapshot of the most-recent hardening outcome. Returns `None` until +/// at least one [`install_pre_exec`] child has been spawned and waited +/// on. Tests + telemetry read this after `wait_for_outcome` to get the +/// per-primitive status table. +pub fn last_hardening_outcome() -> Option { + outcome_cell().lock().ok().and_then(|g| *g) +} + +/// Reset the last-outcome slot. Tests use this between cases so a stale +/// value from a prior spawn cannot leak into the assertion under test. 
+pub fn reset_last_hardening_outcome() { + if let Ok(mut g) = outcome_cell().lock() { + *g = None; + } +} + +// ── Status pipe between parent and child ───────────────────────────────────── + +struct StatusPipe { + write_fd: RawFd, + read_fd: RawFd, +} + +impl StatusPipe { + fn new() -> std::io::Result { + unsafe extern "C" { + fn pipe2(pipefd: *mut i32, flags: i32) -> i32; + } + const O_CLOEXEC: i32 = 0o2_000_000; + let mut fds = [-1_i32; 2]; + let ret = unsafe { pipe2(fds.as_mut_ptr(), O_CLOEXEC) }; + if ret != 0 { + return Err(std::io::Error::last_os_error()); + } + Ok(Self { write_fd: fds[1], read_fd: fds[0] }) + } +} + +fn close_fd(fd: RawFd) { + unsafe extern "C" { + fn close(fd: i32) -> i32; + } + unsafe { close(fd) }; +} + +/// Drain `read_fd` into a `HardeningOutcome`. Wire format is the +/// 15-byte fixed-width record produced by [`encode_outcome`]. +fn drain_outcome(read_fd: RawFd) -> Option { + let mut file = unsafe { std::fs::File::from_raw_fd(read_fd) }; + let mut buf = Vec::with_capacity(64); + if file.read_to_end(&mut buf).is_err() { + return None; + } + decode_outcome(&buf) +} + +const OUTCOME_LEN: usize = 1 + 7 * 2; + +/// Decode a 15-byte hardening outcome record: +/// `[profile_tag, no_new_privs_tag, no_new_privs_errno_lo, +/// rlimit_cpu_tag, rlimit_cpu_errno_lo, ..., seccomp_tag, seccomp_errno_lo]` +/// All errnos are clamped to the low byte for the wire (true value is +/// recovered post-hoc from `errno`-symbolic context if needed). 
+fn decode_outcome(buf: &[u8]) -> Option { + if buf.len() < OUTCOME_LEN { + return None; + } + let profile = match buf[0] { + 1 => ProcessHardeningProfileTag::Strict, + _ => ProcessHardeningProfileTag::Standard, + }; + let mut idx = 1; + let mut next = || -> PrimitiveStatus { + let tag = buf[idx]; + let errno = buf[idx + 1] as i32; + idx += 2; + match tag { + 0 => PrimitiveStatus::Skipped, + 1 => PrimitiveStatus::Applied, + _ => PrimitiveStatus::Failed(if errno == 0 { -1 } else { errno }), + } + }; + let no_new_privs = next(); + let rlimit_cpu = next(); + let rlimit_nofile = next(); + let rlimit_as = next(); + let unshare = next(); + let chroot = next(); + let seccomp = next(); + Some(HardeningOutcome { + no_new_privs, + rlimit_cpu, + rlimit_nofile, + rlimit_as, + unshare, + chroot, + seccomp, + profile, + }) +} + +fn encode_outcome(out: &HardeningOutcome) -> [u8; OUTCOME_LEN] { + let mut buf = [0_u8; OUTCOME_LEN]; + buf[0] = match out.profile { + ProcessHardeningProfileTag::Standard => 0, + ProcessHardeningProfileTag::Strict => 1, + }; + let mut idx = 1; + for status in [ + out.no_new_privs, + out.rlimit_cpu, + out.rlimit_nofile, + out.rlimit_as, + out.unshare, + out.chroot, + out.seccomp, + ] { + let (tag, errno) = match status { + PrimitiveStatus::Skipped => (0_u8, 0_u8), + PrimitiveStatus::Applied => (1_u8, 0_u8), + PrimitiveStatus::Failed(e) => (2_u8, (e.unsigned_abs() & 0xff) as u8), + }; + buf[idx] = tag; + buf[idx + 1] = errno; + idx += 2; + } + buf +} + +// ── Primitive wrappers (called from the child's pre_exec) ──────────────────── + +const RLIMIT_CPU: i32 = 0; +const RLIMIT_NOFILE: i32 = 7; +const RLIMIT_AS: i32 = 9; + +const PR_SET_NO_NEW_PRIVS: i32 = 38; + +const CLONE_NEWNS: i32 = 0x0002_0000; +const CLONE_NEWUSER: i32 = 0x1000_0000; +const CLONE_NEWPID: i32 = 0x2000_0000; + +#[repr(C)] +struct Rlimit { + cur: u64, + max: u64, +} + +unsafe extern "C" { + fn setrlimit(resource: i32, rlim: *const Rlimit) -> i32; + fn prctl(option: i32, arg2: u64, arg3: 
u64, arg4: u64, arg5: u64) -> i32; + fn unshare(flags: i32) -> i32; + fn chroot(path: *const i8) -> i32; + fn chdir(path: *const i8) -> i32; + fn write(fd: i32, buf: *const u8, count: usize) -> isize; + fn __errno_location() -> *mut i32; +} + +fn last_errno() -> i32 { + unsafe { *__errno_location() } +} + +fn apply_rlimit(resource: i32, bytes: u64) -> PrimitiveStatus { + let rl = Rlimit { cur: bytes, max: bytes }; + let ret = unsafe { setrlimit(resource, &rl) }; + if ret == 0 { + PrimitiveStatus::Applied + } else { + PrimitiveStatus::Failed(last_errno()) + } +} + +fn apply_no_new_privs() -> PrimitiveStatus { + let ret = unsafe { prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) }; + if ret == 0 { + PrimitiveStatus::Applied + } else { + PrimitiveStatus::Failed(last_errno()) + } +} + +fn apply_unshare() -> PrimitiveStatus { + // CLONE_NEWUSER must come first on most modern kernels so the + // unprivileged caller can map uid/gid; CLONE_NEWPID + CLONE_NEWNS + // then succeed because the new user namespace owns them. + let flags = CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS; + let ret = unsafe { unshare(flags) }; + if ret == 0 { + PrimitiveStatus::Applied + } else { + PrimitiveStatus::Failed(last_errno()) + } +} + +fn apply_chroot(workdir: &[u8]) -> PrimitiveStatus { + // `workdir` is NUL-terminated by `canonicalize_workdir` so we can + // hand the bytes straight to `chroot(2)` without allocating in + // pre_exec. + let ret = unsafe { chroot(workdir.as_ptr() as *const i8) }; + if ret != 0 { + return PrimitiveStatus::Failed(last_errno()); + } + let root = b"/\0"; + let ret = unsafe { chdir(root.as_ptr() as *const i8) }; + if ret != 0 { + return PrimitiveStatus::Failed(last_errno()); + } + PrimitiveStatus::Applied +} + +/// Install a pre-compiled seccomp BPF filter on the calling thread. +/// +/// `program` is a heap-allocated BPF instruction array compiled in the +/// parent (`build_plan`) and shared via `Arc` so the child does not have +/// to allocate during pre_exec. 
+fn apply_seccomp(program: &[SockFilter]) -> PrimitiveStatus { + match seccomp::install_compiled_filter(program) { + Ok(()) => PrimitiveStatus::Applied, + Err(e) => PrimitiveStatus::Failed(e.raw_os_error().unwrap_or(-1)), + } +} + +// ── Pre-exec installer ─────────────────────────────────────────────────────── + +#[derive(Clone)] +struct PreExecPlan { + rlimit_cpu_seconds: u64, + rlimit_nofile: u64, + rlimit_as_bytes: u64, + workdir_nul: Vec, + /// Pre-compiled BPF program for the requested cap-bits. Built in + /// the parent so the child's pre_exec callback never touches the + /// allocator. + seccomp_program: Arc>, + profile: ProcessHardeningProfileTag, +} + +/// Returned by [`install_pre_exec`]. The caller MUST invoke either +/// [`OutcomeCollector::after_spawn`] or [`OutcomeCollector::forget`] +/// after `cmd.spawn()` returns — the parent's write-fd has to close so +/// the read end sees EOF and the drain thread terminates. +pub struct OutcomeCollector { + write_fd: RawFd, + read_fd: RawFd, +} + +/// Background-drain handle returned by [`OutcomeCollector::after_spawn`]. +/// `run_process` awaits this after `child.wait()` so the outcome is +/// guaranteed to be in the registry before the function returns; tests +/// that bypass `run_process` can call [`OutcomeJoiner::await_outcome`] +/// themselves. +pub struct OutcomeJoiner { + handle: Option>, +} + +impl OutcomeJoiner { + /// Block until the drain thread finishes recording the outcome. + pub fn await_outcome(mut self) { + if let Some(h) = self.handle.take() { + let _ = h.join(); + } + } +} + +impl Drop for OutcomeJoiner { + fn drop(&mut self) { + if let Some(h) = self.handle.take() { + let _ = h.join(); + } + } +} + +impl OutcomeCollector { + /// Call after `cmd.spawn()` returns `Ok`. 
Closes the parent's copy + /// of the write fd so the kernel ref-count drops to whatever the + /// child is still holding; once execve(2) closes the child's + /// O_CLOEXEC copy too, the read end sees EOF and the drain thread + /// records the outcome via [`record_outcome`]. Returns a join + /// handle the caller can await to know the outcome is settled. + pub fn after_spawn(self) -> OutcomeJoiner { + close_fd(self.write_fd); + let read_fd = self.read_fd; + let handle = std::thread::spawn(move || { + if let Some(outcome) = drain_outcome(read_fd) { + record_outcome(outcome); + } + }); + OutcomeJoiner { handle: Some(handle) } + } + + /// Call when `cmd.spawn()` failed. Closes both ends so neither fd + /// leaks; no outcome is recorded. + pub fn forget(self) { + close_fd(self.write_fd); + close_fd(self.read_fd); + } +} + +/// Install the Phase 17 hardening sequence on `cmd`. +/// +/// Returns `Some(collector)` when the status pipe was successfully +/// created; the caller must invoke +/// [`OutcomeCollector::after_spawn`] after a successful `cmd.spawn()`. +/// Returns `None` when pipe creation itself failed (rare: +/// `EMFILE`/`ENFILE`). In that case the pre_exec hook is still +/// installed — the child still gets the full hardening sequence — but +/// the per-primitive outcome cannot be reported back to the parent. +pub fn install_pre_exec( + cmd: &mut Command, + opts: &SandboxOptions, + workdir: &Path, +) -> Option { + let plan = build_plan(opts, workdir); + + let pipe = StatusPipe::new().ok(); + let write_fd = pipe.as_ref().map(|p| p.write_fd).unwrap_or(-1); + let read_fd = pipe.as_ref().map(|p| p.read_fd); + let plan_for_child = plan.clone(); + + // Safety: pre_exec runs after fork(2) and before execve(2). We must + // not allocate, take any locks, or call into the Rust runtime. The + // captured `plan_for_child` is moved in; reading its already-allocated + // fields is safe because no allocator call is needed. 
+ unsafe { + cmd.pre_exec(move || { + let outcome = run_pre_exec_in_child(&plan_for_child); + if write_fd >= 0 { + let bytes = encode_outcome(&outcome); + let _ = write(write_fd, bytes.as_ptr(), bytes.len()); + // execve(2) closes write_fd via O_CLOEXEC; no manual + // close needed here. + } + Ok(()) + }); + } + read_fd.map(|read_fd| OutcomeCollector { write_fd, read_fd }) +} + +fn run_pre_exec_in_child(plan: &PreExecPlan) -> HardeningOutcome { + let mut outcome = HardeningOutcome::default(); + outcome.profile = plan.profile; + + // ── Always-on: PR_SET_NO_NEW_PRIVS + RLIMIT_AS ─────────────────────── + outcome.no_new_privs = apply_no_new_privs(); + outcome.rlimit_as = apply_rlimit(RLIMIT_AS, plan.rlimit_as_bytes); + + if matches!(plan.profile, ProcessHardeningProfileTag::Standard) { + return outcome; + } + + // ── Strict profile: rlimits, unshare, chroot, seccomp ──────────────── + outcome.rlimit_cpu = apply_rlimit(RLIMIT_CPU, plan.rlimit_cpu_seconds); + outcome.rlimit_nofile = apply_rlimit(RLIMIT_NOFILE, plan.rlimit_nofile); + outcome.unshare = apply_unshare(); + outcome.chroot = apply_chroot(&plan.workdir_nul); + // seccomp is applied last so the filter does not block any of the + // earlier syscalls (setrlimit, prctl, unshare, chroot, chdir). + outcome.seccomp = apply_seccomp(plan.seccomp_program.as_slice()); + + outcome +} + +fn build_plan(opts: &SandboxOptions, workdir: &Path) -> PreExecPlan { + let memory_mib = opts.memory_mib; + let cap_mib = memory_mib.saturating_mul(8).max(4096); + let rlimit_as_bytes = cap_mib.saturating_mul(1024 * 1024); + + let timeout_secs = opts.timeout.as_secs().max(1); + let rlimit_cpu_seconds = timeout_secs.saturating_mul(2).max(2); + + let workdir_nul = canonicalize_workdir(workdir); + + // Pre-compile the BPF program in the parent so the pre_exec + // callback (which must not allocate) can hand it straight to + // `prctl(PR_SET_SECCOMP)`. 
+ let nrs = seccomp::allowed_syscall_numbers(opts.seccomp_caps); + let program = seccomp::bpf::compile(&nrs, seccomp::syscalls::AUDIT_ARCH); + + PreExecPlan { + rlimit_cpu_seconds, + rlimit_nofile: 256, + rlimit_as_bytes, + workdir_nul, + seccomp_program: Arc::new(program), + profile: match opts.process_hardening { + ProcessHardeningProfile::Standard => ProcessHardeningProfileTag::Standard, + ProcessHardeningProfile::Strict => ProcessHardeningProfileTag::Strict, + }, + } +} + +fn canonicalize_workdir(workdir: &Path) -> Vec { + let canonical: PathBuf = std::fs::canonicalize(workdir).unwrap_or_else(|_| workdir.to_path_buf()); + let mut bytes = canonical.into_os_string().into_encoded_bytes(); + if !bytes.ends_with(&[0]) { + bytes.push(0); + } + bytes +} + +// ── Tests ──────────────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn outcome_codec_round_trip_strict_full() { + let out = HardeningOutcome { + no_new_privs: PrimitiveStatus::Applied, + rlimit_cpu: PrimitiveStatus::Applied, + rlimit_nofile: PrimitiveStatus::Applied, + rlimit_as: PrimitiveStatus::Applied, + unshare: PrimitiveStatus::Applied, + chroot: PrimitiveStatus::Applied, + seccomp: PrimitiveStatus::Applied, + profile: ProcessHardeningProfileTag::Strict, + }; + let bytes = encode_outcome(&out); + let decoded = decode_outcome(&bytes).expect("decode"); + assert_eq!(decoded, out); + assert_eq!(decoded.level(), HardeningLevel::Full); + } + + #[test] + fn outcome_codec_round_trip_partial() { + let out = HardeningOutcome { + no_new_privs: PrimitiveStatus::Applied, + rlimit_cpu: PrimitiveStatus::Applied, + rlimit_nofile: PrimitiveStatus::Failed(13), + rlimit_as: PrimitiveStatus::Applied, + unshare: PrimitiveStatus::Failed(1), + chroot: PrimitiveStatus::Failed(13), + seccomp: PrimitiveStatus::Applied, + profile: ProcessHardeningProfileTag::Strict, + }; + let bytes = encode_outcome(&out); + let decoded = decode_outcome(&bytes).expect("decode"); + 
assert_eq!(decoded, out); + assert_eq!(decoded.level(), HardeningLevel::Partial); + } + + #[test] + fn standard_profile_reports_baseline_level() { + let out = HardeningOutcome { + no_new_privs: PrimitiveStatus::Applied, + rlimit_as: PrimitiveStatus::Applied, + profile: ProcessHardeningProfileTag::Standard, + ..HardeningOutcome::default() + }; + assert_eq!(out.level(), HardeningLevel::Baseline); + } + + #[test] + fn build_plan_pads_workdir_with_nul() { + let opts = SandboxOptions::default(); + let plan = build_plan(&opts, std::path::Path::new("/tmp")); + assert!(plan.workdir_nul.ends_with(&[0])); + assert_eq!(plan.profile, ProcessHardeningProfileTag::Standard); + } + + #[test] + fn build_plan_strict_compiles_seccomp_program() { + let opts = SandboxOptions { + seccomp_caps: 0xff, + process_hardening: ProcessHardeningProfile::Strict, + ..SandboxOptions::default() + }; + let plan = build_plan(&opts, std::path::Path::new("/tmp")); + // The arch check + ld nr + KILL + ALLOW alone are 5 instructions; + // the BASE allowlist adds dozens more. 
+ assert!(plan.seccomp_program.len() > 5, "BPF program too small: {}", plan.seccomp_program.len()); + assert_eq!(plan.profile, ProcessHardeningProfileTag::Strict); + } + + #[test] + fn rlimit_as_bytes_floors_at_4_gib() { + let opts = SandboxOptions { memory_mib: 1, ..SandboxOptions::default() }; + let plan = build_plan(&opts, std::path::Path::new("/tmp")); + assert_eq!(plan.rlimit_as_bytes, 4096_u64 * 1024 * 1024); + } + + #[test] + fn rlimit_as_bytes_scales_with_memory_mib() { + let opts = SandboxOptions { memory_mib: 1024, ..SandboxOptions::default() }; + let plan = build_plan(&opts, std::path::Path::new("/tmp")); + // 1024 MiB * 8 = 8192 MiB + assert_eq!(plan.rlimit_as_bytes, 8192_u64 * 1024 * 1024); + } + + #[test] + fn truncated_buffer_decodes_to_none() { + assert!(decode_outcome(&[]).is_none()); + assert!(decode_outcome(&[0_u8; OUTCOME_LEN - 1]).is_none()); + } + + #[test] + fn record_and_reset_round_trip() { + let original = last_hardening_outcome(); + let probe = HardeningOutcome { + no_new_privs: PrimitiveStatus::Applied, + profile: ProcessHardeningProfileTag::Strict, + ..HardeningOutcome::default() + }; + record_outcome(probe); + assert_eq!(last_hardening_outcome(), Some(probe)); + reset_last_hardening_outcome(); + assert!(last_hardening_outcome().is_none()); + if let Some(prev) = original { + record_outcome(prev); + } + } +} diff --git a/src/dynamic/sandbox/seccomp/bpf.rs b/src/dynamic/sandbox/seccomp/bpf.rs new file mode 100644 index 00000000..039b5f3d --- /dev/null +++ b/src/dynamic/sandbox/seccomp/bpf.rs @@ -0,0 +1,173 @@ +//! Hand-rolled BPF program emitter for seccomp filters. +//! +//! BPF instruction format from ``: +//! +//! ```text +//! struct sock_filter { u16 code; u8 jt; u8 jf; u32 k; } +//! ``` +//! +//! Only the ops Nyx needs to implement an AUDIT_ARCH check + per-syscall +//! allowlist are defined. The output array is fed straight into +//! `prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &program)`. 
/// One classic-BPF instruction, laid out like the kernel's
/// `struct sock_filter`.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SockFilter {
    pub code: u16,
    pub jt: u8,
    pub jf: u8,
    pub k: u32,
}

/// Program descriptor handed to `prctl(PR_SET_SECCOMP, ...)`, laid out like
/// the kernel's `struct sock_fprog`.
#[repr(C)]
pub struct SockFprog {
    pub len: u16,
    pub filter: *const SockFilter,
}

// BPF opcode constants — see `linux/bpf_common.h`.
pub const BPF_LD: u16 = 0x00;
pub const BPF_W: u16 = 0x00;
pub const BPF_ABS: u16 = 0x20;
pub const BPF_JMP: u16 = 0x05;
pub const BPF_JEQ: u16 = 0x10;
pub const BPF_K: u16 = 0x00;
pub const BPF_RET: u16 = 0x06;

// seccomp action constants — see `linux/seccomp.h`.
pub const SECCOMP_RET_KILL_PROCESS: u32 = 0x8000_0000;
pub const SECCOMP_RET_KILL: u32 = 0x0000_0000;
pub const SECCOMP_RET_ALLOW: u32 = 0x7fff_0000;
pub const SECCOMP_RET_ERRNO: u32 = 0x0005_0000;

// Offsets into `struct seccomp_data` from `linux/seccomp.h`:
//   nr   (s32) at offset 0
//   arch (u32) at offset 4
pub const SECCOMP_DATA_NR: u32 = 0;
pub const SECCOMP_DATA_ARCH: u32 = 4;

/// `ld [offset]` — load a 32-bit word of `seccomp_data` into the accumulator.
const fn load(offset: u32) -> SockFilter {
    SockFilter { code: BPF_LD | BPF_W | BPF_ABS, jt: 0, jf: 0, k: offset }
}

/// `jeq k` — skip `jt` instructions when the accumulator equals `k`, else `jf`.
const fn jeq(k: u32, jt: u8, jf: u8) -> SockFilter {
    SockFilter { code: BPF_JMP | BPF_JEQ | BPF_K, jt, jf, k }
}

/// `ret action` — terminate the filter with a seccomp action.
const fn ret(action: u32) -> SockFilter {
    SockFilter { code: BPF_RET | BPF_K, jt: 0, jf: 0, k: action }
}

/// Emit a BPF program implementing:
///
/// 1. Load `arch` from `seccomp_data`; if it does not match
///    `audit_arch`, return KILL_PROCESS.
/// 2. Load `nr` from `seccomp_data`.
/// 3. For each entry of `allowed_nrs`, return ALLOW when `nr` matches.
/// 4. Default: return KILL_PROCESS.
///
/// BPF conditional jumps carry 8-bit relative offsets, so two layouts are
/// used:
///
/// * Fewer than 255 entries: the compact layout
///   `ld arch; jeq arch; ld nr; jeq * N; ret KILL; ret ALLOW`
///   (`5 + N` instructions). Every `jeq` jumps forward to the shared ALLOW
///   return and the arch check jumps forward to the shared KILL return.
/// * 255 entries or more: those forward jumps would no longer fit in a
///   `u8`, so each check gets its own `ret ALLOW` right behind it and the
///   arch check gets its own `ret KILL` (`5 + 2N` instructions). No jump
///   is ever longer than one instruction.
///
/// Both layouts make identical allow/deny decisions. The length of the
/// returned program is not validated against the kernel's own limit here.
pub fn compile(allowed_nrs: &[u32], audit_arch: u32) -> Vec<SockFilter> {
    match u8::try_from(allowed_nrs.len()) {
        // `count + 1` (the arch-check offset) must still fit in a u8.
        Ok(count) if count < u8::MAX => compile_compact(allowed_nrs, audit_arch, count),
        _ => compile_paired(allowed_nrs, audit_arch),
    }
}

/// Compact layout; `count` is `allowed_nrs.len()` and is at most 254.
fn compile_compact(allowed_nrs: &[u32], audit_arch: u32, count: u8) -> Vec<SockFilter> {
    let mut program = Vec::with_capacity(allowed_nrs.len() + 5);

    program.push(load(SECCOMP_DATA_ARCH));
    // On mismatch skip `ld nr` plus every check to land on `ret KILL`.
    program.push(jeq(audit_arch, 0, count + 1));
    program.push(load(SECCOMP_DATA_NR));

    // The i-th check has `count - i - 1` checks and the KILL return between
    // itself and ALLOW, i.e. it jumps `count - i` instructions forward.
    for (jt, &nr) in (1..=count).rev().zip(allowed_nrs) {
        program.push(jeq(nr, jt, 0));
    }

    program.push(ret(SECCOMP_RET_KILL_PROCESS));
    program.push(ret(SECCOMP_RET_ALLOW));
    program
}

/// Short-jump layout for allowlists too long for 8-bit forward offsets.
fn compile_paired(allowed_nrs: &[u32], audit_arch: u32) -> Vec<SockFilter> {
    let mut program = Vec::with_capacity(2 * allowed_nrs.len() + 5);

    program.push(load(SECCOMP_DATA_ARCH));
    // Match: hop over the KILL return. Mismatch: fall into it.
    program.push(jeq(audit_arch, 1, 0));
    program.push(ret(SECCOMP_RET_KILL_PROCESS));
    program.push(load(SECCOMP_DATA_NR));

    for &nr in allowed_nrs {
        // Match: fall into the ALLOW return. Mismatch: hop over it.
        program.push(jeq(nr, 0, 1));
        program.push(ret(SECCOMP_RET_ALLOW));
    }

    program.push(ret(SECCOMP_RET_KILL_PROCESS));
    program
}
b/src/dynamic/sandbox/seccomp/mod.rs @@ -0,0 +1,179 @@ +//! Phase 17 (Track E.1) — seccomp-bpf default-deny filter. +//! +//! [`apply_for_caps`] composes the cap-tagged allowlist baked from +//! `seccomp_policy.toml` (via `build.rs`) into a BPF program and installs +//! it via `prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &program)`. The +//! filter is per-thread and inherited across `execve`, so the harness +//! runs under it from the very first instruction of its image. +//! +//! Layout +//! ------ +//! - `seccomp_policy.toml` — declarative cap → syscall table (the source +//! of truth). `build.rs` parses it and emits an inline-includable Rust +//! table to `OUT_DIR/seccomp_policy.rs`. +//! - `bpf.rs` — minimal BPF instruction emitter (`compile()` returns a +//! `Vec`). +//! - `syscalls.rs` — name → number map, x86_64 / aarch64. +//! +//! Design choices +//! -------------- +//! - Default action is `SECCOMP_RET_KILL_PROCESS` so a denied syscall +//! takes the whole harness down (loud failure, easy to tell apart from +//! a normal sink hit). +//! - Unknown syscall names from the policy are silently dropped — they +//! can't be filtered without a number, and any kernel that recognises +//! the name has the number too. Tests assert the policy round-trips. + +pub mod bpf; +pub mod syscalls; + +use std::collections::BTreeSet; + +use crate::dynamic::sandbox::seccomp::bpf::{compile, SockFilter, SockFprog}; +use crate::dynamic::sandbox::seccomp::syscalls::{syscall_number, AUDIT_ARCH}; + +include!(concat!(env!("OUT_DIR"), "/seccomp_policy.rs")); + +const PR_SET_NO_NEW_PRIVS: i32 = 38; +const PR_SET_SECCOMP: i32 = 22; +const SECCOMP_MODE_FILTER: u64 = 2; + +unsafe extern "C" { + fn prctl(option: i32, arg2: u64, arg3: u64, arg4: u64, arg5: u64) -> i32; + fn __errno_location() -> *mut i32; +} + +/// Compose the cap-aware syscall allowlist: the `BASE` set unconditionally +/// + every `CAP[i]` whose bit is set in `caps`. 
Names are deduped via a +/// `BTreeSet` and resolved to numbers via [`syscall_number`]. Unknown +/// names (not in the per-arch table) are silently dropped. +pub fn allowed_syscall_numbers(caps: u32) -> Vec { + let mut names: BTreeSet<&'static str> = BTreeSet::new(); + for &n in BASE.iter() { + names.insert(n); + } + for &(bit, allowlist) in CAP.iter() { + if caps & bit != 0 { + for &n in allowlist.iter() { + names.insert(n); + } + } + } + let mut nrs: Vec = names.into_iter().filter_map(syscall_number).collect(); + nrs.sort_unstable(); + nrs.dedup(); + nrs +} + +/// Install a pre-compiled seccomp filter on the calling thread. +/// +/// `program` MUST come from [`bpf::compile`]. Calls +/// `prctl(PR_SET_NO_NEW_PRIVS)` first (a kernel prerequisite for +/// unprivileged seccomp filter install) then +/// `prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)`. Returns the +/// underlying `io::Error` on failure. +/// +/// Allocator-free: the function only borrows `program`, so the +/// hardening pre_exec callback can use it without violating the +/// post-fork allocator ban. +pub fn install_compiled_filter(program: &[SockFilter]) -> std::io::Result<()> { + if AUDIT_ARCH == 0 || program.is_empty() { + return Ok(()); + } + + // PR_SET_NO_NEW_PRIVS = 1 is a kernel prerequisite for unprivileged + // seccomp filter install. The Phase 17 hardening sequence already + // calls it earlier, but installing here too is idempotent and + // protects direct callers. + let _ = unsafe { prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) }; + + let prog = SockFprog { + len: program.len() as u16, + filter: program.as_ptr(), + }; + let ret = unsafe { + prctl( + PR_SET_SECCOMP, + SECCOMP_MODE_FILTER, + &prog as *const SockFprog as u64, + 0, + 0, + ) + }; + if ret == 0 { + Ok(()) + } else { + Err(std::io::Error::from_raw_os_error(unsafe { + *__errno_location() + })) + } +} + +/// Convenience wrapper: compose the cap-aware allowlist via +/// [`allowed_syscall_numbers`], compile a BPF program, and install it. 
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn base_table_is_non_empty() {
        assert!(!BASE.is_empty(), "seccomp BASE allowlist must include stdio + startup syscalls");
    }

    /// The build script emits one `CAP` entry per `[cap.X]` table. The
    /// exact count can grow with the policy; assert ≥ 4 so an accidental
    /// empty-policy regression is loud.
    #[test]
    fn cap_table_includes_known_caps() {
        assert!(CAP.len() >= 4, "CAP table emitted: {:?}", CAP.len());
    }

    /// Several caps allow `socket`. Enabling all of them together must
    /// still yield a strictly ascending (hence duplicate-free) list that
    /// contains `socket` exactly once.
    #[test]
    fn allowlist_deduplicates_overlapping_caps() {
        // Derive the overlapping caps from the generated table instead of
        // hard-coding bit values.
        let sharing_socket: Vec<u32> = CAP
            .iter()
            .filter(|(_, names)| names.contains(&"socket"))
            .map(|&(bit, _)| bit)
            .collect();
        assert!(
            sharing_socket.len() >= 2,
            "expected at least two caps to allow `socket`, found {}",
            sharing_socket.len()
        );
        let caps = sharing_socket.iter().fold(0_u32, |acc, bit| acc | bit);

        let nrs = allowed_syscall_numbers(caps);
        assert!(
            nrs.windows(2).all(|pair| pair[0] < pair[1]),
            "allowlist must be sorted and free of duplicates: {nrs:?}"
        );
        if let Some(socket) = syscall_number("socket") {
            assert_eq!(nrs.iter().filter(|&&nr| nr == socket).count(), 1);
        }
    }

    /// With no cap bit set the allowlist is exactly the resolvable `BASE`
    /// names, and enabling caps only ever adds to it.
    #[test]
    fn caps_zero_returns_only_base() {
        let base = allowed_syscall_numbers(0);
        let expected: std::collections::BTreeSet<u32> =
            BASE.iter().filter_map(|name| syscall_number(name)).collect();
        assert_eq!(base, expected.into_iter().collect::<Vec<_>>());

        let with_caps = allowed_syscall_numbers(0xffff_ffff);
        assert!(base.len() <= with_caps.len());
        assert!(base.iter().all(|nr| with_caps.contains(nr)));
    }

    /// `BASE` includes `read` / `write` / `close` — the minimum the
    /// harness needs to print to stdout and exit cleanly.
    #[test]
    fn base_allows_stdio() {
        let nrs = allowed_syscall_numbers(0);
        let read = syscall_number("read").expect("read in syscall map");
        let write = syscall_number("write").expect("write in syscall map");
        let close = syscall_number("close").expect("close in syscall map");
        assert!(nrs.contains(&read));
        assert!(nrs.contains(&write));
        assert!(nrs.contains(&close));
    }
}
+ +[base] +allow = [ + "read", + "write", + "writev", + "readv", + "close", + "fstat", + "lseek", + "lstat", + "stat", + "newfstatat", + "statx", + "mmap", + "mremap", + "munmap", + "brk", + "rt_sigaction", + "rt_sigreturn", + "rt_sigprocmask", + "sigaltstack", + "exit", + "exit_group", + "futex", + "set_robust_list", + "get_robust_list", + "getrandom", + "getpid", + "gettid", + "getuid", + "geteuid", + "getgid", + "getegid", + "clock_gettime", + "clock_getres", + "clock_nanosleep", + "nanosleep", + "ioctl", + "fcntl", + "dup", + "dup2", + "dup3", + "pipe", + "pipe2", + "uname", + "arch_prctl", + "prlimit64", + "getrlimit", + "set_tid_address", + "rseq", + "madvise", + "mprotect", + "epoll_create1", + "epoll_ctl", + "epoll_wait", + "epoll_pwait", + "poll", + "ppoll", + "select", + "pselect6", + "wait4", + "waitid", + "tgkill", + "kill", + "openat", + "open", + "access", + "faccessat", + "faccessat2", + "readlink", + "readlinkat", + "getcwd", + "getdents", + "getdents64", + "sched_getaffinity", + "sched_setaffinity", + "sched_yield", + "prctl", + "membarrier", +] + +[cap.SQL_QUERY] +# SQLite / driver paths use lock + truncate + sync ops on top of the base +# openat / read / write set. +allow = [ + "fdatasync", + "fsync", + "fallocate", + "ftruncate", + "flock", + "pread64", + "pwrite64", +] + +[cap.FILE_IO] +# File reads + directory walks need the dirfd / xattr / link family on +# top of the base set. +allow = [ + "pread64", + "pwrite64", + "readlinkat", + "linkat", + "symlinkat", + "unlinkat", + "mkdirat", + "renameat", + "renameat2", + "utimensat", + "fchmod", + "fchown", + "fchmodat", + "fchownat", + "getxattr", + "fgetxattr", + "lgetxattr", + "listxattr", + "flistxattr", + "llistxattr", + "copy_file_range", + "sendfile", +] + +[cap.SSRF] +# Outbound HTTP needs the socket / connect / TLS handshake set. 
+allow = [ + "socket", + "connect", + "sendto", + "recvfrom", + "sendmsg", + "recvmsg", + "shutdown", + "getsockname", + "getpeername", + "getsockopt", + "setsockopt", + "bind", + "listen", + "accept", + "accept4", +] + +[cap.CODE_EXEC] +# `subprocess.run(...)` / `os.system(...)` payloads need fork + exec. +allow = [ + "clone", + "clone3", + "fork", + "vfork", + "execve", + "execveat", + "wait4", + "waitid", +] + +[cap.HTML_ESCAPE] +# Pure-CPU sanitizer paths need only the base set; this entry exists so +# the build-time codegen sees the cap and emits an explicit table even +# when the allowlist is empty. +allow = [] + +[cap.DESERIALIZE] +# pickle / Marshal / unserialize paths typically only need the base I/O +# set; codegen-only entry. +allow = [] + +[cap.HEADER_INJECTION] +# CRLF-sensitive header sinks share the SSRF socket family. +allow = [ + "socket", + "connect", + "sendto", + "recvfrom", + "sendmsg", + "recvmsg", + "getsockname", + "getpeername", + "getsockopt", + "setsockopt", +] + +[cap.OPEN_REDIRECT] +allow = [ + "socket", + "connect", + "sendto", + "recvfrom", + "sendmsg", + "recvmsg", + "getsockname", + "getpeername", + "getsockopt", + "setsockopt", +] diff --git a/src/dynamic/sandbox/seccomp/syscalls.rs b/src/dynamic/sandbox/seccomp/syscalls.rs new file mode 100644 index 00000000..a2147582 --- /dev/null +++ b/src/dynamic/sandbox/seccomp/syscalls.rs @@ -0,0 +1,291 @@ +//! Syscall name → number map for the architectures Nyx's Linux process +//! backend supports. Only the names referenced by +//! `seccomp_policy.toml` need to be present; unknown names are silently +//! dropped from the BPF allowlist (they cannot be filtered if they have +//! no number). +//! +//! Numbers are pulled from `` (x86_64) and +//! `` (aarch64). When a syscall exists on one +//! arch but not the other (e.g. `arch_prctl` on aarch64), the entry is +//! omitted on the missing arch and the seccomp filter naturally falls +//! through to the deny rule there. 
/// Resolve a syscall name to its x86_64 number; `None` for names the table
/// does not list. Arms are kept in ascending numeric order.
#[cfg(target_arch = "x86_64")]
pub fn syscall_number(name: &str) -> Option<u32> {
    let nr = match name {
        "read" => 0,
        "write" => 1,
        "open" => 2,
        "close" => 3,
        "stat" => 4,
        "fstat" => 5,
        "lstat" => 6,
        "poll" => 7,
        "lseek" => 8,
        "mmap" => 9,
        "mprotect" => 10,
        "munmap" => 11,
        "brk" => 12,
        "rt_sigaction" => 13,
        "rt_sigprocmask" => 14,
        "rt_sigreturn" => 15,
        "ioctl" => 16,
        "pread64" => 17,
        "pwrite64" => 18,
        "readv" => 19,
        "writev" => 20,
        "access" => 21,
        "pipe" => 22,
        "select" => 23,
        "sched_yield" => 24,
        "mremap" => 25,
        "madvise" => 28,
        "dup" => 32,
        "dup2" => 33,
        "nanosleep" => 35,
        "getpid" => 39,
        "sendfile" => 40,
        "socket" => 41,
        "connect" => 42,
        "accept" => 43,
        "sendto" => 44,
        "recvfrom" => 45,
        "sendmsg" => 46,
        "recvmsg" => 47,
        "shutdown" => 48,
        "bind" => 49,
        "listen" => 50,
        "getsockname" => 51,
        "getpeername" => 52,
        "setsockopt" => 54,
        "getsockopt" => 55,
        "clone" => 56,
        "fork" => 57,
        "vfork" => 58,
        "execve" => 59,
        "exit" => 60,
        "wait4" => 61,
        "kill" => 62,
        "uname" => 63,
        "fcntl" => 72,
        "flock" => 73,
        "fsync" => 74,
        "fdatasync" => 75,
        "ftruncate" => 77,
        "getdents" => 78,
        "getcwd" => 79,
        "readlink" => 89,
        "fchmod" => 91,
        "fchown" => 93,
        "getrlimit" => 97,
        "getuid" => 102,
        "getgid" => 104,
        "geteuid" => 107,
        "getegid" => 108,
        "sigaltstack" => 131,
        "prctl" => 157,
        "arch_prctl" => 158,
        "gettid" => 186,
        "getxattr" => 191,
        "lgetxattr" => 192,
        "fgetxattr" => 193,
        "listxattr" => 194,
        "llistxattr" => 195,
        "flistxattr" => 196,
        "futex" => 202,
        "sched_setaffinity" => 203,
        "sched_getaffinity" => 204,
        "epoll_create" => 213,
        "getdents64" => 217,
        "set_tid_address" => 218,
        "fadvise64" => 221,
        "clock_gettime" => 228,
        "clock_getres" => 229,
        "clock_nanosleep" => 230,
        "exit_group" => 231,
        "epoll_wait" => 232,
        "epoll_ctl" => 233,
        "tgkill" => 234,
        "waitid" => 247,
        "openat" => 257,
        "mkdirat" => 258,
        "fchownat" => 260,
        "newfstatat" => 262,
        "unlinkat" => 263,
        "renameat" => 264,
        "linkat" => 265,
        "symlinkat" => 266,
        "readlinkat" => 267,
        "fchmodat" => 268,
        "faccessat" => 269,
        "pselect6" => 270,
        "ppoll" => 271,
        "set_robust_list" => 273,
        "get_robust_list" => 274,
        "utimensat" => 280,
        "epoll_pwait" => 281,
        "fallocate" => 285,
        "accept4" => 288,
        "epoll_create1" => 291,
        "dup3" => 292,
        "pipe2" => 293,
        "prlimit64" => 302,
        "renameat2" => 316,
        "getrandom" => 318,
        "execveat" => 322,
        "membarrier" => 324,
        "copy_file_range" => 326,
        "statx" => 332,
        "rseq" => 334,
        "clone3" => 435,
        "faccessat2" => 439,
        _ => return None,
    };
    Some(nr)
}

/// Resolve a syscall name to its aarch64 (asm-generic) number; `None` for
/// names the table does not list. Arms are kept in ascending numeric order.
#[cfg(target_arch = "aarch64")]
pub fn syscall_number(name: &str) -> Option<u32> {
    let nr = match name {
        "io_setup" => 0,
        "getxattr" => 8,
        "lgetxattr" => 9,
        "fgetxattr" => 10,
        "listxattr" => 11,
        "llistxattr" => 12,
        "flistxattr" => 13,
        "getcwd" => 17,
        "lookup_dcookie" => 18,
        "eventfd2" => 19,
        // `epoll_create` is aliased to epoll_create1 on the generic table.
        "epoll_create1" | "epoll_create" => 20,
        "epoll_ctl" => 21,
        // `epoll_wait` is aliased to epoll_pwait on the generic table.
        "epoll_pwait" | "epoll_wait" => 22,
        "dup" => 23,
        "dup3" => 24,
        "fcntl" => 25,
        "ioctl" => 29,
        "flock" => 32,
        "mkdirat" => 34,
        "unlinkat" => 35,
        "symlinkat" => 36,
        "linkat" => 37,
        "renameat" => 38,
        "ftruncate" => 46,
        "fallocate" => 47,
        "faccessat" => 48,
        "chdir" => 49,
        "fchmod" => 52,
        "fchmodat" => 53,
        "fchownat" => 54,
        "fchown" => 55,
        "openat" => 56,
        "close" => 57,
        "pipe2" => 59,
        "getdents64" => 61,
        "lseek" => 62,
        "read" => 63,
        "write" => 64,
        "readv" => 65,
        "writev" => 66,
        "pread64" => 67,
        "pwrite64" => 68,
        "sendfile" => 71,
        "pselect6" => 72,
        "ppoll" => 73,
        "readlinkat" => 78,
        "newfstatat" => 79,
        "fstat" => 80,
        "fsync" => 82,
        "fdatasync" => 83,
        "utimensat" => 88,
        "exit" => 93,
        "exit_group" => 94,
        "waitid" => 95,
        "set_tid_address" => 96,
        "futex" => 98,
        "set_robust_list" => 99,
        "get_robust_list" => 100,
        "nanosleep" => 101,
        "clock_gettime" => 113,
        "clock_getres" => 114,
        "clock_nanosleep" => 115,
        "sched_setaffinity" => 122,
        "sched_getaffinity" => 123,
        "sched_yield" => 124,
        "kill" => 129,
        "tgkill" => 131,
        "sigaltstack" => 132,
        "rt_sigsuspend" => 133,
        "rt_sigaction" => 134,
        "rt_sigprocmask" => 135,
        "rt_sigtimedwait" => 137,
        "rt_sigreturn" => 139,
        "uname" => 160,
        "getrlimit" => 163,
        "prctl" => 167,
        "getpid" => 172,
        "getuid" => 174,
        "geteuid" => 175,
        "getgid" => 176,
        "getegid" => 177,
        "gettid" => 178,
        "socket" => 198,
        "bind" => 200,
        "listen" => 201,
        "accept" => 202,
        "connect" => 203,
        "getsockname" => 204,
        "getpeername" => 205,
        "sendto" => 206,
        "recvfrom" => 207,
        "setsockopt" => 208,
        "getsockopt" => 209,
        "shutdown" => 210,
        "sendmsg" => 211,
        "recvmsg" => 212,
        "brk" => 214,
        "munmap" => 215,
        "mremap" => 216,
        "clone" => 220,
        "execve" => 221,
        "mmap" => 222,
        "fadvise64" => 223,
        "mprotect" => 226,
        "msync" => 227,
        "mlock" => 228,
        "munlock" => 229,
        "madvise" => 233,
        "accept4" => 242,
        "wait4" => 260,
        "prlimit64" => 261,
        "renameat2" => 276,
        "getrandom" => 278,
        "execveat" => 281,
        "membarrier" => 283,
        "copy_file_range" => 285,
        "statx" => 291,
        "rseq" => 293,
        "clone3" => 435,
        "openat2" => 437,
        "faccessat2" => 439,
        "epoll_pwait2" => 441,
        _ => return None,
    };
    Some(nr)
}

/// Architectures without a table resolve nothing.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
pub fn syscall_number(_name: &str) -> Option<u32> {
    None
}
+#[cfg(target_arch = "x86_64")] +pub const AUDIT_ARCH: u32 = 0xc000_003e; +#[cfg(target_arch = "aarch64")] +pub const AUDIT_ARCH: u32 = 0xc000_00b7; +#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] +pub const AUDIT_ARCH: u32 = 0; diff --git a/tests/dynamic_fixtures/hardening/probe.c b/tests/dynamic_fixtures/hardening/probe.c new file mode 100644 index 00000000..da120dbf --- /dev/null +++ b/tests/dynamic_fixtures/hardening/probe.c @@ -0,0 +1,124 @@ +/* + * Phase 17 (Track E.1) — process-backend hardening probe. + * + * Linked statically (no glibc dynamic loader needed) so it runs after + * `chroot(workdir)` strips access to /usr/lib. Reads its own + * `/proc/self` view to determine which Phase 17 primitives applied, + * then prints a structured `key:value` line per primitive. The Rust + * test reads stdout and asserts on each line. + * + * The probe is also reused by the path-traversal case: when + * `argv[1] == "traverse"` it tries to open `/etc/passwd` and reports + * either `chroot blocked` (open failed) or `chroot escaped` (open + * succeeded, host file visible). + * + * Built at test runtime with `cc -static -O2 -o probe probe.c`. Test + * skips with an eprintln! when the host has no `cc` or no static glibc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static void grep_status(const char *needle, const char *fallback) { + FILE *f = fopen("/proc/self/status", "r"); + if (!f) { + printf("%s%s\n", needle, fallback); + return; + } + char line[512]; + int found = 0; + while (fgets(line, sizeof(line), f)) { + if (strncmp(line, needle, strlen(needle)) == 0) { + // Strip trailing newline. 
/*
 * Print one line per hardening primitive, then either run the
 * path-traversal case (`argv[1] == "traverse"`) or print the
 * `__NYX_PROBE_DONE__` sentinel.
 *
 * Exit status: 0 on a normal run or when the traversal escaped,
 * 7 when the traversal was blocked.
 */
int main(int argc, char **argv) {
    grep_status("NoNewPrivs:", "\t?");
    grep_status("Seccomp:", "\t?");
    print_rlimit("rlimit_as", RLIMIT_AS);
    print_rlimit("rlimit_cpu", RLIMIT_CPU);
    print_rlimit("rlimit_nofile", RLIMIT_NOFILE);
    probe_namespaces();
    probe_chroot();

    if (argc > 1 && strcmp(argv[1], "traverse") == 0) {
        // Path-traversal acceptance case: a payload that tries to read
        // /etc/passwd outside the workdir. Exit non-zero when the open is
        // refused; the `chroot:blocked ...` line printed by probe_chroot()
        // above and the `traverse:blocked` line below report the same
        // outcome on stdout.
        int fd = open("/etc/passwd", O_RDONLY);
        if (fd >= 0) {
            // chroot did not hold — exit 0 to signal escape (test fails).
            // Release the descriptor instead of leaking it.
            close(fd);
            printf("traverse:escaped\n");
            return 0;
        }
        printf("traverse:blocked\n");
        return 7;
    }

    printf("__NYX_PROBE_DONE__\n");
    return 0;
}
`cargo nextest run --features dynamic --test sandbox_hardening_linux` + +#[cfg(all(feature = "dynamic", target_os = "linux"))] +mod hardening_tests { + use std::path::{Path, PathBuf}; + use std::process::Command; + use std::sync::OnceLock; + use std::time::Duration; + + use nyx_scanner::dynamic::harness::BuiltHarness; + use nyx_scanner::dynamic::sandbox::process_linux::{ + last_hardening_outcome, reset_last_hardening_outcome, HardeningLevel, PrimitiveStatus, + }; + use nyx_scanner::dynamic::sandbox::seccomp; + use nyx_scanner::dynamic::sandbox::{ + self, ProcessHardeningProfile, SandboxBackend, SandboxOptions, + }; + + // ── Probe build ─────────────────────────────────────────────────────────── + + /// Path to the freshly-built probe binary, shared across every test. + static PROBE_BINARY: OnceLock> = OnceLock::new(); + + fn probe_path() -> Option<&'static Path> { + PROBE_BINARY + .get_or_init(|| build_probe_once()) + .as_deref() + } + + fn build_probe_once() -> Option { + let cc = std::env::var("CC").unwrap_or_else(|_| "cc".to_owned()); + let src = PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests/dynamic_fixtures/hardening/probe.c"); + let out_dir = std::env::temp_dir().join("nyx-hardening-probe"); + let _ = std::fs::create_dir_all(&out_dir); + let out_bin = out_dir.join("probe"); + + // Try a static link first (works under glibc-dev with libc.a, or + // musl-cross). Fall back to dynamic if that fails — the probe + // still functions before chroot but the chroot test will skip. + let static_status = Command::new(&cc) + .args(["-static", "-O2", "-o"]) + .arg(&out_bin) + .arg(&src) + .status(); + if matches!(&static_status, Ok(s) if s.success()) { + return Some(out_bin); + } + + let dyn_status = Command::new(&cc) + .args(["-O2", "-o"]) + .arg(&out_bin) + .arg(&src) + .status(); + if matches!(&dyn_status, Ok(s) if s.success()) { + // Mark via env so the chroot test can branch. 
+ unsafe { std::env::set_var("NYX_PROBE_DYNAMIC", "1") }; + return Some(out_bin); + } + + eprintln!( + "SKIP: could not build hardening probe with {cc:?} (static={static_status:?}, \ + dyn={dyn_status:?})" + ); + None + } + + fn probe_is_static() -> bool { + std::env::var_os("NYX_PROBE_DYNAMIC").is_none() + } + + // ── Sandbox helpers ─────────────────────────────────────────────────────── + + fn strict_opts() -> SandboxOptions { + SandboxOptions { + timeout: Duration::from_secs(10), + memory_mib: 256, + backend: SandboxBackend::Process, + output_limit: 65536, + process_hardening: ProcessHardeningProfile::Strict, + // Keep seccomp_caps = 0 so only the BASE allowlist applies: + // the probe needs `read`, `write`, `openat`, `readlink`, etc., + // all of which are in the base set. + seccomp_caps: 0, + ..SandboxOptions::default() + } + } + + fn standard_opts() -> SandboxOptions { + SandboxOptions { + timeout: Duration::from_secs(10), + memory_mib: 256, + backend: SandboxBackend::Process, + output_limit: 65536, + process_hardening: ProcessHardeningProfile::Standard, + ..SandboxOptions::default() + } + } + + fn build_harness_with_probe(workdir: &Path, args: &[&str]) -> BuiltHarness { + // Stage the probe inside the workdir so `chroot(workdir)` doesn't + // leave the binary unreachable mid-exec. + let probe_src = probe_path().expect("probe must be built").to_path_buf(); + let probe_dst = workdir.join("probe"); + std::fs::copy(&probe_src, &probe_dst).expect("copy probe into workdir"); + // Ensure it's executable (cc preserves +x but be explicit). 
+ use std::os::unix::fs::PermissionsExt; + let mut perms = std::fs::metadata(&probe_dst).unwrap().permissions(); + perms.set_mode(0o755); + std::fs::set_permissions(&probe_dst, perms).unwrap(); + + let mut command: Vec = vec![probe_dst.to_string_lossy().into_owned()]; + for a in args { + command.push((*a).to_string()); + } + + BuiltHarness { + workdir: workdir.to_path_buf(), + command, + env: vec![], + source: String::new(), + entry_source: String::new(), + } + } + + fn workdir() -> tempfile::TempDir { + tempfile::TempDir::new().expect("temp dir") + } + + fn stdout_string(out: &sandbox::SandboxOutcome) -> String { + String::from_utf8_lossy(&out.stdout).into_owned() + } + + fn assert_line(stdout: &str, prefix: &str) { + assert!( + stdout.lines().any(|l| l.starts_with(prefix)), + "expected stdout to contain a line starting with {prefix:?}; full stdout:\n{stdout}" + ); + } + + // ── Tests ───────────────────────────────────────────────────────────────── + + /// Sanity gate: the probe must build and run on a Confirmed + /// (exit-zero) baseline. All other tests presume this passes. + #[test] + fn probe_runs_under_strict_profile() { + let Some(_) = probe_path() else { return }; + let tmp = workdir(); + let harness = build_harness_with_probe(tmp.path(), &[]); + let opts = strict_opts(); + reset_last_hardening_outcome(); + let result = sandbox::run(&harness, b"", &opts).expect("sandbox::run"); + let stdout = stdout_string(&result); + eprintln!("probe stdout under strict:\n{stdout}"); + // Probe always prints a `__NYX_PROBE_DONE__` sentinel after the + // primitive lines; absence means the binary died before reaching + // the end (e.g. seccomp killed it). A clean Confirmed run prints + // it. 
+ assert_line(&stdout, "__NYX_PROBE_DONE__"); + } + + #[test] + fn no_new_privs_set_under_strict() { + let Some(_) = probe_path() else { return }; + let tmp = workdir(); + let harness = build_harness_with_probe(tmp.path(), &[]); + let opts = strict_opts(); + let result = sandbox::run(&harness, b"", &opts).expect("sandbox::run"); + let stdout = stdout_string(&result); + // /proc/self/status's `NoNewPrivs:` line is `1` after PR_SET_NO_NEW_PRIVS. + assert!( + stdout.contains("NoNewPrivs:\t1"), + "expected NoNewPrivs:1 line; full stdout:\n{stdout}" + ); + } + + #[test] + fn rlimit_cpu_capped_under_strict() { + let Some(_) = probe_path() else { return }; + let tmp = workdir(); + let harness = build_harness_with_probe(tmp.path(), &[]); + let opts = strict_opts(); + let result = sandbox::run(&harness, b"", &opts).expect("sandbox::run"); + let stdout = stdout_string(&result); + // RLIMIT_CPU is set to timeout * 2 = 20 seconds in strict_opts. + // Under Standard the value would be RLIM_INFINITY. 
+ assert_line(&stdout, "rlimit_cpu:"); + for line in stdout.lines() { + if let Some(rest) = line.strip_prefix("rlimit_cpu:") { + let (cur, _) = rest.split_once('/').expect("rlimit_cpu format"); + let cur: u64 = cur.parse().expect("numeric rlimit"); + assert!(cur <= 30, "RLIMIT_CPU not capped: {cur}"); + return; + } + } + panic!("rlimit_cpu line missing from stdout:\n{stdout}"); + } + + #[test] + fn rlimit_nofile_capped_under_strict() { + let Some(_) = probe_path() else { return }; + let tmp = workdir(); + let harness = build_harness_with_probe(tmp.path(), &[]); + let opts = strict_opts(); + let result = sandbox::run(&harness, b"", &opts).expect("sandbox::run"); + let stdout = stdout_string(&result); + for line in stdout.lines() { + if let Some(rest) = line.strip_prefix("rlimit_nofile:") { + let (cur, _) = rest.split_once('/').expect("rlimit_nofile format"); + let cur: u64 = cur.parse().expect("numeric rlimit"); + assert!(cur <= 256, "RLIMIT_NOFILE not capped: {cur}"); + return; + } + } + panic!("rlimit_nofile line missing from stdout:\n{stdout}"); + } + + #[test] + fn rlimit_as_capped_under_strict() { + let Some(_) = probe_path() else { return }; + let tmp = workdir(); + let harness = build_harness_with_probe(tmp.path(), &[]); + let opts = strict_opts(); + let result = sandbox::run(&harness, b"", &opts).expect("sandbox::run"); + let stdout = stdout_string(&result); + for line in stdout.lines() { + if let Some(rest) = line.strip_prefix("rlimit_as:") { + let (cur, _) = rest.split_once('/').expect("rlimit_as format"); + let cur: u64 = cur.parse().expect("numeric rlimit"); + // memory_mib=256 → cap = max(256*8, 4096) MiB = 4 GiB + let four_gib = 4_u64 * 1024 * 1024 * 1024; + assert_eq!(cur, four_gib, "RLIMIT_AS not 4 GiB: {cur}"); + return; + } + } + panic!("rlimit_as line missing from stdout:\n{stdout}"); + } + + /// `unshare(CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWNS)` is best-effort. 
+ /// On hosts that allow unprivileged user namespaces the probe's + /// `/proc/self/ns/user` inode differs from the parent's; on locked- + /// down hosts (sysctl `kernel.unprivileged_userns_clone=0`) the + /// outcome decays to `Partial` instead of failing the run. + #[test] + fn unshare_namespaces_when_kernel_allows() { + let Some(_) = probe_path() else { return }; + let tmp = workdir(); + let harness = build_harness_with_probe(tmp.path(), &[]); + let opts = strict_opts(); + reset_last_hardening_outcome(); + let result = sandbox::run(&harness, b"", &opts).expect("sandbox::run"); + let stdout = stdout_string(&result); + let outcome = last_hardening_outcome().expect("hardening outcome recorded"); + + // Parent's user-ns inode for comparison. + let parent_user_ns = + std::fs::read_link("/proc/self/ns/user").map(|p| p.to_string_lossy().into_owned()); + + match outcome.unshare { + PrimitiveStatus::Applied => { + let probe_user_ns_line = stdout + .lines() + .find(|l| l.starts_with("ns_user:")) + .expect("ns_user: line in stdout"); + if let Ok(parent) = parent_user_ns { + assert!( + !probe_user_ns_line.contains(parent.as_str()), + "child user ns identical to parent — unshare reported Applied but ns inode unchanged" + ); + } + } + PrimitiveStatus::Failed(errno) => { + eprintln!( + "unshare returned errno={errno} (likely unprivileged_userns_clone=0); \ + accepting Partial level" + ); + assert!(matches!( + outcome.level(), + HardeningLevel::Partial | HardeningLevel::None + )); + } + PrimitiveStatus::Skipped => panic!("unshare must not be Skipped under Strict profile"), + } + } + + /// `chroot` should make the host's `/etc/passwd` unreachable from + /// inside the harness. Under the Strict profile and a static probe + /// the file open returns ENOENT and the probe prints + /// `chroot:blocked`. 
+ #[test] + fn chroot_blocks_etc_passwd() { + let Some(_) = probe_path() else { return }; + if !probe_is_static() { + eprintln!("SKIP: probe is dynamically linked — chroot would block its loader before main()"); + return; + } + let tmp = workdir(); + let harness = build_harness_with_probe(tmp.path(), &[]); + let opts = strict_opts(); + reset_last_hardening_outcome(); + let result = sandbox::run(&harness, b"", &opts).expect("sandbox::run"); + let stdout = stdout_string(&result); + let outcome = last_hardening_outcome().expect("hardening outcome recorded"); + + match outcome.chroot { + PrimitiveStatus::Applied => { + assert!( + stdout.contains("chroot:blocked"), + "chroot reported Applied but /etc/passwd was readable; full stdout:\n{stdout}" + ); + } + PrimitiveStatus::Failed(errno) => { + // Common failure: EPERM when the kernel blocks chroot + // for unprivileged callers without CAP_SYS_CHROOT, or + // EINVAL when the workdir doesn't satisfy the + // canonicalisation precondition. Accept Partial. + eprintln!("chroot returned errno={errno}; recorded as Partial"); + assert_ne!(outcome.level(), HardeningLevel::Full); + } + PrimitiveStatus::Skipped => panic!("chroot must not be Skipped under Strict profile"), + } + } + + /// Path-traversal acceptance case from the phase deliverables. + /// Drives the probe with `traverse` so it tries to open + /// `/etc/passwd`; the binary exits non-zero on chroot success + /// (mapped to `NotConfirmed` by the runner's exit-code rule) and + /// prints `chroot blocked` for the test to assert on. 
+ #[test] + fn path_traversal_returns_not_confirmed_when_chroot_holds() { + let Some(_) = probe_path() else { return }; + if !probe_is_static() { + eprintln!("SKIP: probe is dynamically linked — chroot test requires static link"); + return; + } + let tmp = workdir(); + let harness = build_harness_with_probe(tmp.path(), &["traverse"]); + let opts = strict_opts(); + reset_last_hardening_outcome(); + let result = sandbox::run(&harness, b"", &opts).expect("sandbox::run"); + let stdout = stdout_string(&result); + let outcome = last_hardening_outcome().expect("hardening outcome recorded"); + + if matches!(outcome.chroot, PrimitiveStatus::Applied) { + // NotConfirmed shape: the verifier maps a non-zero exit + no + // sink-hit sentinel to NotConfirmed. We assert the two + // structural pieces here directly. + assert_eq!( + result.exit_code, + Some(7), + "probe exit code mismatch — full stdout:\n{stdout}" + ); + assert!( + !result.sink_hit, + "sink hit should be absent on a traversal-blocked run" + ); + assert!( + stdout.contains("chroot blocked") || stdout.contains("chroot:blocked") + || stdout.contains("traverse:blocked"), + "expected `chroot blocked` marker in probe stdout; got:\n{stdout}" + ); + } else { + eprintln!( + "SKIP: chroot did not apply (status={:?}); cannot assert traversal blocked", + outcome.chroot, + ); + } + } + + /// seccomp filter installs cleanly under the Strict profile and the + /// probe survives long enough to print its sentinel. /proc/self/ + /// status's `Seccomp:` line transitions from `0` (disabled) to `2` + /// (filter mode) when the prctl call succeeds. 
+ #[test] + fn seccomp_filter_installed_under_strict() { + let Some(_) = probe_path() else { return }; + let tmp = workdir(); + let harness = build_harness_with_probe(tmp.path(), &[]); + let opts = strict_opts(); + reset_last_hardening_outcome(); + let result = sandbox::run(&harness, b"", &opts).expect("sandbox::run"); + let stdout = stdout_string(&result); + let outcome = last_hardening_outcome().expect("hardening outcome recorded"); + + match outcome.seccomp { + PrimitiveStatus::Applied => { + assert!( + stdout.contains("Seccomp:\t2"), + "Seccomp:2 missing — filter not active in /proc/self/status; stdout:\n{stdout}" + ); + } + PrimitiveStatus::Failed(errno) => { + eprintln!( + "SKIP: seccomp prctl returned errno={errno} (typical when running under \ + a sandbox that already locked the syscall down); accepting Partial level" + ); + assert_ne!(outcome.level(), HardeningLevel::Full); + } + PrimitiveStatus::Skipped => panic!("seccomp must not be Skipped under Strict profile"), + } + } + + /// Standard profile keeps the historical baseline: PR_SET_NO_NEW_PRIVS + /// and RLIMIT_AS only. /etc/passwd should still be readable + /// (no chroot) and the seccomp counter stays at 0. + #[test] + fn standard_profile_skips_chroot_and_seccomp() { + let Some(_) = probe_path() else { return }; + let tmp = workdir(); + let harness = build_harness_with_probe(tmp.path(), &[]); + let opts = standard_opts(); + reset_last_hardening_outcome(); + let result = sandbox::run(&harness, b"", &opts).expect("sandbox::run"); + let stdout = stdout_string(&result); + let outcome = last_hardening_outcome().expect("hardening outcome recorded"); + + assert_eq!(outcome.level(), HardeningLevel::Baseline); + assert!(matches!(outcome.no_new_privs, PrimitiveStatus::Applied)); + assert!(matches!(outcome.rlimit_as, PrimitiveStatus::Applied)); + // None of the strict-only primitives should have been attempted. 
+ assert!(matches!(outcome.chroot, PrimitiveStatus::Skipped)); + assert!(matches!(outcome.seccomp, PrimitiveStatus::Skipped)); + assert!(matches!(outcome.unshare, PrimitiveStatus::Skipped)); + + // Baseline: /etc/passwd should still be open-able from the host. + // The probe prints either `chroot:blocked` (if outside the + // sandbox restricted further) or `chroot:escaped`. We don't + // require either: the assertion here is purely on the recorded + // hardening outcome. + let _ = stdout; + let _ = result.exit_code; + } + + /// Seccomp policy synthesised from `seccomp_policy.toml` includes + /// the syscalls required for the probe to reach `__NYX_PROBE_DONE__` + /// (read, write, openat, readlinkat, fcntl, exit_group, …). This + /// tests the codegen path without touching the kernel. + #[test] + fn seccomp_policy_includes_essential_syscalls() { + let nrs = seccomp::allowed_syscall_numbers(0); + for essential in &["read", "write", "close", "openat", "exit_group", "fstat"] { + let nr = seccomp::syscalls::syscall_number(essential) + .unwrap_or_else(|| panic!("syscall {essential} missing from per-arch table")); + assert!( + nrs.contains(&nr), + "BASE seccomp allowlist missing essential syscall {essential} (nr={nr})" + ); + } + } +} + +// Non-Linux placeholder so `cargo nextest run --test sandbox_hardening_linux` +// doesn't fail with "no tests to run" on macOS / Windows CI rows. The real +// suite gates every test on `target_os = "linux"`. +#[cfg(not(all(feature = "dynamic", target_os = "linux")))] +mod non_linux_placeholder { + #[test] + fn linux_only_suite_skipped_on_this_target() { + eprintln!( + "SKIP: tests/sandbox_hardening_linux.rs requires `--features dynamic` and \ + target_os = linux" + ); + } +} +