mirror of
https://github.com/elicpeter/nyx.git
synced 2026-06-09 19:45:13 +02:00
[pitboss] phase 17: Track E.1 — Linux process backend hardening
This commit is contained in:
parent
a4f890797a
commit
dbad78fafa
10 changed files with 2414 additions and 68 deletions
215
build.rs
215
build.rs
|
|
@ -1,8 +1,15 @@
|
|||
use std::collections::BTreeMap;
|
||||
use std::path::Path;
|
||||
use std::process::Command;
|
||||
|
||||
fn main() {
|
||||
// Only relevant when the serve feature is active
|
||||
// Phase 17 (Track E.1): always emit the seccomp policy table to
|
||||
// OUT_DIR. Gated runtime via `#[cfg(target_os = "linux")]`, but the
|
||||
// codegen runs on every host so `cargo check` on macOS still emits
|
||||
// the file (the include never actually compiles on non-Linux).
|
||||
emit_seccomp_policy();
|
||||
|
||||
// Only relevant when the serve feature is active.
|
||||
if std::env::var("CARGO_FEATURE_SERVE").is_err() {
|
||||
return;
|
||||
}
|
||||
|
|
@ -70,3 +77,209 @@ fn emit_placeholder_and_warn(dist_dir: &Path) {
|
|||
"cargo:warning=Node.js/npm not available — wrote placeholder frontend assets. Run 'cd frontend && npm install && npm run build' for the real UI."
|
||||
);
|
||||
}
|
||||
|
||||
// ── Phase 17 (Track E.1) — seccomp policy codegen ────────────────────────────
|
||||
|
||||
const SECCOMP_POLICY_PATH: &str = "src/dynamic/sandbox/seccomp/seccomp_policy.toml";
|
||||
|
||||
/// Lookup table from a `Cap` constant's name to its bit value.
///
/// This is a hand-maintained copy of the `bitflags!` block in
/// `src/labels/mod.rs`: whenever a `Cap` constant is added or removed
/// there, update this table as well. `emit_seccomp_policy` panics when a
/// `[cap.NAME]` section of the policy file names a cap missing from here.
const CAP_BIT_FOR_NAME: &[(&str, u32)] = &[
    ("ENV_VAR", 1 << 0),
    ("HTML_ESCAPE", 1 << 1),
    ("SHELL_ESCAPE", 1 << 2),
    ("URL_ENCODE", 1 << 3),
    ("JSON_PARSE", 1 << 4),
    ("FILE_IO", 1 << 5),
    ("FMT_STRING", 1 << 6),
    ("SQL_QUERY", 1 << 7),
    ("DESERIALIZE", 1 << 8),
    ("SSRF", 1 << 9),
    ("CODE_EXEC", 1 << 10),
    ("CRYPTO", 1 << 11),
    ("UNAUTHORIZED_ID", 1 << 12),
    ("DATA_EXFIL", 1 << 13),
    ("LDAP_INJECTION", 1 << 14),
    ("XPATH_INJECTION", 1 << 15),
    ("HEADER_INJECTION", 1 << 16),
    ("OPEN_REDIRECT", 1 << 17),
    ("SSTI", 1 << 18),
    ("XXE", 1 << 19),
    ("PROTOTYPE_POLLUTION", 1 << 20),
];
|
||||
|
||||
fn emit_seccomp_policy() {
|
||||
println!("cargo:rerun-if-changed={}", SECCOMP_POLICY_PATH);
|
||||
|
||||
let out_dir = std::env::var("OUT_DIR").expect("OUT_DIR must be set by cargo");
|
||||
let out_path = Path::new(&out_dir).join("seccomp_policy.rs");
|
||||
|
||||
// Read the policy file; on missing file (e.g. fresh checkout on a
|
||||
// foreign target), emit empty tables so compilation still succeeds.
|
||||
let toml_text = match std::fs::read_to_string(SECCOMP_POLICY_PATH) {
|
||||
Ok(s) => s,
|
||||
Err(_) => {
|
||||
std::fs::write(
|
||||
&out_path,
|
||||
"pub static BASE: &[&str] = &[];\npub static CAP: &[(u32, &[&str])] = &[];\n",
|
||||
)
|
||||
.expect("write empty seccomp policy stub");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let parsed = parse_seccomp_toml(&toml_text);
|
||||
|
||||
let mut out = String::new();
|
||||
out.push_str("// generated by build.rs from seccomp_policy.toml — do not edit\n\n");
|
||||
|
||||
// Base allowlist.
|
||||
out.push_str("pub static BASE: &[&str] = &[\n");
|
||||
for name in &parsed.base {
|
||||
out.push_str(&format!(" \"{}\",\n", escape(name)));
|
||||
}
|
||||
out.push_str("];\n\n");
|
||||
|
||||
// Per-cap allowlists.
|
||||
out.push_str("pub static CAP: &[(u32, &[&str])] = &[\n");
|
||||
for (cap_name, allow) in &parsed.caps {
|
||||
let bit = CAP_BIT_FOR_NAME
|
||||
.iter()
|
||||
.find(|(n, _)| *n == cap_name.as_str())
|
||||
.map(|(_, b)| *b)
|
||||
.unwrap_or_else(|| panic!(
|
||||
"seccomp_policy.toml references unknown Cap '{cap_name}' — \
|
||||
add it to CAP_BIT_FOR_NAME in build.rs first"
|
||||
));
|
||||
out.push_str(&format!(" (0x{bit:08x}_u32, &[\n"));
|
||||
for name in allow {
|
||||
out.push_str(&format!(" \"{}\",\n", escape(name)));
|
||||
}
|
||||
out.push_str(" ]),\n");
|
||||
}
|
||||
out.push_str("];\n");
|
||||
|
||||
std::fs::write(&out_path, out).expect("write seccomp policy table");
|
||||
}
|
||||
|
||||
/// In-memory form of `seccomp_policy.toml` as produced by
/// `parse_seccomp_toml`.
#[derive(Default)]
struct SeccompPolicy {
    /// Values of the `allow` array under `[base]`.
    base: Vec<String>,
    /// Values of the `allow` array under each `[cap.NAME]` section,
    /// keyed by `NAME`.
    caps: BTreeMap<String, Vec<String>>,
}
|
||||
|
||||
/// Tiny line-oriented TOML parser scoped to the shape used by
|
||||
/// `seccomp_policy.toml`:
|
||||
///
|
||||
/// [base]
|
||||
/// allow = ["read", "write", ...]
|
||||
///
|
||||
/// [cap.SQL_QUERY]
|
||||
/// allow = [
|
||||
/// "fdatasync",
|
||||
/// ...
|
||||
/// ]
|
||||
///
|
||||
/// Comments (`#`) and blank lines are skipped. Multi-line array bodies
|
||||
/// are accumulated until the closing `]`.
|
||||
fn parse_seccomp_toml(src: &str) -> SeccompPolicy {
|
||||
let mut policy = SeccompPolicy::default();
|
||||
let mut current_section: Option<String> = None;
|
||||
let mut accumulating_array: Option<String> = None;
|
||||
let mut array_buf = String::new();
|
||||
|
||||
for raw_line in src.lines() {
|
||||
let line = strip_comment(raw_line).trim();
|
||||
if line.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(_key) = accumulating_array.as_ref() {
|
||||
array_buf.push_str(line);
|
||||
array_buf.push('\n');
|
||||
if line.contains(']') {
|
||||
let key = accumulating_array.take().unwrap();
|
||||
let values = parse_string_array(&array_buf);
|
||||
store_allow(&mut policy, current_section.as_deref(), &key, values);
|
||||
array_buf.clear();
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(section) = line.strip_prefix('[').and_then(|s| s.strip_suffix(']')) {
|
||||
current_section = Some(section.to_string());
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some((key, rest)) = line.split_once('=') {
|
||||
let key = key.trim().to_string();
|
||||
let rest = rest.trim();
|
||||
if rest.starts_with('[') && rest.contains(']') {
|
||||
let values = parse_string_array(rest);
|
||||
store_allow(&mut policy, current_section.as_deref(), &key, values);
|
||||
} else if rest.starts_with('[') {
|
||||
accumulating_array = Some(key);
|
||||
array_buf.push_str(rest);
|
||||
array_buf.push('\n');
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
policy
|
||||
}
|
||||
|
||||
/// Return `line` with any trailing `#` comment removed.
///
/// A `#` between a pair of double quotes is part of a string and does not
/// start a comment. Quote state simply toggles on every `"`; escape
/// sequences are not interpreted.
fn strip_comment(line: &str) -> &str {
    let mut inside_quotes = false;
    let comment_start = line.bytes().position(|byte| {
        if byte == b'"' {
            inside_quotes = !inside_quotes;
            false
        } else {
            byte == b'#' && !inside_quotes
        }
    });
    match comment_start {
        // `#` is ASCII, so its byte offset is always a char boundary.
        Some(end) => &line[..end],
        None => line,
    }
}
|
||||
|
||||
/// Extract every double-quoted string between the first `[` and the last
/// `]` of `src`.
///
/// A missing `[` means "start at the beginning"; a missing `]` means "run
/// to the end". If the last `]` comes before the first `[` there is no
/// array body and an empty list is returned (rather than panicking on an
/// inverted slice range). Escape sequences are not interpreted, and an
/// unterminated final string is returned as-is.
fn parse_string_array(src: &str) -> Vec<String> {
    let start = src.find('[').map(|i| i + 1).unwrap_or(0);
    let end = src.rfind(']').unwrap_or(src.len());
    if start > end {
        // e.g. "] [" — brackets present but not enclosing anything.
        return Vec::new();
    }
    let body = &src[start..end];

    let mut out = Vec::new();
    let mut chars = body.chars();
    while let Some(c) = chars.next() {
        if c != '"' {
            continue;
        }
        // Consume up to (and including) the closing quote.
        let mut s = String::new();
        for c2 in chars.by_ref() {
            if c2 == '"' {
                break;
            }
            s.push(c2);
        }
        out.push(s);
    }
    out
}
|
||||
|
||||
fn store_allow(policy: &mut SeccompPolicy, section: Option<&str>, key: &str, values: Vec<String>) {
|
||||
if key != "allow" {
|
||||
return;
|
||||
}
|
||||
match section {
|
||||
Some("base") => policy.base = values,
|
||||
Some(other) => {
|
||||
if let Some(cap_name) = other.strip_prefix("cap.") {
|
||||
policy.caps.insert(cap_name.to_string(), values);
|
||||
}
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
|
||||
/// Escape `s` for embedding inside a double-quoted Rust string literal:
/// every backslash and every double quote gets a backslash in front of it.
fn escape(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        if matches!(ch, '\\' | '"') {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}
|
||||
|
|
|
|||
|
|
@ -29,6 +29,14 @@ use std::path::Path;
|
|||
use std::sync::{Arc, OnceLock};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod process_linux;
|
||||
#[cfg(target_os = "linux")]
|
||||
pub mod seccomp;
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
pub use process_linux::{HardeningLevel, HardeningOutcome};
|
||||
|
||||
// ── Harness interpretation probe ──────────────────────────────────────────────
|
||||
|
||||
/// Returns true when the harness is driven by an interpreter (Python, Node, …)
|
||||
|
|
@ -159,6 +167,40 @@ pub struct SandboxOptions {
|
|||
/// into [`crate::dynamic::oracle::oracle_fired_with_stubs`].
|
||||
/// `None` when the spec's `stubs_required` is empty.
|
||||
pub stub_harness: Option<Arc<crate::dynamic::stubs::StubHarness>>,
|
||||
/// Phase 17 (Track E.1): cap bits used to minimise the seccomp-bpf
|
||||
/// allowlist applied to the Linux process backend. When `0`, the
|
||||
/// process backend installs only the cap-independent `base` allowlist
|
||||
/// from `seccomp/seccomp_policy.toml`; when non-zero, every cap bit
|
||||
/// set adds its allowlisted syscalls on top. Other backends ignore
|
||||
/// this field.
|
||||
pub seccomp_caps: u32,
|
||||
/// Phase 17 (Track E.1): hardening profile applied by the Linux
|
||||
/// process backend. See [`ProcessHardeningProfile`] for the per-
|
||||
/// variant primitive matrix.
|
||||
pub process_hardening: ProcessHardeningProfile,
|
||||
}
|
||||
|
||||
/// Phase 17 (Track E.1): selects which subset of the Linux process-
/// backend hardening primitives is applied.
///
/// - [`ProcessHardeningProfile::Standard`] — the historical baseline:
///   `prctl(PR_SET_NO_NEW_PRIVS)` + `setrlimit(RLIMIT_AS)` only. No
///   namespaces, no chroot, no seccomp. Default for back-compat.
/// - [`ProcessHardeningProfile::Strict`] — full Phase 17 sequence:
///   no-new-privs, all rlimits, namespace unshare, chroot to workdir,
///   default-deny seccomp filter scoped to [`SandboxOptions::seccomp_caps`].
///   Each primitive is best-effort; failures degrade to
///   [`HardeningLevel::Partial`] without aborting the run.
///
/// `Default` yields [`ProcessHardeningProfile::Standard`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum ProcessHardeningProfile {
    /// Historical baseline; the default.
    #[default]
    Standard,
    /// Full Phase 17 hardening sequence.
    Strict,
}
|
||||
|
||||
impl SandboxOptions {
|
||||
|
|
@ -186,6 +228,8 @@ impl Default for SandboxOptions {
|
|||
probe_channel: None,
|
||||
extra_env: Vec::new(),
|
||||
stub_harness: None,
|
||||
seccomp_caps: 0,
|
||||
process_hardening: ProcessHardeningProfile::Standard,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1207,25 +1251,35 @@ fn run_process(
|
|||
cmd.env("NYX_PAYLOAD", std::ffi::OsStr::from_bytes(payload_bytes));
|
||||
}
|
||||
|
||||
// Enforce memory cap before exec on Linux via RLIMIT_AS + PR_SET_NO_NEW_PRIVS.
|
||||
// RLIMIT_AS limits total virtual address space. Python uses significantly
|
||||
// more virtual AS than RSS (shared libs, mmap arenas), so the enforced
|
||||
// limit is memory_mib * 8 with a floor of 4 GiB.
|
||||
// Phase 17 (Track E.1): install the Linux process-backend hardening
|
||||
// sequence — `prctl(PR_SET_NO_NEW_PRIVS)`, `setrlimit` (CPU/NOFILE/AS),
|
||||
// `unshare(CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUSER)`, `chroot` to the
|
||||
// workdir, and a default-deny seccomp-bpf filter scoped to
|
||||
// `opts.seccomp_caps`. Each primitive is best-effort: failures
|
||||
// downgrade to `HardeningLevel::Partial` instead of aborting the run.
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
use std::os::unix::process::CommandExt;
|
||||
let memory_mib = opts.memory_mib;
|
||||
// Safety: called in the child after fork but before exec; no allocator use.
|
||||
unsafe {
|
||||
cmd.pre_exec(move || {
|
||||
rlimit_as_linux(memory_mib)?;
|
||||
prctl_no_new_privs()
|
||||
});
|
||||
}
|
||||
}
|
||||
let collector = process_linux::install_pre_exec(&mut cmd, opts, &harness.workdir);
|
||||
|
||||
let start = Instant::now();
|
||||
let mut child = cmd.spawn().map_err(SandboxError::Spawn)?;
|
||||
let child_result = cmd.spawn();
|
||||
#[cfg(target_os = "linux")]
|
||||
let outcome_joiner;
|
||||
let mut child = match child_result {
|
||||
Ok(c) => {
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
outcome_joiner = collector.map(|c| c.after_spawn());
|
||||
}
|
||||
c
|
||||
}
|
||||
Err(e) => {
|
||||
#[cfg(target_os = "linux")]
|
||||
if let Some(c) = collector {
|
||||
c.forget();
|
||||
}
|
||||
return Err(SandboxError::Spawn(e));
|
||||
}
|
||||
};
|
||||
|
||||
let timeout = opts.timeout;
|
||||
let timed_out = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
|
||||
|
|
@ -1270,6 +1324,14 @@ fn run_process(
|
|||
|
||||
let status = child.wait().map_err(SandboxError::Io)?;
|
||||
|
||||
// Phase 17 (Track E.1): wait for the per-primitive HardeningOutcome
|
||||
// drain thread before returning so callers (tests + telemetry) read
|
||||
// a settled value via `process_linux::last_hardening_outcome()`.
|
||||
#[cfg(target_os = "linux")]
|
||||
if let Some(joiner) = outcome_joiner {
|
||||
joiner.await_outcome();
|
||||
}
|
||||
|
||||
let stdout_buf = stdout_handle
|
||||
.and_then(|h| h.join().ok())
|
||||
.and_then(|r| r.ok())
|
||||
|
|
@ -1337,52 +1399,9 @@ fn base64_encode(data: &[u8]) -> String {
|
|||
|
||||
// ── Linux-specific syscall wrappers ──────────────────────────────────────────
|
||||
|
||||
/// Set RLIMIT_AS (virtual address space) in a `pre_exec` context on Linux.
|
||||
///
|
||||
/// `memory_mib` is the configured cap; we enforce `max(memory_mib * 8, 4096)`
|
||||
/// MiB of virtual AS to give Python's mmap-heavy runtime adequate headroom
|
||||
/// while still capping runaway memory bombs.
|
||||
///
|
||||
/// RLIMIT_AS = 9 on x86_64, aarch64, arm, ppc64, s390x, and all other major
|
||||
/// Linux architectures (kernel source: include/uapi/asm-generic/resource.h).
|
||||
#[cfg(target_os = "linux")]
|
||||
fn rlimit_as_linux(memory_mib: u64) -> std::io::Result<()> {
|
||||
#[repr(C)]
|
||||
struct Rlimit {
|
||||
cur: u64,
|
||||
max: u64,
|
||||
}
|
||||
unsafe extern "C" {
|
||||
fn setrlimit(resource: i32, rlim: *const Rlimit) -> i32;
|
||||
}
|
||||
const RLIMIT_AS: i32 = 9;
|
||||
let cap_mib = memory_mib.saturating_mul(8).max(4096);
|
||||
let bytes = cap_mib.saturating_mul(1024 * 1024);
|
||||
let rl = Rlimit { cur: bytes, max: bytes };
|
||||
let ret = unsafe { setrlimit(RLIMIT_AS, &rl) };
|
||||
if ret == 0 {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(std::io::Error::last_os_error())
|
||||
}
|
||||
}
|
||||
|
||||
/// Set PR_SET_NO_NEW_PRIVS to 1 in a `pre_exec` context on Linux.
|
||||
///
|
||||
/// This prevents the child process from acquiring new privileges via setuid
|
||||
/// binaries, file capabilities, or ptrace. Best-effort: silently succeeds
|
||||
/// even if the prctl call fails (e.g., in restricted environments).
|
||||
#[cfg(target_os = "linux")]
|
||||
fn prctl_no_new_privs() -> std::io::Result<()> {
|
||||
unsafe extern "C" {
|
||||
fn prctl(option: i32, arg2: u64, arg3: u64, arg4: u64, arg5: u64) -> i32;
|
||||
}
|
||||
const PR_SET_NO_NEW_PRIVS: i32 = 38;
|
||||
// Failure is non-fatal: some container runtimes block prctl but are
|
||||
// themselves already sandboxed. Don't abort the child for this.
|
||||
unsafe { prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
|
||||
Ok(())
|
||||
}
|
||||
// `rlimit_as_linux`, `prctl_no_new_privs`, and the rest of the Linux process
|
||||
// backend hardening sequence now live in [`process_linux`]. See
|
||||
// [`process_linux::install_pre_exec`] for the call-site.
|
||||
|
||||
#[cfg(unix)]
|
||||
fn libc_kill(pid: i32, sig: i32) -> i32 {
|
||||
657
src/dynamic/sandbox/process_linux.rs
Normal file
657
src/dynamic/sandbox/process_linux.rs
Normal file
|
|
@ -0,0 +1,657 @@
|
|||
//! Phase 17 (Track E.1) — Linux process backend hardening.
|
||||
//!
|
||||
//! Owns the `pre_exec` sequence applied to every harness child started by
|
||||
//! [`super::run_process`] on Linux:
|
||||
//!
|
||||
//! 1. `prctl(PR_SET_NO_NEW_PRIVS)` — block setuid / file-cap escalation.
|
||||
//! 2. `setrlimit(RLIMIT_CPU)` — cap CPU time so a runaway payload exits.
|
||||
//! 3. `setrlimit(RLIMIT_NOFILE)` — cap open fds; the harness receives only
|
||||
//! a small number of stdio + probe fds from the parent.
|
||||
//! 4. `setrlimit(RLIMIT_AS)` — cap virtual address space; multiplied by 8
|
||||
//! with a 4 GiB floor so interpreted runtimes still start.
|
||||
//! 5. `unshare(CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS)` — drop the
|
||||
//! host PID, mount, and user namespace views.
|
||||
//! 6. `chroot(workdir)` + `chdir("/")` — isolate filesystem reach to the
|
||||
//! harness workdir; payloads that try to read `/etc/passwd` see the
|
||||
//! harness root, not the host one.
|
||||
//! 7. seccomp-bpf default-deny filter scoped to the cap bits the spec
|
||||
//! actually exercises (see [`super::seccomp`]).
|
||||
//!
|
||||
//! Each primitive is best-effort: failures are recorded into the per-
|
||||
//! child [`HardeningOutcome`] record the parent reads back after exec, so
|
||||
//! the verifier can downgrade to [`HardeningLevel::Partial`] without
|
||||
//! aborting the harness run.
|
||||
//!
|
||||
//! The pre_exec callback runs in the child between fork(2) and execve(2)
|
||||
//! — no Rust allocator use, no heap-borrowing closures. Anything the
|
||||
//! parent needs to know is shipped through an `O_CLOEXEC` pipe the
|
||||
//! parent owns the read end of: the child writes one [`HardeningOutcome`]
|
||||
//! record into it, execve(2) drops the write end, and the parent's
|
||||
//! drain thread sees EOF and records the outcome.
|
||||
|
||||
use crate::dynamic::sandbox::seccomp;
|
||||
use crate::dynamic::sandbox::seccomp::bpf::SockFilter;
|
||||
use crate::dynamic::sandbox::{ProcessHardeningProfile, SandboxOptions};
|
||||
use std::io::Read;
|
||||
use std::os::unix::io::{FromRawFd, RawFd};
|
||||
use std::os::unix::process::CommandExt;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::sync::{Arc, Mutex, OnceLock};
|
||||
|
||||
// ── HardeningLevel reporting ─────────────────────────────────────────────────
|
||||
|
||||
/// One-word summary of how much of the Phase 17 hardening took effect,
/// derived from a [`HardeningOutcome`] by [`HardeningOutcome::level`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HardeningLevel {
    /// The Standard profile was selected, so only no-new-privs and
    /// RLIMIT_AS were installed; no Phase 17 hardening was attempted.
    Baseline,
    /// No requested primitive reported a failure.
    Full,
    /// Some primitives applied and at least one failed (typically
    /// because the process already runs inside a sandbox that forbids
    /// e.g. `unshare`).
    Partial,
    /// Nothing applied and at least one primitive failed: the harness
    /// ran without any Phase 17 hardening.
    None,
}
|
||||
|
||||
/// Per-primitive outcome captured by the child and read back by the
|
||||
/// parent after `wait`.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub struct HardeningOutcome {
|
||||
pub no_new_privs: PrimitiveStatus,
|
||||
pub rlimit_cpu: PrimitiveStatus,
|
||||
pub rlimit_nofile: PrimitiveStatus,
|
||||
pub rlimit_as: PrimitiveStatus,
|
||||
pub unshare: PrimitiveStatus,
|
||||
pub chroot: PrimitiveStatus,
|
||||
pub seccomp: PrimitiveStatus,
|
||||
pub profile: ProcessHardeningProfileTag,
|
||||
}
|
||||
|
||||
impl Default for HardeningOutcome {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
no_new_privs: PrimitiveStatus::Skipped,
|
||||
rlimit_cpu: PrimitiveStatus::Skipped,
|
||||
rlimit_nofile: PrimitiveStatus::Skipped,
|
||||
rlimit_as: PrimitiveStatus::Skipped,
|
||||
unshare: PrimitiveStatus::Skipped,
|
||||
chroot: PrimitiveStatus::Skipped,
|
||||
seccomp: PrimitiveStatus::Skipped,
|
||||
profile: ProcessHardeningProfileTag::Standard,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Result of attempting a single hardening primitive.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum PrimitiveStatus {
    /// The active profile did not request this primitive (the default).
    #[default]
    Skipped,
    /// The primitive was applied successfully.
    Applied,
    /// The primitive's call failed; the payload is the raw errno, or
    /// `-1` when no errno was available.
    Failed(i32),
}
|
||||
|
||||
/// Profile marker carried inside a [`HardeningOutcome`]; the two variants
/// correspond by name to those of `ProcessHardeningProfile`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum ProcessHardeningProfileTag {
    /// Standard profile (the default).
    #[default]
    Standard,
    /// Strict profile.
    Strict,
}
|
||||
|
||||
impl HardeningOutcome {
|
||||
/// Coarse summary used for the `HardeningLevel` column.
|
||||
pub fn level(&self) -> HardeningLevel {
|
||||
if matches!(self.profile, ProcessHardeningProfileTag::Standard) {
|
||||
return HardeningLevel::Baseline;
|
||||
}
|
||||
let primitives = [
|
||||
self.no_new_privs,
|
||||
self.rlimit_cpu,
|
||||
self.rlimit_nofile,
|
||||
self.rlimit_as,
|
||||
self.unshare,
|
||||
self.chroot,
|
||||
self.seccomp,
|
||||
];
|
||||
let applied = primitives.iter().filter(|s| matches!(s, PrimitiveStatus::Applied)).count();
|
||||
let failed = primitives.iter().filter(|s| matches!(s, PrimitiveStatus::Failed(_))).count();
|
||||
match (applied, failed) {
|
||||
(_, 0) => HardeningLevel::Full,
|
||||
(0, _) => HardeningLevel::None,
|
||||
_ => HardeningLevel::Partial,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Last outcome registry (read back by tests + telemetry) ───────────────────
|
||||
|
||||
static LAST_OUTCOME: OnceLock<Mutex<Option<HardeningOutcome>>> = OnceLock::new();
|
||||
|
||||
fn outcome_cell() -> &'static Mutex<Option<HardeningOutcome>> {
|
||||
LAST_OUTCOME.get_or_init(|| Mutex::new(None))
|
||||
}
|
||||
|
||||
fn record_outcome(outcome: HardeningOutcome) {
|
||||
if let Ok(mut g) = outcome_cell().lock() {
|
||||
*g = Some(outcome);
|
||||
}
|
||||
}
|
||||
|
||||
/// Snapshot of the most-recent hardening outcome. Returns `None` until
|
||||
/// at least one [`install_pre_exec`] child has been spawned and waited
|
||||
/// on. Tests + telemetry read this after `wait_for_outcome` to get the
|
||||
/// per-primitive status table.
|
||||
pub fn last_hardening_outcome() -> Option<HardeningOutcome> {
|
||||
outcome_cell().lock().ok().and_then(|g| *g)
|
||||
}
|
||||
|
||||
/// Reset the last-outcome slot. Tests use this between cases so a stale
|
||||
/// value from a prior spawn cannot leak into the assertion under test.
|
||||
pub fn reset_last_hardening_outcome() {
|
||||
if let Ok(mut g) = outcome_cell().lock() {
|
||||
*g = None;
|
||||
}
|
||||
}
|
||||
|
||||
// ── Status pipe between parent and child ─────────────────────────────────────
|
||||
|
||||
/// Raw file descriptors of the pipe that carries the encoded outcome.
/// The struct has no `Drop`; whoever takes the fds is responsible for
/// closing them.
struct StatusPipe {
    write_fd: RawFd,
    read_fd: RawFd,
}

impl StatusPipe {
    /// Create the pipe with `pipe2(O_CLOEXEC)`.
    ///
    /// # Errors
    ///
    /// Returns the OS error when `pipe2` fails.
    fn new() -> std::io::Result<Self> {
        unsafe extern "C" {
            fn pipe2(pipefd: *mut i32, flags: i32) -> i32;
        }
        const O_CLOEXEC: i32 = 0o2_000_000;

        let mut ends: [i32; 2] = [-1, -1];
        // SAFETY: `ends` is a valid, writable array of two ints, which
        // is what pipe2(2) requires.
        if unsafe { pipe2(ends.as_mut_ptr(), O_CLOEXEC) } != 0 {
            return Err(std::io::Error::last_os_error());
        }
        // pipe2 stores the read end first and the write end second.
        let [read_fd, write_fd] = ends;
        Ok(Self { write_fd, read_fd })
    }
}
|
||||
|
||||
/// Close `fd` with `close(2)`, ignoring the result.
fn close_fd(fd: RawFd) {
    unsafe extern "C" {
        fn close(fd: i32) -> i32;
    }
    // SAFETY: close(2) takes a plain integer; an invalid descriptor only
    // makes the call fail, and that failure is deliberately ignored.
    let _ = unsafe { close(fd) };
}
|
||||
|
||||
/// Drain `read_fd` into a `HardeningOutcome`. Wire format is the
|
||||
/// 15-byte fixed-width record produced by [`encode_outcome`].
|
||||
fn drain_outcome(read_fd: RawFd) -> Option<HardeningOutcome> {
|
||||
let mut file = unsafe { std::fs::File::from_raw_fd(read_fd) };
|
||||
let mut buf = Vec::with_capacity(64);
|
||||
if file.read_to_end(&mut buf).is_err() {
|
||||
return None;
|
||||
}
|
||||
decode_outcome(&buf)
|
||||
}
|
||||
|
||||
const OUTCOME_LEN: usize = 1 + 7 * 2;
|
||||
|
||||
/// Decode a 15-byte hardening outcome record:
|
||||
/// `[profile_tag, no_new_privs_tag, no_new_privs_errno_lo,
|
||||
/// rlimit_cpu_tag, rlimit_cpu_errno_lo, ..., seccomp_tag, seccomp_errno_lo]`
|
||||
/// All errnos are clamped to the low byte for the wire (true value is
|
||||
/// recovered post-hoc from `errno`-symbolic context if needed).
|
||||
fn decode_outcome(buf: &[u8]) -> Option<HardeningOutcome> {
|
||||
if buf.len() < OUTCOME_LEN {
|
||||
return None;
|
||||
}
|
||||
let profile = match buf[0] {
|
||||
1 => ProcessHardeningProfileTag::Strict,
|
||||
_ => ProcessHardeningProfileTag::Standard,
|
||||
};
|
||||
let mut idx = 1;
|
||||
let mut next = || -> PrimitiveStatus {
|
||||
let tag = buf[idx];
|
||||
let errno = buf[idx + 1] as i32;
|
||||
idx += 2;
|
||||
match tag {
|
||||
0 => PrimitiveStatus::Skipped,
|
||||
1 => PrimitiveStatus::Applied,
|
||||
_ => PrimitiveStatus::Failed(if errno == 0 { -1 } else { errno }),
|
||||
}
|
||||
};
|
||||
let no_new_privs = next();
|
||||
let rlimit_cpu = next();
|
||||
let rlimit_nofile = next();
|
||||
let rlimit_as = next();
|
||||
let unshare = next();
|
||||
let chroot = next();
|
||||
let seccomp = next();
|
||||
Some(HardeningOutcome {
|
||||
no_new_privs,
|
||||
rlimit_cpu,
|
||||
rlimit_nofile,
|
||||
rlimit_as,
|
||||
unshare,
|
||||
chroot,
|
||||
seccomp,
|
||||
profile,
|
||||
})
|
||||
}
|
||||
|
||||
fn encode_outcome(out: &HardeningOutcome) -> [u8; OUTCOME_LEN] {
|
||||
let mut buf = [0_u8; OUTCOME_LEN];
|
||||
buf[0] = match out.profile {
|
||||
ProcessHardeningProfileTag::Standard => 0,
|
||||
ProcessHardeningProfileTag::Strict => 1,
|
||||
};
|
||||
let mut idx = 1;
|
||||
for status in [
|
||||
out.no_new_privs,
|
||||
out.rlimit_cpu,
|
||||
out.rlimit_nofile,
|
||||
out.rlimit_as,
|
||||
out.unshare,
|
||||
out.chroot,
|
||||
out.seccomp,
|
||||
] {
|
||||
let (tag, errno) = match status {
|
||||
PrimitiveStatus::Skipped => (0_u8, 0_u8),
|
||||
PrimitiveStatus::Applied => (1_u8, 0_u8),
|
||||
PrimitiveStatus::Failed(e) => (2_u8, (e.unsigned_abs() & 0xff) as u8),
|
||||
};
|
||||
buf[idx] = tag;
|
||||
buf[idx + 1] = errno;
|
||||
idx += 2;
|
||||
}
|
||||
buf
|
||||
}
|
||||
|
||||
// ── Primitive wrappers (called from the child's pre_exec) ────────────────────
|
||||
|
||||
const RLIMIT_CPU: i32 = 0;
|
||||
const RLIMIT_NOFILE: i32 = 7;
|
||||
const RLIMIT_AS: i32 = 9;
|
||||
|
||||
const PR_SET_NO_NEW_PRIVS: i32 = 38;
|
||||
|
||||
const CLONE_NEWNS: i32 = 0x0002_0000;
|
||||
const CLONE_NEWUSER: i32 = 0x1000_0000;
|
||||
const CLONE_NEWPID: i32 = 0x2000_0000;
|
||||
|
||||
/// Argument struct passed by pointer to the `setrlimit` FFI declaration
/// in this file: a soft (`cur`) and a hard (`max`) limit, laid out C-style.
#[repr(C)]
struct Rlimit {
    /// Soft limit.
    cur: u64,
    /// Hard limit.
    max: u64,
}
|
||||
|
||||
unsafe extern "C" {
|
||||
fn setrlimit(resource: i32, rlim: *const Rlimit) -> i32;
|
||||
fn prctl(option: i32, arg2: u64, arg3: u64, arg4: u64, arg5: u64) -> i32;
|
||||
fn unshare(flags: i32) -> i32;
|
||||
fn chroot(path: *const i8) -> i32;
|
||||
fn chdir(path: *const i8) -> i32;
|
||||
fn write(fd: i32, buf: *const u8, count: usize) -> isize;
|
||||
fn __errno_location() -> *mut i32;
|
||||
}
|
||||
|
||||
fn last_errno() -> i32 {
|
||||
unsafe { *__errno_location() }
|
||||
}
|
||||
|
||||
fn apply_rlimit(resource: i32, bytes: u64) -> PrimitiveStatus {
|
||||
let rl = Rlimit { cur: bytes, max: bytes };
|
||||
let ret = unsafe { setrlimit(resource, &rl) };
|
||||
if ret == 0 {
|
||||
PrimitiveStatus::Applied
|
||||
} else {
|
||||
PrimitiveStatus::Failed(last_errno())
|
||||
}
|
||||
}
|
||||
|
||||
fn apply_no_new_privs() -> PrimitiveStatus {
|
||||
let ret = unsafe { prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
|
||||
if ret == 0 {
|
||||
PrimitiveStatus::Applied
|
||||
} else {
|
||||
PrimitiveStatus::Failed(last_errno())
|
||||
}
|
||||
}
|
||||
|
||||
fn apply_unshare() -> PrimitiveStatus {
|
||||
// CLONE_NEWUSER must come first on most modern kernels so the
|
||||
// unprivileged caller can map uid/gid; CLONE_NEWPID + CLONE_NEWNS
|
||||
// then succeed because the new user namespace owns them.
|
||||
let flags = CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS;
|
||||
let ret = unsafe { unshare(flags) };
|
||||
if ret == 0 {
|
||||
PrimitiveStatus::Applied
|
||||
} else {
|
||||
PrimitiveStatus::Failed(last_errno())
|
||||
}
|
||||
}
|
||||
|
||||
fn apply_chroot(workdir: &[u8]) -> PrimitiveStatus {
|
||||
// `workdir` is NUL-terminated by `canonicalize_workdir` so we can
|
||||
// hand the bytes straight to `chroot(2)` without allocating in
|
||||
// pre_exec.
|
||||
let ret = unsafe { chroot(workdir.as_ptr() as *const i8) };
|
||||
if ret != 0 {
|
||||
return PrimitiveStatus::Failed(last_errno());
|
||||
}
|
||||
let root = b"/\0";
|
||||
let ret = unsafe { chdir(root.as_ptr() as *const i8) };
|
||||
if ret != 0 {
|
||||
return PrimitiveStatus::Failed(last_errno());
|
||||
}
|
||||
PrimitiveStatus::Applied
|
||||
}
|
||||
|
||||
/// Install a pre-compiled seccomp BPF filter on the calling thread.
|
||||
///
|
||||
/// `program` is a heap-allocated BPF instruction array compiled in the
|
||||
/// parent (`build_plan`) and shared via `Arc` so the child does not have
|
||||
/// to allocate during pre_exec.
|
||||
fn apply_seccomp(program: &[SockFilter]) -> PrimitiveStatus {
|
||||
match seccomp::install_compiled_filter(program) {
|
||||
Ok(()) => PrimitiveStatus::Applied,
|
||||
Err(e) => PrimitiveStatus::Failed(e.raw_os_error().unwrap_or(-1)),
|
||||
}
|
||||
}
|
||||
|
||||
// ── Pre-exec installer ───────────────────────────────────────────────────────
|
||||
|
||||
#[derive(Clone)]
|
||||
struct PreExecPlan {
|
||||
rlimit_cpu_seconds: u64,
|
||||
rlimit_nofile: u64,
|
||||
rlimit_as_bytes: u64,
|
||||
workdir_nul: Vec<u8>,
|
||||
/// Pre-compiled BPF program for the requested cap-bits. Built in
|
||||
/// the parent so the child's pre_exec callback never touches the
|
||||
/// allocator.
|
||||
seccomp_program: Arc<Vec<SockFilter>>,
|
||||
profile: ProcessHardeningProfileTag,
|
||||
}
|
||||
|
||||
/// Parent-side handle on the status pipe, returned by
/// [`install_pre_exec`].
///
/// After `cmd.spawn()` returns the caller MUST consume it with either
/// [`OutcomeCollector::after_spawn`] (spawn succeeded) or
/// [`OutcomeCollector::forget`] (spawn failed). Both close the parent's
/// write fd, which is required for the read end to ever see EOF and for
/// the drain thread to finish. Dropping the collector closes nothing.
pub struct OutcomeCollector {
    /// Parent's copy of the pipe's write end.
    write_fd: RawFd,
    /// Pipe's read end.
    read_fd: RawFd,
}
|
||||
|
||||
/// Handle on the background drain thread, returned by
/// [`OutcomeCollector::after_spawn`].
///
/// `run_process` waits on it after `child.wait()` so that the outcome is
/// in the registry before it returns; tests that bypass `run_process`
/// can call [`OutcomeJoiner::await_outcome`] directly. Dropping the
/// joiner without calling it also waits for the thread.
pub struct OutcomeJoiner {
    handle: Option<std::thread::JoinHandle<()>>,
}

impl OutcomeJoiner {
    /// Block until the drain thread has finished recording the outcome.
    pub fn await_outcome(mut self) {
        self.join_drain_thread();
    }

    /// Join the drain thread if it has not been joined yet. A panic in
    /// the thread is swallowed.
    fn join_drain_thread(&mut self) {
        let Some(thread) = self.handle.take() else {
            return;
        };
        let _ = thread.join();
    }
}

impl Drop for OutcomeJoiner {
    fn drop(&mut self) {
        self.join_drain_thread();
    }
}
|
||||
|
||||
impl OutcomeCollector {
|
||||
/// Call after `cmd.spawn()` returns `Ok`. Closes the parent's copy
|
||||
/// of the write fd so the kernel ref-count drops to whatever the
|
||||
/// child is still holding; once execve(2) closes the child's
|
||||
/// O_CLOEXEC copy too, the read end sees EOF and the drain thread
|
||||
/// records the outcome via [`record_outcome`]. Returns a join
|
||||
/// handle the caller can await to know the outcome is settled.
|
||||
pub fn after_spawn(self) -> OutcomeJoiner {
|
||||
close_fd(self.write_fd);
|
||||
let read_fd = self.read_fd;
|
||||
let handle = std::thread::spawn(move || {
|
||||
if let Some(outcome) = drain_outcome(read_fd) {
|
||||
record_outcome(outcome);
|
||||
}
|
||||
});
|
||||
OutcomeJoiner { handle: Some(handle) }
|
||||
}
|
||||
|
||||
/// Call when `cmd.spawn()` failed. Closes both ends so neither fd
|
||||
/// leaks; no outcome is recorded.
|
||||
pub fn forget(self) {
|
||||
close_fd(self.write_fd);
|
||||
close_fd(self.read_fd);
|
||||
}
|
||||
}
|
||||
|
||||
/// Install the Phase 17 hardening sequence on `cmd`.
|
||||
///
|
||||
/// Returns `Some(collector)` when the status pipe was successfully
|
||||
/// created; the caller must invoke
|
||||
/// [`OutcomeCollector::after_spawn`] after a successful `cmd.spawn()`.
|
||||
/// Returns `None` when pipe creation itself failed (rare:
|
||||
/// `EMFILE`/`ENFILE`). In that case the pre_exec hook is still
|
||||
/// installed — the child still gets the full hardening sequence — but
|
||||
/// the per-primitive outcome cannot be reported back to the parent.
|
||||
pub fn install_pre_exec(
|
||||
cmd: &mut Command,
|
||||
opts: &SandboxOptions,
|
||||
workdir: &Path,
|
||||
) -> Option<OutcomeCollector> {
|
||||
let plan = build_plan(opts, workdir);
|
||||
|
||||
let pipe = StatusPipe::new().ok();
|
||||
let write_fd = pipe.as_ref().map(|p| p.write_fd).unwrap_or(-1);
|
||||
let read_fd = pipe.as_ref().map(|p| p.read_fd);
|
||||
let plan_for_child = plan.clone();
|
||||
|
||||
// Safety: pre_exec runs after fork(2) and before execve(2). We must
|
||||
// not allocate, take any locks, or call into the Rust runtime. The
|
||||
// captured `plan_for_child` is moved in; reading its already-allocated
|
||||
// fields is safe because no allocator call is needed.
|
||||
unsafe {
|
||||
cmd.pre_exec(move || {
|
||||
let outcome = run_pre_exec_in_child(&plan_for_child);
|
||||
if write_fd >= 0 {
|
||||
let bytes = encode_outcome(&outcome);
|
||||
let _ = write(write_fd, bytes.as_ptr(), bytes.len());
|
||||
// execve(2) closes write_fd via O_CLOEXEC; no manual
|
||||
// close needed here.
|
||||
}
|
||||
Ok(())
|
||||
});
|
||||
}
|
||||
read_fd.map(|read_fd| OutcomeCollector { write_fd, read_fd })
|
||||
}
|
||||
|
||||
fn run_pre_exec_in_child(plan: &PreExecPlan) -> HardeningOutcome {
|
||||
let mut outcome = HardeningOutcome::default();
|
||||
outcome.profile = plan.profile;
|
||||
|
||||
// ── Always-on: PR_SET_NO_NEW_PRIVS + RLIMIT_AS ───────────────────────
|
||||
outcome.no_new_privs = apply_no_new_privs();
|
||||
outcome.rlimit_as = apply_rlimit(RLIMIT_AS, plan.rlimit_as_bytes);
|
||||
|
||||
if matches!(plan.profile, ProcessHardeningProfileTag::Standard) {
|
||||
return outcome;
|
||||
}
|
||||
|
||||
// ── Strict profile: rlimits, unshare, chroot, seccomp ────────────────
|
||||
outcome.rlimit_cpu = apply_rlimit(RLIMIT_CPU, plan.rlimit_cpu_seconds);
|
||||
outcome.rlimit_nofile = apply_rlimit(RLIMIT_NOFILE, plan.rlimit_nofile);
|
||||
outcome.unshare = apply_unshare();
|
||||
outcome.chroot = apply_chroot(&plan.workdir_nul);
|
||||
// seccomp is applied last so the filter does not block any of the
|
||||
// earlier syscalls (setrlimit, prctl, unshare, chroot, chdir).
|
||||
outcome.seccomp = apply_seccomp(plan.seccomp_program.as_slice());
|
||||
|
||||
outcome
|
||||
}
|
||||
|
||||
fn build_plan(opts: &SandboxOptions, workdir: &Path) -> PreExecPlan {
|
||||
let memory_mib = opts.memory_mib;
|
||||
let cap_mib = memory_mib.saturating_mul(8).max(4096);
|
||||
let rlimit_as_bytes = cap_mib.saturating_mul(1024 * 1024);
|
||||
|
||||
let timeout_secs = opts.timeout.as_secs().max(1);
|
||||
let rlimit_cpu_seconds = timeout_secs.saturating_mul(2).max(2);
|
||||
|
||||
let workdir_nul = canonicalize_workdir(workdir);
|
||||
|
||||
// Pre-compile the BPF program in the parent so the pre_exec
|
||||
// callback (which must not allocate) can hand it straight to
|
||||
// `prctl(PR_SET_SECCOMP)`.
|
||||
let nrs = seccomp::allowed_syscall_numbers(opts.seccomp_caps);
|
||||
let program = seccomp::bpf::compile(&nrs, seccomp::syscalls::AUDIT_ARCH);
|
||||
|
||||
PreExecPlan {
|
||||
rlimit_cpu_seconds,
|
||||
rlimit_nofile: 256,
|
||||
rlimit_as_bytes,
|
||||
workdir_nul,
|
||||
seccomp_program: Arc::new(program),
|
||||
profile: match opts.process_hardening {
|
||||
ProcessHardeningProfile::Standard => ProcessHardeningProfileTag::Standard,
|
||||
ProcessHardeningProfile::Strict => ProcessHardeningProfileTag::Strict,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Return `workdir` as NUL-terminated bytes.
///
/// The path is canonicalized when possible; if canonicalization fails
/// the path is used as given. A trailing NUL is appended unless the
/// bytes already end with one.
fn canonicalize_workdir(workdir: &Path) -> Vec<u8> {
    let resolved = match std::fs::canonicalize(workdir) {
        Ok(real) => real,
        Err(_) => workdir.to_path_buf(),
    };

    let mut raw = resolved.into_os_string().into_encoded_bytes();
    if raw.last() != Some(&0) {
        raw.push(0);
    }
    raw
}
|
||||
|
||||
// ── Tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a plan rooted at `/tmp`, as every plan test here does.
    fn plan_in_tmp(opts: &SandboxOptions) -> PreExecPlan {
        build_plan(opts, std::path::Path::new("/tmp"))
    }

    /// Encode then decode `outcome`, asserting nothing was lost on the way.
    fn round_trip(outcome: HardeningOutcome) -> HardeningOutcome {
        let wire = encode_outcome(&outcome);
        let back = decode_outcome(&wire).expect("decode");
        assert_eq!(back, outcome);
        back
    }

    #[test]
    fn outcome_codec_round_trip_strict_full() {
        let every_primitive_applied = HardeningOutcome {
            no_new_privs: PrimitiveStatus::Applied,
            rlimit_cpu: PrimitiveStatus::Applied,
            rlimit_nofile: PrimitiveStatus::Applied,
            rlimit_as: PrimitiveStatus::Applied,
            unshare: PrimitiveStatus::Applied,
            chroot: PrimitiveStatus::Applied,
            seccomp: PrimitiveStatus::Applied,
            profile: ProcessHardeningProfileTag::Strict,
        };
        assert_eq!(
            round_trip(every_primitive_applied).level(),
            HardeningLevel::Full
        );
    }

    #[test]
    fn outcome_codec_round_trip_partial() {
        let some_primitives_failed = HardeningOutcome {
            no_new_privs: PrimitiveStatus::Applied,
            rlimit_cpu: PrimitiveStatus::Applied,
            rlimit_nofile: PrimitiveStatus::Failed(13),
            rlimit_as: PrimitiveStatus::Applied,
            unshare: PrimitiveStatus::Failed(1),
            chroot: PrimitiveStatus::Failed(13),
            seccomp: PrimitiveStatus::Applied,
            profile: ProcessHardeningProfileTag::Strict,
        };
        assert_eq!(
            round_trip(some_primitives_failed).level(),
            HardeningLevel::Partial
        );
    }

    #[test]
    fn standard_profile_reports_baseline_level() {
        let baseline = HardeningOutcome {
            no_new_privs: PrimitiveStatus::Applied,
            rlimit_as: PrimitiveStatus::Applied,
            profile: ProcessHardeningProfileTag::Standard,
            ..HardeningOutcome::default()
        };
        assert_eq!(baseline.level(), HardeningLevel::Baseline);
    }

    #[test]
    fn build_plan_pads_workdir_with_nul() {
        let plan = plan_in_tmp(&SandboxOptions::default());
        assert!(plan.workdir_nul.ends_with(&[0]));
        assert_eq!(plan.profile, ProcessHardeningProfileTag::Standard);
    }

    #[test]
    fn build_plan_strict_compiles_seccomp_program() {
        let plan = plan_in_tmp(&SandboxOptions {
            seccomp_caps: 0xff,
            process_hardening: ProcessHardeningProfile::Strict,
            ..SandboxOptions::default()
        });
        // The arch check + ld nr + KILL + ALLOW alone are 5 instructions;
        // the BASE allowlist adds dozens more.
        let instructions = plan.seccomp_program.len();
        assert!(instructions > 5, "BPF program too small: {}", instructions);
        assert_eq!(plan.profile, ProcessHardeningProfileTag::Strict);
    }

    #[test]
    fn rlimit_as_bytes_floors_at_4_gib() {
        let plan = plan_in_tmp(&SandboxOptions {
            memory_mib: 1,
            ..SandboxOptions::default()
        });
        assert_eq!(plan.rlimit_as_bytes, 4096_u64 * 1024 * 1024);
    }

    #[test]
    fn rlimit_as_bytes_scales_with_memory_mib() {
        let plan = plan_in_tmp(&SandboxOptions {
            memory_mib: 1024,
            ..SandboxOptions::default()
        });
        // 1024 MiB * 8 = 8192 MiB
        assert_eq!(plan.rlimit_as_bytes, 8192_u64 * 1024 * 1024);
    }

    #[test]
    fn truncated_buffer_decodes_to_none() {
        assert!(decode_outcome(&[]).is_none());
        assert!(decode_outcome(&[0_u8; OUTCOME_LEN - 1]).is_none());
    }

    #[test]
    fn record_and_reset_round_trip() {
        // Remember whatever was recorded so it can be put back afterwards.
        let saved = last_hardening_outcome();

        let probe = HardeningOutcome {
            no_new_privs: PrimitiveStatus::Applied,
            profile: ProcessHardeningProfileTag::Strict,
            ..HardeningOutcome::default()
        };
        record_outcome(probe);
        assert_eq!(last_hardening_outcome(), Some(probe));

        reset_last_hardening_outcome();
        assert!(last_hardening_outcome().is_none());

        if let Some(previous) = saved {
            record_outcome(previous);
        }
    }
}
|
||||
173
src/dynamic/sandbox/seccomp/bpf.rs
Normal file
173
src/dynamic/sandbox/seccomp/bpf.rs
Normal file
|
|
@ -0,0 +1,173 @@
|
|||
//! Hand-rolled BPF program emitter for seccomp filters.
|
||||
//!
|
||||
//! BPF instruction format from `<linux/filter.h>`:
|
||||
//!
|
||||
//! ```text
|
||||
//! struct sock_filter { u16 code; u8 jt; u8 jf; u32 k; }
|
||||
//! ```
|
||||
//!
|
||||
//! Only the ops Nyx needs to implement an AUDIT_ARCH check + per-syscall
|
||||
//! allowlist are defined. The output array is fed straight into
|
||||
//! `prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &program)`.
|
||||
|
||||
/// One BPF instruction, laid out like `struct sock_filter` from
/// `<linux/filter.h>` (`u16 code; u8 jt; u8 jf; u32 k;`).
#[repr(C)]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct SockFilter {
    /// Opcode, built by OR-ing the `BPF_*` constants.
    pub code: u16,
    /// Instructions to skip when a conditional jump matches.
    pub jt: u8,
    /// Instructions to skip when a conditional jump does not match.
    pub jf: u8,
    /// Immediate operand: a load offset, a comparison value or a return
    /// action, depending on `code`.
    pub k: u32,
}
|
||||
|
||||
#[repr(C)]
|
||||
pub struct SockFprog {
|
||||
pub len: u16,
|
||||
pub filter: *const SockFilter,
|
||||
}
|
||||
|
||||
// BPF opcode constants — see `linux/bpf_common.h`.

/// Instruction class: load into the accumulator.
pub const BPF_LD: u16 = 0x00;
/// Operand size: 32-bit word.
pub const BPF_W: u16 = 0x00;
/// Addressing mode: absolute offset into the input data.
pub const BPF_ABS: u16 = 0x20;
/// Instruction class: jump.
pub const BPF_JMP: u16 = 0x05;
/// Jump condition: accumulator equals the operand.
pub const BPF_JEQ: u16 = 0x10;
/// Operand source: the immediate `k` field.
pub const BPF_K: u16 = 0x00;
/// Instruction class: return.
pub const BPF_RET: u16 = 0x06;

// seccomp action constants — see `linux/seccomp.h`.

/// Kill the whole process.
pub const SECCOMP_RET_KILL_PROCESS: u32 = 0x8000_0000;
/// Kill only the offending thread.
pub const SECCOMP_RET_KILL: u32 = 0x0000_0000;
/// Let the syscall proceed.
pub const SECCOMP_RET_ALLOW: u32 = 0x7fff_0000;
/// Fail the syscall with an errno taken from the low 16 bits.
pub const SECCOMP_RET_ERRNO: u32 = 0x0005_0000;

// Offsets into `struct seccomp_data` from `linux/seccomp.h`:
//   nr   (s32) at offset 0
//   arch (u32) at offset 4

/// Offset of the syscall number.
pub const SECCOMP_DATA_NR: u32 = 0;
/// Offset of the `AUDIT_ARCH_*` value.
pub const SECCOMP_DATA_ARCH: u32 = 4;
|
||||
|
||||
/// Emit a BPF program implementing:
|
||||
///
|
||||
/// 1. Load `arch` from `seccomp_data`; if it does not match
|
||||
/// `audit_arch`, kill the process.
|
||||
/// 2. Load `nr` from `seccomp_data`.
|
||||
/// 3. For each `allowed_nr` in the table, jump to the ALLOW return.
|
||||
/// 4. Default: return KILL_PROCESS (or KILL on older kernels).
|
||||
///
|
||||
/// The instruction count is `5 + allowed_nrs.len()` (plus one for the
|
||||
/// final ALLOW return). Linux caps seccomp programs at 4096
|
||||
/// instructions; the realistic cap-per-finding allowlist is well under
|
||||
/// 100.
|
||||
pub fn compile(allowed_nrs: &[u32], audit_arch: u32) -> Vec<SockFilter> {
|
||||
let mut program: Vec<SockFilter> = Vec::with_capacity(allowed_nrs.len() + 8);
|
||||
|
||||
// (0) ld [arch]
|
||||
program.push(SockFilter {
|
||||
code: BPF_LD | BPF_W | BPF_ABS,
|
||||
jt: 0,
|
||||
jf: 0,
|
||||
k: SECCOMP_DATA_ARCH,
|
||||
});
|
||||
// (1) jeq audit_arch ? next : KILL
|
||||
// KILL is at the very end; computed below after we know the size.
|
||||
let arch_check_idx = program.len();
|
||||
program.push(SockFilter { code: BPF_JMP | BPF_JEQ | BPF_K, jt: 0, jf: 0, k: audit_arch });
|
||||
|
||||
// (2) ld [nr]
|
||||
program.push(SockFilter {
|
||||
code: BPF_LD | BPF_W | BPF_ABS,
|
||||
jt: 0,
|
||||
jf: 0,
|
||||
k: SECCOMP_DATA_NR,
|
||||
});
|
||||
|
||||
// (3..N) per-syscall jeq nr ? ALLOW : next
|
||||
// ALLOW is two instructions before KILL (we lay out:
|
||||
// ... checks ...
|
||||
// ret KILL
|
||||
// ret ALLOW
|
||||
// ). Each jeq jumps `(N - i - 1) + 1` (over the remaining checks
|
||||
// plus the KILL ret) to land on the ALLOW ret. Computed below.
|
||||
let first_check_idx = program.len();
|
||||
for &nr in allowed_nrs {
|
||||
program.push(SockFilter { code: BPF_JMP | BPF_JEQ | BPF_K, jt: 0, jf: 0, k: nr });
|
||||
}
|
||||
|
||||
// (KILL) ret KILL_PROCESS
|
||||
let kill_idx = program.len();
|
||||
program.push(SockFilter {
|
||||
code: BPF_RET | BPF_K,
|
||||
jt: 0,
|
||||
jf: 0,
|
||||
k: SECCOMP_RET_KILL_PROCESS,
|
||||
});
|
||||
// (ALLOW) ret ALLOW
|
||||
let allow_idx = program.len();
|
||||
program.push(SockFilter { code: BPF_RET | BPF_K, jt: 0, jf: 0, k: SECCOMP_RET_ALLOW });
|
||||
|
||||
// Patch arch check: jt=0 (next on match), jf=N (KILL on mismatch).
|
||||
let arch_jf = (kill_idx - arch_check_idx - 1) as u8;
|
||||
program[arch_check_idx].jf = arch_jf;
|
||||
|
||||
// Patch each per-syscall jeq: jt = jump to ALLOW, jf = fall through.
|
||||
for (i, nr_idx) in (first_check_idx..first_check_idx + allowed_nrs.len()).enumerate() {
|
||||
let _ = i;
|
||||
let jt = (allow_idx - nr_idx - 1) as u8;
|
||||
program[nr_idx].jt = jt;
|
||||
}
|
||||
|
||||
program
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Architecture value every test compiles against.
    const ARCH: u32 = 0xc000_003e;

    #[test]
    fn empty_allowlist_emits_arch_check_and_kill() {
        // ld arch, jeq audit_arch, ld nr, ret KILL, ret ALLOW
        let operands: Vec<u32> = compile(&[], ARCH).iter().map(|ins| ins.k).collect();
        assert_eq!(
            operands,
            [
                SECCOMP_DATA_ARCH,
                ARCH,
                SECCOMP_DATA_NR,
                SECCOMP_RET_KILL_PROCESS,
                SECCOMP_RET_ALLOW,
            ]
        );
    }

    #[test]
    fn single_syscall_allows_its_nr() {
        // ld arch, jeq audit_arch, ld nr, jeq 42, ret KILL, ret ALLOW
        let prog = compile(&[42], ARCH);
        assert_eq!(prog.len(), 6);

        let check = prog[3];
        assert_eq!(check.code, BPF_JMP | BPF_JEQ | BPF_K);
        assert_eq!(check.k, 42);
        // jt jumps over the KILL ret (1 inst) to land on ALLOW
        assert_eq!(check.jt, 1);

        assert_eq!(prog[4].k, SECCOMP_RET_KILL_PROCESS);
        assert_eq!(prog[5].k, SECCOMP_RET_ALLOW);
    }

    #[test]
    fn multi_syscall_jt_offsets_chain_to_allow() {
        // ld arch, jeq audit_arch, ld nr, jeq 1, jeq 2, jeq 3, KILL, ALLOW
        let prog = compile(&[1, 2, 3], ARCH);
        assert_eq!(prog.len(), 8);
        // ALLOW is at idx 7, so the check at idx i has jt = 7 - i - 1.
        assert_eq!(prog[3].jt, 3);
        assert_eq!(prog[4].jt, 2);
        assert_eq!(prog[5].jt, 1);
    }

    #[test]
    fn arch_mismatch_jumps_to_kill() {
        // ld arch (0), jeq arch (1), ld nr (2), jeq 1 (3), jeq 2 (4), KILL (5), ALLOW (6)
        let prog = compile(&[1, 2], ARCH);
        // arch jeq jf must point to KILL → jf=5-1-1=3
        assert_eq!(prog[1].jf, 3);
        assert_eq!(prog[5].k, SECCOMP_RET_KILL_PROCESS);
    }
}
|
||||
179
src/dynamic/sandbox/seccomp/mod.rs
Normal file
179
src/dynamic/sandbox/seccomp/mod.rs
Normal file
|
|
@ -0,0 +1,179 @@
|
|||
//! Phase 17 (Track E.1) — seccomp-bpf default-deny filter.
|
||||
//!
|
||||
//! [`apply_for_caps`] composes the cap-tagged allowlist baked from
|
||||
//! `seccomp_policy.toml` (via `build.rs`) into a BPF program and installs
|
||||
//! it via `prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &program)`. The
|
||||
//! filter is per-thread and inherited across `execve`, so the harness
|
||||
//! runs under it from the very first instruction of its image.
|
||||
//!
|
||||
//! Layout
|
||||
//! ------
|
||||
//! - `seccomp_policy.toml` — declarative cap → syscall table (the source
|
||||
//! of truth). `build.rs` parses it and emits an inline-includable Rust
|
||||
//! table to `OUT_DIR/seccomp_policy.rs`.
|
||||
//! - `bpf.rs` — minimal BPF instruction emitter (`compile()` returns a
|
||||
//! `Vec<SockFilter>`).
|
||||
//! - `syscalls.rs` — name → number map, x86_64 / aarch64.
|
||||
//!
|
||||
//! Design choices
|
||||
//! --------------
|
||||
//! - Default action is `SECCOMP_RET_KILL_PROCESS` so a denied syscall
|
||||
//! takes the whole harness down (loud failure, easy to tell apart from
|
||||
//! a normal sink hit).
|
||||
//! - Unknown syscall names from the policy are silently dropped — they
|
||||
//! can't be filtered without a number, and any kernel that recognises
|
||||
//! the name has the number too. Tests assert the policy round-trips.
|
||||
|
||||
pub mod bpf;
|
||||
pub mod syscalls;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
use crate::dynamic::sandbox::seccomp::bpf::{compile, SockFilter, SockFprog};
|
||||
use crate::dynamic::sandbox::seccomp::syscalls::{syscall_number, AUDIT_ARCH};
|
||||
|
||||
include!(concat!(env!("OUT_DIR"), "/seccomp_policy.rs"));
|
||||
|
||||
/// `prctl` option: set the calling thread's `no_new_privs` attribute.
const PR_SET_NO_NEW_PRIVS: i32 = 38;
/// `prctl` option: change the calling thread's seccomp mode.
const PR_SET_SECCOMP: i32 = 22;
/// Seccomp mode in which syscalls are vetted by a BPF filter.
const SECCOMP_MODE_FILTER: u64 = 2;

unsafe extern "C" {
    /// libc `prctl(2)`, declared with all five arguments spelled out.
    fn prctl(option: i32, arg2: u64, arg3: u64, arg4: u64, arg5: u64) -> i32;
    /// libc accessor returning the address of the calling thread's `errno`.
    fn __errno_location() -> *mut i32;
}
|
||||
|
||||
/// Compose the cap-aware syscall allowlist: the `BASE` set unconditionally
|
||||
/// + every `CAP[i]` whose bit is set in `caps`. Names are deduped via a
|
||||
/// `BTreeSet` and resolved to numbers via [`syscall_number`]. Unknown
|
||||
/// names (not in the per-arch table) are silently dropped.
|
||||
pub fn allowed_syscall_numbers(caps: u32) -> Vec<u32> {
|
||||
let mut names: BTreeSet<&'static str> = BTreeSet::new();
|
||||
for &n in BASE.iter() {
|
||||
names.insert(n);
|
||||
}
|
||||
for &(bit, allowlist) in CAP.iter() {
|
||||
if caps & bit != 0 {
|
||||
for &n in allowlist.iter() {
|
||||
names.insert(n);
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut nrs: Vec<u32> = names.into_iter().filter_map(syscall_number).collect();
|
||||
nrs.sort_unstable();
|
||||
nrs.dedup();
|
||||
nrs
|
||||
}
|
||||
|
||||
/// Install a pre-compiled seccomp filter on the calling thread.
|
||||
///
|
||||
/// `program` MUST come from [`bpf::compile`]. Calls
|
||||
/// `prctl(PR_SET_NO_NEW_PRIVS)` first (a kernel prerequisite for
|
||||
/// unprivileged seccomp filter install) then
|
||||
/// `prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)`. Returns the
|
||||
/// underlying `io::Error` on failure.
|
||||
///
|
||||
/// Allocator-free: the function only borrows `program`, so the
|
||||
/// hardening pre_exec callback can use it without violating the
|
||||
/// post-fork allocator ban.
|
||||
pub fn install_compiled_filter(program: &[SockFilter]) -> std::io::Result<()> {
|
||||
if AUDIT_ARCH == 0 || program.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// PR_SET_NO_NEW_PRIVS = 1 is a kernel prerequisite for unprivileged
|
||||
// seccomp filter install. The Phase 17 hardening sequence already
|
||||
// calls it earlier, but installing here too is idempotent and
|
||||
// protects direct callers.
|
||||
let _ = unsafe { prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
|
||||
|
||||
let prog = SockFprog {
|
||||
len: program.len() as u16,
|
||||
filter: program.as_ptr(),
|
||||
};
|
||||
let ret = unsafe {
|
||||
prctl(
|
||||
PR_SET_SECCOMP,
|
||||
SECCOMP_MODE_FILTER,
|
||||
&prog as *const SockFprog as u64,
|
||||
0,
|
||||
0,
|
||||
)
|
||||
};
|
||||
if ret == 0 {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(std::io::Error::from_raw_os_error(unsafe {
|
||||
*__errno_location()
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
/// Convenience wrapper: compose the cap-aware allowlist via
|
||||
/// [`allowed_syscall_numbers`], compile a BPF program, and install it.
|
||||
/// Used by direct callers that don't pre-compile in the parent.
|
||||
pub fn apply_for_caps(caps: u32) -> std::io::Result<()> {
|
||||
if AUDIT_ARCH == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
let nrs = allowed_syscall_numbers(caps);
|
||||
let program: Vec<SockFilter> = compile(&nrs, AUDIT_ARCH);
|
||||
install_compiled_filter(&program)
|
||||
}
|
||||
|
||||
// ── Tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn base_table_is_non_empty() {
        assert!(!BASE.is_empty(), "seccomp BASE allowlist must include stdio + startup syscalls");
    }

    #[test]
    fn cap_table_includes_known_caps() {
        // We declared SQL_QUERY, FILE_IO, SSRF, CODE_EXEC, HTML_ESCAPE,
        // DESERIALIZE, HEADER_INJECTION, OPEN_REDIRECT in the toml; the
        // build script emits one entry per `[cap.X]` table. The exact
        // count can grow as the policy grows; assert ≥ 4 so a future
        // accidental empty-policy regression is loud.
        assert!(CAP.len() >= 4, "CAP table emitted: {:?}", CAP.len());
    }

    #[test]
    fn allowlist_deduplicates_overlapping_caps() {
        // SSRF and HEADER_INJECTION both allow `socket`. Enable every cap
        // bit so overlapping tables are actually merged, then require the
        // result to be strictly increasing (sorted, no duplicates).
        let nrs = allowed_syscall_numbers(u32::MAX);
        assert!(
            nrs.windows(2).all(|pair| pair[0] < pair[1]),
            "allowlist is not sorted and duplicate-free: {:?}",
            nrs
        );
    }

    #[test]
    fn caps_zero_returns_only_base() {
        let base = allowed_syscall_numbers(0);
        let with_caps = allowed_syscall_numbers(0xffff_ffff);
        assert!(base.len() <= with_caps.len());
        // Enabling caps may only add syscalls, never drop a base one.
        assert!(base.iter().all(|nr| with_caps.contains(nr)));
    }

    /// `BASE` includes `read` / `write` / `close` — the minimum the
    /// harness needs to print to stdout and exit cleanly.
    #[test]
    fn base_allows_stdio() {
        let nrs = allowed_syscall_numbers(0);
        for name in ["read", "write", "close"] {
            let nr = syscall_number(name).expect("stdio syscall in syscall map");
            assert!(nrs.contains(&nr), "{} missing from base allowlist", name);
        }
    }
}
|
||||
216
src/dynamic/sandbox/seccomp/seccomp_policy.toml
Normal file
216
src/dynamic/sandbox/seccomp/seccomp_policy.toml
Normal file
|
|
@ -0,0 +1,216 @@
|
|||
# Phase 17 (Track E.1) — seccomp-bpf default-deny allowlist.
|
||||
#
|
||||
# Format
|
||||
# ------
|
||||
# Each `[base]` syscall is allowed unconditionally (every harness needs
|
||||
# them for stdio + interpreter / runtime startup). Each `[cap.<NAME>]`
|
||||
# table adds syscalls allowed only when that `Cap` bit is set in
|
||||
# `SandboxOptions::seccomp_caps`. Unknown / unset caps fall back to the
|
||||
# base list, so a finding with no cap-aware needs runs with the strictest
|
||||
# possible filter.
|
||||
#
|
||||
# `<NAME>` must match a `Cap::*` const declared in `src/labels/mod.rs`.
|
||||
# The list of known names is mirrored in `build.rs::CAP_BIT_FOR_NAME`;
|
||||
# add the bit value alongside the const when extending [`Cap`].
|
||||
#
|
||||
# Build-time codegen
|
||||
# ------------------
|
||||
# `build.rs` reads this file and emits `OUT_DIR/seccomp_policy.rs`
|
||||
# containing two static tables: `BASE` (syscall names) and `CAP` (cap bit → syscall names).
|
||||
# Runtime then maps the syscall names to x86_64 / aarch64 numbers via
|
||||
# `syscalls.rs` and compiles a BPF program per cap-bits.
|
||||
|
||||
[base]
|
||||
allow = [
|
||||
"read",
|
||||
"write",
|
||||
"writev",
|
||||
"readv",
|
||||
"close",
|
||||
"fstat",
|
||||
"lseek",
|
||||
"lstat",
|
||||
"stat",
|
||||
"newfstatat",
|
||||
"statx",
|
||||
"mmap",
|
||||
"mremap",
|
||||
"munmap",
|
||||
"brk",
|
||||
"rt_sigaction",
|
||||
"rt_sigreturn",
|
||||
"rt_sigprocmask",
|
||||
"sigaltstack",
|
||||
"exit",
|
||||
"exit_group",
|
||||
"futex",
|
||||
"set_robust_list",
|
||||
"get_robust_list",
|
||||
"getrandom",
|
||||
"getpid",
|
||||
"gettid",
|
||||
"getuid",
|
||||
"geteuid",
|
||||
"getgid",
|
||||
"getegid",
|
||||
"clock_gettime",
|
||||
"clock_getres",
|
||||
"clock_nanosleep",
|
||||
"nanosleep",
|
||||
"ioctl",
|
||||
"fcntl",
|
||||
"dup",
|
||||
"dup2",
|
||||
"dup3",
|
||||
"pipe",
|
||||
"pipe2",
|
||||
"uname",
|
||||
"arch_prctl",
|
||||
"prlimit64",
|
||||
"getrlimit",
|
||||
"set_tid_address",
|
||||
"rseq",
|
||||
"madvise",
|
||||
"mprotect",
|
||||
"epoll_create1",
|
||||
"epoll_ctl",
|
||||
"epoll_wait",
|
||||
"epoll_pwait",
|
||||
"poll",
|
||||
"ppoll",
|
||||
"select",
|
||||
"pselect6",
|
||||
"wait4",
|
||||
"waitid",
|
||||
"tgkill",
|
||||
"kill",
|
||||
"openat",
|
||||
"open",
|
||||
"access",
|
||||
"faccessat",
|
||||
"faccessat2",
|
||||
"readlink",
|
||||
"readlinkat",
|
||||
"getcwd",
|
||||
"getdents",
|
||||
"getdents64",
|
||||
"sched_getaffinity",
|
||||
"sched_setaffinity",
|
||||
"sched_yield",
|
||||
"prctl",
|
||||
"membarrier",
|
||||
]
|
||||
|
||||
[cap.SQL_QUERY]
|
||||
# SQLite / driver paths use lock + truncate + sync ops on top of the base
|
||||
# openat / read / write set.
|
||||
allow = [
|
||||
"fdatasync",
|
||||
"fsync",
|
||||
"fallocate",
|
||||
"ftruncate",
|
||||
"flock",
|
||||
"pread64",
|
||||
"pwrite64",
|
||||
]
|
||||
|
||||
[cap.FILE_IO]
|
||||
# File reads + directory walks need the dirfd / xattr / link family on
|
||||
# top of the base set.
|
||||
allow = [
|
||||
"pread64",
|
||||
"pwrite64",
|
||||
"readlinkat",
|
||||
"linkat",
|
||||
"symlinkat",
|
||||
"unlinkat",
|
||||
"mkdirat",
|
||||
"renameat",
|
||||
"renameat2",
|
||||
"utimensat",
|
||||
"fchmod",
|
||||
"fchown",
|
||||
"fchmodat",
|
||||
"fchownat",
|
||||
"getxattr",
|
||||
"fgetxattr",
|
||||
"lgetxattr",
|
||||
"listxattr",
|
||||
"flistxattr",
|
||||
"llistxattr",
|
||||
"copy_file_range",
|
||||
"sendfile",
|
||||
]
|
||||
|
||||
[cap.SSRF]
|
||||
# Outbound HTTP needs the socket / connect / TLS handshake set.
|
||||
allow = [
|
||||
"socket",
|
||||
"connect",
|
||||
"sendto",
|
||||
"recvfrom",
|
||||
"sendmsg",
|
||||
"recvmsg",
|
||||
"shutdown",
|
||||
"getsockname",
|
||||
"getpeername",
|
||||
"getsockopt",
|
||||
"setsockopt",
|
||||
"bind",
|
||||
"listen",
|
||||
"accept",
|
||||
"accept4",
|
||||
]
|
||||
|
||||
[cap.CODE_EXEC]
|
||||
# `subprocess.run(...)` / `os.system(...)` payloads need fork + exec.
|
||||
allow = [
|
||||
"clone",
|
||||
"clone3",
|
||||
"fork",
|
||||
"vfork",
|
||||
"execve",
|
||||
"execveat",
|
||||
"wait4",
|
||||
"waitid",
|
||||
]
|
||||
|
||||
[cap.HTML_ESCAPE]
|
||||
# Pure-CPU sanitizer paths need only the base set; this entry exists so
|
||||
# the build-time codegen sees the cap and emits an explicit table even
|
||||
# when the allowlist is empty.
|
||||
allow = []
|
||||
|
||||
[cap.DESERIALIZE]
|
||||
# pickle / Marshal / unserialize paths typically only need the base I/O
|
||||
# set; codegen-only entry.
|
||||
allow = []
|
||||
|
||||
[cap.HEADER_INJECTION]
|
||||
# CRLF-sensitive header sinks share the SSRF socket family.
|
||||
allow = [
|
||||
"socket",
|
||||
"connect",
|
||||
"sendto",
|
||||
"recvfrom",
|
||||
"sendmsg",
|
||||
"recvmsg",
|
||||
"getsockname",
|
||||
"getpeername",
|
||||
"getsockopt",
|
||||
"setsockopt",
|
||||
]
|
||||
|
||||
[cap.OPEN_REDIRECT]
|
||||
allow = [
|
||||
"socket",
|
||||
"connect",
|
||||
"sendto",
|
||||
"recvfrom",
|
||||
"sendmsg",
|
||||
"recvmsg",
|
||||
"getsockname",
|
||||
"getpeername",
|
||||
"getsockopt",
|
||||
"setsockopt",
|
||||
]
|
||||
291
src/dynamic/sandbox/seccomp/syscalls.rs
Normal file
291
src/dynamic/sandbox/seccomp/syscalls.rs
Normal file
|
|
@ -0,0 +1,291 @@
|
|||
//! Syscall name → number map for the architectures Nyx's Linux process
|
||||
//! backend supports. Only the names referenced by
|
||||
//! `seccomp_policy.toml` need to be present; unknown names are silently
|
||||
//! dropped from the BPF allowlist (they cannot be filtered if they have
|
||||
//! no number).
|
||||
//!
|
||||
//! Numbers are pulled from `<asm/unistd_64.h>` (x86_64) and
|
||||
//! `<asm-generic/unistd.h>` (aarch64). When a syscall exists on one
|
||||
//! arch but not the other (e.g. `arch_prctl` on aarch64), the entry is
|
||||
//! omitted on the missing arch and the seccomp filter naturally falls
|
||||
//! through to the deny rule there.
|
||||
|
||||
/// Resolve a syscall name to its x86_64 number.
///
/// Returns `None` for any name that is not in the table below. Arms are
/// listed in ascending numeric order.
#[cfg(target_arch = "x86_64")]
pub fn syscall_number(name: &str) -> Option<u32> {
    let number = match name {
        "read" => 0,
        "write" => 1,
        "open" => 2,
        "close" => 3,
        "stat" => 4,
        "fstat" => 5,
        "lstat" => 6,
        "poll" => 7,
        "lseek" => 8,
        "mmap" => 9,
        "mprotect" => 10,
        "munmap" => 11,
        "brk" => 12,
        "rt_sigaction" => 13,
        "rt_sigprocmask" => 14,
        "rt_sigreturn" => 15,
        "ioctl" => 16,
        "pread64" => 17,
        "pwrite64" => 18,
        "readv" => 19,
        "writev" => 20,
        "access" => 21,
        "pipe" => 22,
        "select" => 23,
        "sched_yield" => 24,
        "mremap" => 25,
        "madvise" => 28,
        "dup" => 32,
        "dup2" => 33,
        "nanosleep" => 35,
        "getpid" => 39,
        "sendfile" => 40,
        "socket" => 41,
        "connect" => 42,
        "accept" => 43,
        "sendto" => 44,
        "recvfrom" => 45,
        "sendmsg" => 46,
        "recvmsg" => 47,
        "shutdown" => 48,
        "bind" => 49,
        "listen" => 50,
        "getsockname" => 51,
        "getpeername" => 52,
        "setsockopt" => 54,
        "getsockopt" => 55,
        "clone" => 56,
        "fork" => 57,
        "vfork" => 58,
        "execve" => 59,
        "exit" => 60,
        "wait4" => 61,
        "kill" => 62,
        "uname" => 63,
        "fcntl" => 72,
        "flock" => 73,
        "fsync" => 74,
        "fdatasync" => 75,
        "ftruncate" => 77,
        "getdents" => 78,
        "getcwd" => 79,
        "readlink" => 89,
        "fchmod" => 91,
        "fchown" => 93,
        "getrlimit" => 97,
        "getuid" => 102,
        "getgid" => 104,
        "geteuid" => 107,
        "getegid" => 108,
        "sigaltstack" => 131,
        "prctl" => 157,
        "arch_prctl" => 158,
        "gettid" => 186,
        "getxattr" => 191,
        "lgetxattr" => 192,
        "fgetxattr" => 193,
        "listxattr" => 194,
        "llistxattr" => 195,
        "flistxattr" => 196,
        "futex" => 202,
        "sched_setaffinity" => 203,
        "sched_getaffinity" => 204,
        "epoll_create" => 213,
        "getdents64" => 217,
        "set_tid_address" => 218,
        "fadvise64" => 221,
        "clock_gettime" => 228,
        "clock_getres" => 229,
        "clock_nanosleep" => 230,
        "exit_group" => 231,
        "epoll_wait" => 232,
        "epoll_ctl" => 233,
        "tgkill" => 234,
        "waitid" => 247,
        "openat" => 257,
        "mkdirat" => 258,
        "fchownat" => 260,
        "newfstatat" => 262,
        "unlinkat" => 263,
        "renameat" => 264,
        "linkat" => 265,
        "symlinkat" => 266,
        "readlinkat" => 267,
        "fchmodat" => 268,
        "faccessat" => 269,
        "pselect6" => 270,
        "ppoll" => 271,
        "set_robust_list" => 273,
        "get_robust_list" => 274,
        "utimensat" => 280,
        "epoll_pwait" => 281,
        "fallocate" => 285,
        "accept4" => 288,
        "epoll_create1" => 291,
        "dup3" => 292,
        "pipe2" => 293,
        "prlimit64" => 302,
        "renameat2" => 316,
        "getrandom" => 318,
        "execveat" => 322,
        "membarrier" => 324,
        "copy_file_range" => 326,
        "statx" => 332,
        "rseq" => 334,
        "clone3" => 435,
        "faccessat2" => 439,
        _ => return None,
    };
    Some(number)
}
|
||||
|
||||
#[cfg(target_arch = "aarch64")]
|
||||
/// Resolves a syscall name to its number in the generic
/// (`asm-generic/unistd.h`) numbering.
///
/// Returns `None` when `name` is not one of the entries listed in the table
/// below.
pub fn syscall_number(name: &str) -> Option<u32> {
    // Kept inside the function so the `#[cfg]` attribute above keeps
    // applying to the function item itself. Entries are ordered by number.
    const GENERIC_TABLE: &[(&str, u32)] = &[
        ("io_setup", 0),
        // extended attributes (read-only side)
        ("getxattr", 8),
        ("lgetxattr", 9),
        ("fgetxattr", 10),
        ("listxattr", 11),
        ("llistxattr", 12),
        ("flistxattr", 13),
        // cwd / eventfd / epoll / fd duplication
        ("getcwd", 17),
        ("lookup_dcookie", 18),
        ("eventfd2", 19),
        ("epoll_create1", 20),
        ("epoll_create", 20), // alias to epoll_create1 on generic
        ("epoll_ctl", 21),
        ("epoll_pwait", 22),
        ("epoll_wait", 22), // alias to epoll_pwait on generic
        ("dup", 23),
        ("dup3", 24),
        ("fcntl", 25),
        ("ioctl", 29),
        ("flock", 32),
        // directory-entry manipulation
        ("mkdirat", 34),
        ("unlinkat", 35),
        ("symlinkat", 36),
        ("linkat", 37),
        ("renameat", 38),
        // file metadata / open / close
        ("ftruncate", 46),
        ("fallocate", 47),
        ("faccessat", 48),
        ("chdir", 49),
        ("fchmod", 52),
        ("fchmodat", 53),
        ("fchownat", 54),
        ("fchown", 55),
        ("openat", 56),
        ("close", 57),
        ("pipe2", 59),
        // read / write family
        ("getdents64", 61),
        ("lseek", 62),
        ("read", 63),
        ("write", 64),
        ("readv", 65),
        ("writev", 66),
        ("pread64", 67),
        ("pwrite64", 68),
        ("sendfile", 71),
        ("pselect6", 72),
        ("ppoll", 73),
        ("readlinkat", 78),
        ("newfstatat", 79),
        ("fstat", 80),
        ("fsync", 82),
        ("fdatasync", 83),
        ("utimensat", 88),
        // process lifetime / futex
        ("exit", 93),
        ("exit_group", 94),
        ("waitid", 95),
        ("set_tid_address", 96),
        ("futex", 98),
        ("set_robust_list", 99),
        ("get_robust_list", 100),
        ("nanosleep", 101),
        // clocks / scheduler
        ("clock_gettime", 113),
        ("clock_getres", 114),
        ("clock_nanosleep", 115),
        ("sched_setaffinity", 122),
        ("sched_getaffinity", 123),
        ("sched_yield", 124),
        // signals
        ("kill", 129),
        ("tgkill", 131),
        ("sigaltstack", 132),
        ("rt_sigsuspend", 133),
        ("rt_sigaction", 134),
        ("rt_sigprocmask", 135),
        ("rt_sigtimedwait", 137),
        ("rt_sigreturn", 139),
        // identity / limits
        ("uname", 160),
        ("getrlimit", 163),
        ("prctl", 167),
        ("getpid", 172),
        ("getuid", 174),
        ("geteuid", 175),
        ("getgid", 176),
        ("getegid", 177),
        ("gettid", 178),
        // sockets
        ("socket", 198),
        ("bind", 200),
        ("listen", 201),
        ("accept", 202),
        ("connect", 203),
        ("getsockname", 204),
        ("getpeername", 205),
        ("sendto", 206),
        ("recvfrom", 207),
        ("setsockopt", 208),
        ("getsockopt", 209),
        ("shutdown", 210),
        ("sendmsg", 211),
        ("recvmsg", 212),
        // memory management / process creation
        ("brk", 214),
        ("munmap", 215),
        ("mremap", 216),
        ("clone", 220),
        ("execve", 221),
        ("mmap", 222),
        ("fadvise64", 223),
        ("mprotect", 226),
        ("msync", 227),
        ("mlock", 228),
        ("munlock", 229),
        ("madvise", 233),
        ("accept4", 242),
        ("wait4", 260),
        ("prlimit64", 261),
        // newer additions
        ("renameat2", 276),
        ("getrandom", 278),
        ("execveat", 281),
        ("membarrier", 283),
        ("copy_file_range", 285),
        ("statx", 291),
        ("rseq", 293),
        ("clone3", 435),
        ("openat2", 437),
        ("faccessat2", 439),
        ("epoll_pwait2", 441),
    ];

    GENERIC_TABLE
        .iter()
        .find(|(known, _)| *known == name)
        .map(|&(_, nr)| nr)
}
|
||||
|
||||
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
|
||||
/// Fallback used when no per-architecture table is compiled in: no syscall
/// name can be resolved, so every lookup yields `None`.
pub fn syscall_number(_name: &str) -> Option<u32> {
    Option::None
}
|
||||
|
||||
/// `AUDIT_ARCH_*` value for the architecture this crate is compiled for
/// (selected at compile time via `target_arch`).
///
/// x86_64: 64-bit flag | little-endian flag | ELF machine 62.
#[cfg(target_arch = "x86_64")]
pub const AUDIT_ARCH: u32 = 0x8000_0000 | 0x4000_0000 | 62;

/// `AUDIT_ARCH_*` value for the architecture this crate is compiled for
/// (selected at compile time via `target_arch`).
///
/// aarch64: 64-bit flag | little-endian flag | ELF machine 183.
#[cfg(target_arch = "aarch64")]
pub const AUDIT_ARCH: u32 = 0x8000_0000 | 0x4000_0000 | 183;

/// Placeholder for architectures without a syscall table: zero.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
pub const AUDIT_ARCH: u32 = 0;
|
||||
124
tests/dynamic_fixtures/hardening/probe.c
Normal file
124
tests/dynamic_fixtures/hardening/probe.c
Normal file
|
|
@ -0,0 +1,124 @@
|
|||
/*
|
||||
* Phase 17 (Track E.1) — process-backend hardening probe.
|
||||
*
|
||||
* Linked statically (no glibc dynamic loader needed) so it runs after
|
||||
* `chroot(workdir)` strips access to /usr/lib. Reads its own
|
||||
* `/proc/self` view to determine which Phase 17 primitives applied,
|
||||
* then prints a structured `key:value` line per primitive. The Rust
|
||||
* test reads stdout and asserts on each line.
|
||||
*
|
||||
* The probe is also reused by the path-traversal case: when
* `argv[1] == "traverse"` it tries to open `/etc/passwd` once more and
* reports either `traverse:blocked` (open failed; exit code 7) or
* `traverse:escaped` (open succeeded, host file visible; exit code 0).
|
||||
*
|
||||
* Built at test runtime with `cc -static -O2 -o probe probe.c`. Test
|
||||
* skips with an eprintln! when the host has no `cc` or no static glibc.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <sys/resource.h>
|
||||
#include <sys/stat.h>
|
||||
#include <errno.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/*
 * Print the first line of /proc/self/status that starts with `needle`,
 * with its trailing newline removed. If the file cannot be opened, or no
 * line matches, print `needle` immediately followed by `fallback` instead.
 */
static void grep_status(const char *needle, const char *fallback) {
    FILE *status = fopen("/proc/self/status", "r");
    if (status == NULL) {
        printf("%s%s\n", needle, fallback);
        return;
    }

    const size_t needle_len = strlen(needle);
    char line[512];
    int matched = 0;
    while (!matched && fgets(line, sizeof(line), status) != NULL) {
        if (strncmp(line, needle, needle_len) != 0) {
            continue;
        }
        // fgets keeps the newline; drop it so exactly one is printed.
        size_t len = strlen(line);
        if (len > 0 && line[len - 1] == '\n') {
            line[len - 1] = '\0';
        }
        printf("%s\n", line);
        matched = 1;
    }

    if (!matched) {
        printf("%s%s\n", needle, fallback);
    }
    fclose(status);
}
|
||||
|
||||
/*
 * Print `<tag>:<soft>/<hard>` for the given resource limit, or
 * `<tag>:err` when getrlimit() fails.
 */
static void print_rlimit(const char *tag, int resource) {
    struct rlimit limit;
    if (getrlimit(resource, &limit) != 0) {
        printf("%s:err\n", tag);
        return;
    }

    // Widen explicitly: rlim_t's width is platform-defined.
    unsigned long long soft = (unsigned long long)limit.rlim_cur;
    unsigned long long hard = (unsigned long long)limit.rlim_max;
    printf("%s:%llu/%llu\n", tag, soft, hard);
}
|
||||
|
||||
/*
 * For each of the user, pid and mnt namespaces, read the
 * /proc/self/ns/<name> symlink (targets look like `user:[4026531837]`)
 * and print `ns_<name>:<target>`, or `ns_<name>:err` when the link
 * cannot be read.
 */
static void probe_namespaces(void) {
    static const char *const kinds[] = {"user", "pid", "mnt"};
    const size_t kind_count = sizeof(kinds) / sizeof(kinds[0]);

    for (size_t k = 0; k < kind_count; k++) {
        char link_path[64];
        char link_target[256];

        snprintf(link_path, sizeof(link_path), "/proc/self/ns/%s", kinds[k]);
        // readlink does not NUL-terminate; one byte is reserved for that.
        ssize_t len = readlink(link_path, link_target, sizeof(link_target) - 1);
        if (len <= 0) {
            printf("ns_%s:err\n", kinds[k]);
            continue;
        }
        link_target[len] = '\0';
        printf("ns_%s:%s\n", kinds[k], link_target);
    }
}
|
||||
|
||||
/*
 * Try to open `/etc/passwd`. A failed open prints
 * `chroot:blocked errno=<n>`; a successful open prints
 * `chroot:escaped read=<bytes>`, where <bytes> is the size of the first
 * read (0 when the read returned nothing or failed).
 */
static void probe_chroot(void) {
    int fd = open("/etc/passwd", O_RDONLY);
    if (fd < 0) {
        printf("chroot:blocked errno=%d\n", errno);
        return;
    }

    char scratch[64];
    ssize_t got = read(fd, scratch, sizeof(scratch) - 1);
    close(fd);

    // Only the byte count is reported; the contents are never printed.
    printf("chroot:escaped read=%zd\n", got > 0 ? got : (ssize_t)0);
}
|
||||
|
||||
/*
 * Entry point. Prints one `key:value` line per hardening primitive, then:
 *   - with `traverse` as argv[1]: re-tries opening `/etc/passwd`, prints
 *     `traverse:escaped` and returns 0 if the open succeeded, otherwise
 *     prints `traverse:blocked` and returns 7;
 *   - otherwise: prints the `__NYX_PROBE_DONE__` sentinel and returns 0.
 */
int main(int argc, char **argv) {
    // stdout is fully buffered when it is a pipe. Switch to line buffering
    // so that lines already produced still reach the reader if the process
    // is killed before a normal exit flushes the buffer.
    setvbuf(stdout, NULL, _IOLBF, 0);

    grep_status("NoNewPrivs:", "\t?");
    grep_status("Seccomp:", "\t?");
    print_rlimit("rlimit_as", RLIMIT_AS);
    print_rlimit("rlimit_cpu", RLIMIT_CPU);
    print_rlimit("rlimit_nofile", RLIMIT_NOFILE);
    probe_namespaces();
    probe_chroot();

    if (argc > 1 && strcmp(argv[1], "traverse") == 0) {
        // Path-traversal acceptance case: a payload that tries to read
        // /etc/passwd outside the workdir. A blocked open exits non-zero;
        // the `chroot:blocked` line printed by probe_chroot() above and
        // the `traverse:blocked` line below are the stdout markers.
        int fd = open("/etc/passwd", O_RDONLY);
        if (fd >= 0) {
            // The open succeeded, so the host file is reachable: exit 0.
            close(fd);
            printf("traverse:escaped\n");
            return 0;
        }
        printf("traverse:blocked\n");
        return 7;
    }

    printf("__NYX_PROBE_DONE__\n");
    return 0;
}
|
||||
|
|
@ -58,12 +58,8 @@ mod escape_tests {
|
|||
timeout: Duration::from_secs(10),
|
||||
memory_mib: 256,
|
||||
backend: SandboxBackend::Docker,
|
||||
env_passthrough: vec![],
|
||||
output_limit: 65536,
|
||||
network_policy: NetworkPolicy::None,
|
||||
probe_channel: None,
|
||||
extra_env: vec![],
|
||||
stub_harness: None,
|
||||
..SandboxOptions::default()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
478
tests/sandbox_hardening_linux.rs
Normal file
478
tests/sandbox_hardening_linux.rs
Normal file
|
|
@ -0,0 +1,478 @@
|
|||
//! Phase 17 (Track E.1) — Linux process backend hardening acceptance tests.
|
||||
//!
|
||||
//! Each primitive in the Phase 17 sequence is exercised against a
|
||||
//! statically-linked C probe (`tests/dynamic_fixtures/hardening/probe.c`)
|
||||
//! that prints its own `/proc/self` view to stdout. The Rust test reads
|
||||
//! stdout back and asserts on the expected line per primitive.
|
||||
//!
|
||||
//! The probe is built once per test run via `cc -static -O2`. Hosts
|
||||
//! without `cc` or without a static-link-capable libc skip with an
|
||||
//! `eprintln!` rather than failing — the suite's authoritative gate is
|
||||
//! the Linux CI matrix row that has both.
|
||||
//!
|
||||
//! Run with:
|
||||
//! `cargo nextest run --features dynamic --test sandbox_hardening_linux`
|
||||
|
||||
#[cfg(all(feature = "dynamic", target_os = "linux"))]
|
||||
mod hardening_tests {
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::process::Command;
|
||||
use std::sync::OnceLock;
|
||||
use std::time::Duration;
|
||||
|
||||
use nyx_scanner::dynamic::harness::BuiltHarness;
|
||||
use nyx_scanner::dynamic::sandbox::process_linux::{
|
||||
last_hardening_outcome, reset_last_hardening_outcome, HardeningLevel, PrimitiveStatus,
|
||||
};
|
||||
use nyx_scanner::dynamic::sandbox::seccomp;
|
||||
use nyx_scanner::dynamic::sandbox::{
|
||||
self, ProcessHardeningProfile, SandboxBackend, SandboxOptions,
|
||||
};
|
||||
|
||||
// ── Probe build ───────────────────────────────────────────────────────────
|
||||
|
||||
/// Path to the freshly-built probe binary, shared across every test.
|
||||
static PROBE_BINARY: OnceLock<Option<PathBuf>> = OnceLock::new();
|
||||
|
||||
fn probe_path() -> Option<&'static Path> {
|
||||
PROBE_BINARY
|
||||
.get_or_init(|| build_probe_once())
|
||||
.as_deref()
|
||||
}
|
||||
|
||||
fn build_probe_once() -> Option<PathBuf> {
|
||||
let cc = std::env::var("CC").unwrap_or_else(|_| "cc".to_owned());
|
||||
let src = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
|
||||
.join("tests/dynamic_fixtures/hardening/probe.c");
|
||||
let out_dir = std::env::temp_dir().join("nyx-hardening-probe");
|
||||
let _ = std::fs::create_dir_all(&out_dir);
|
||||
let out_bin = out_dir.join("probe");
|
||||
|
||||
// Try a static link first (works under glibc-dev with libc.a, or
|
||||
// musl-cross). Fall back to dynamic if that fails — the probe
|
||||
// still functions before chroot but the chroot test will skip.
|
||||
let static_status = Command::new(&cc)
|
||||
.args(["-static", "-O2", "-o"])
|
||||
.arg(&out_bin)
|
||||
.arg(&src)
|
||||
.status();
|
||||
if matches!(&static_status, Ok(s) if s.success()) {
|
||||
return Some(out_bin);
|
||||
}
|
||||
|
||||
let dyn_status = Command::new(&cc)
|
||||
.args(["-O2", "-o"])
|
||||
.arg(&out_bin)
|
||||
.arg(&src)
|
||||
.status();
|
||||
if matches!(&dyn_status, Ok(s) if s.success()) {
|
||||
// Mark via env so the chroot test can branch.
|
||||
unsafe { std::env::set_var("NYX_PROBE_DYNAMIC", "1") };
|
||||
return Some(out_bin);
|
||||
}
|
||||
|
||||
eprintln!(
|
||||
"SKIP: could not build hardening probe with {cc:?} (static={static_status:?}, \
|
||||
dyn={dyn_status:?})"
|
||||
);
|
||||
None
|
||||
}
|
||||
|
||||
fn probe_is_static() -> bool {
|
||||
std::env::var_os("NYX_PROBE_DYNAMIC").is_none()
|
||||
}
|
||||
|
||||
// ── Sandbox helpers ───────────────────────────────────────────────────────
|
||||
|
||||
fn strict_opts() -> SandboxOptions {
|
||||
SandboxOptions {
|
||||
timeout: Duration::from_secs(10),
|
||||
memory_mib: 256,
|
||||
backend: SandboxBackend::Process,
|
||||
output_limit: 65536,
|
||||
process_hardening: ProcessHardeningProfile::Strict,
|
||||
// Keep seccomp_caps = 0 so only the BASE allowlist applies:
|
||||
// the probe needs `read`, `write`, `openat`, `readlink`, etc.,
|
||||
// all of which are in the base set.
|
||||
seccomp_caps: 0,
|
||||
..SandboxOptions::default()
|
||||
}
|
||||
}
|
||||
|
||||
fn standard_opts() -> SandboxOptions {
|
||||
SandboxOptions {
|
||||
timeout: Duration::from_secs(10),
|
||||
memory_mib: 256,
|
||||
backend: SandboxBackend::Process,
|
||||
output_limit: 65536,
|
||||
process_hardening: ProcessHardeningProfile::Standard,
|
||||
..SandboxOptions::default()
|
||||
}
|
||||
}
|
||||
|
||||
fn build_harness_with_probe(workdir: &Path, args: &[&str]) -> BuiltHarness {
|
||||
// Stage the probe inside the workdir so `chroot(workdir)` doesn't
|
||||
// leave the binary unreachable mid-exec.
|
||||
let probe_src = probe_path().expect("probe must be built").to_path_buf();
|
||||
let probe_dst = workdir.join("probe");
|
||||
std::fs::copy(&probe_src, &probe_dst).expect("copy probe into workdir");
|
||||
// Ensure it's executable (cc preserves +x but be explicit).
|
||||
use std::os::unix::fs::PermissionsExt;
|
||||
let mut perms = std::fs::metadata(&probe_dst).unwrap().permissions();
|
||||
perms.set_mode(0o755);
|
||||
std::fs::set_permissions(&probe_dst, perms).unwrap();
|
||||
|
||||
let mut command: Vec<String> = vec![probe_dst.to_string_lossy().into_owned()];
|
||||
for a in args {
|
||||
command.push((*a).to_string());
|
||||
}
|
||||
|
||||
BuiltHarness {
|
||||
workdir: workdir.to_path_buf(),
|
||||
command,
|
||||
env: vec![],
|
||||
source: String::new(),
|
||||
entry_source: String::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a fresh temporary directory to act as the harness workdir;
/// panics if one cannot be created.
fn workdir() -> tempfile::TempDir {
    match tempfile::TempDir::new() {
        Ok(dir) => dir,
        Err(err) => panic!("temp dir: {err:?}"),
    }
}
|
||||
|
||||
fn stdout_string(out: &sandbox::SandboxOutcome) -> String {
|
||||
String::from_utf8_lossy(&out.stdout).into_owned()
|
||||
}
|
||||
|
||||
/// Panics unless at least one line of `stdout` starts with `prefix`; the
/// panic message includes the full `stdout` for diagnosis.
fn assert_line(stdout: &str, prefix: &str) {
    let matched = stdout.lines().find(|line| line.starts_with(prefix));
    assert!(
        matched.is_some(),
        "expected stdout to contain a line starting with {prefix:?}; full stdout:\n{stdout}"
    );
}
|
||||
|
||||
// ── Tests ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Sanity gate: under the Strict profile the probe must run to completion
/// and print its end-of-run sentinel. All other tests presume this passes.
#[test]
fn probe_runs_under_strict_profile() {
    if probe_path().is_none() {
        return;
    }
    let scratch = workdir();
    let harness = build_harness_with_probe(scratch.path(), &[]);
    reset_last_hardening_outcome();
    let run = sandbox::run(&harness, b"", &strict_opts()).expect("sandbox::run");
    let stdout = stdout_string(&run);
    eprintln!("probe stdout under strict:\n{stdout}");
    // The probe prints `__NYX_PROBE_DONE__` after the primitive lines; if
    // it is absent, the binary stopped before reaching the end
    // (e.g. seccomp killed it).
    assert_line(&stdout, "__NYX_PROBE_DONE__");
}
|
||||
|
||||
/// Under the Strict profile the probe's view of `/proc/self/status` must
/// report `NoNewPrivs:` as `1`.
#[test]
fn no_new_privs_set_under_strict() {
    if probe_path().is_none() {
        return;
    }
    let scratch = workdir();
    let harness = build_harness_with_probe(scratch.path(), &[]);
    let run = sandbox::run(&harness, b"", &strict_opts()).expect("sandbox::run");
    let stdout = stdout_string(&run);
    // /proc/self/status's `NoNewPrivs:` line is `1` after PR_SET_NO_NEW_PRIVS.
    let no_new_privs_seen = stdout.contains("NoNewPrivs:\t1");
    assert!(
        no_new_privs_seen,
        "expected NoNewPrivs:1 line; full stdout:\n{stdout}"
    );
}
|
||||
|
||||
/// Under the Strict profile the probe's soft `RLIMIT_CPU` must be at most
/// 30 seconds.
#[test]
fn rlimit_cpu_capped_under_strict() {
    if probe_path().is_none() {
        return;
    }
    let scratch = workdir();
    let harness = build_harness_with_probe(scratch.path(), &[]);
    let run = sandbox::run(&harness, b"", &strict_opts()).expect("sandbox::run");
    let stdout = stdout_string(&run);

    // RLIMIT_CPU is set to timeout * 2 = 20 seconds in strict_opts.
    // Under Standard the value would be RLIM_INFINITY.
    assert_line(&stdout, "rlimit_cpu:");
    let limits = stdout
        .lines()
        .find_map(|line| line.strip_prefix("rlimit_cpu:"))
        .unwrap_or_else(|| panic!("rlimit_cpu line missing from stdout:\n{stdout}"));
    let (soft, _hard) = limits.split_once('/').expect("rlimit_cpu format");
    let soft: u64 = soft.parse().expect("numeric rlimit");
    assert!(soft <= 30, "RLIMIT_CPU not capped: {soft}");
}
|
||||
|
||||
/// Under the Strict profile the probe's soft `RLIMIT_NOFILE` must be at
/// most 256.
#[test]
fn rlimit_nofile_capped_under_strict() {
    if probe_path().is_none() {
        return;
    }
    let scratch = workdir();
    let harness = build_harness_with_probe(scratch.path(), &[]);
    let run = sandbox::run(&harness, b"", &strict_opts()).expect("sandbox::run");
    let stdout = stdout_string(&run);

    let limits = stdout
        .lines()
        .find_map(|line| line.strip_prefix("rlimit_nofile:"))
        .unwrap_or_else(|| panic!("rlimit_nofile line missing from stdout:\n{stdout}"));
    let (soft, _hard) = limits.split_once('/').expect("rlimit_nofile format");
    let soft: u64 = soft.parse().expect("numeric rlimit");
    assert!(soft <= 256, "RLIMIT_NOFILE not capped: {soft}");
}
|
||||
|
||||
/// Under the Strict profile with `memory_mib = 256` the probe's soft
/// `RLIMIT_AS` must be exactly 4 GiB.
#[test]
fn rlimit_as_capped_under_strict() {
    if probe_path().is_none() {
        return;
    }
    let scratch = workdir();
    let harness = build_harness_with_probe(scratch.path(), &[]);
    let run = sandbox::run(&harness, b"", &strict_opts()).expect("sandbox::run");
    let stdout = stdout_string(&run);

    let limits = stdout
        .lines()
        .find_map(|line| line.strip_prefix("rlimit_as:"))
        .unwrap_or_else(|| panic!("rlimit_as line missing from stdout:\n{stdout}"));
    let (soft, _hard) = limits.split_once('/').expect("rlimit_as format");
    let soft: u64 = soft.parse().expect("numeric rlimit");
    // memory_mib=256 → cap = max(256*8, 4096) MiB = 4 GiB
    let four_gib = 4_u64 << 30;
    assert_eq!(soft, four_gib, "RLIMIT_AS not 4 GiB: {soft}");
}
|
||||
|
||||
/// Namespace unsharing is best-effort.
///
/// When the recorded `unshare` status is `Applied`, the probe's
/// `/proc/self/ns/user` link target must differ from this test process's.
/// When it is `Failed`, the overall level must be `Partial` or `None`
/// rather than the run failing. `Skipped` is never acceptable under the
/// Strict profile.
#[test]
fn unshare_namespaces_when_kernel_allows() {
    if probe_path().is_none() {
        return;
    }
    let scratch = workdir();
    let harness = build_harness_with_probe(scratch.path(), &[]);
    reset_last_hardening_outcome();
    let run = sandbox::run(&harness, b"", &strict_opts()).expect("sandbox::run");
    let stdout = stdout_string(&run);
    let outcome = last_hardening_outcome().expect("hardening outcome recorded");

    match outcome.unshare {
        PrimitiveStatus::Applied => {
            let child_ns_line = stdout
                .lines()
                .find(|line| line.starts_with("ns_user:"))
                .expect("ns_user: line in stdout");
            // Compare against the parent's user-ns link; if the parent's
            // link cannot be read there is nothing to compare.
            if let Ok(parent_link) = std::fs::read_link("/proc/self/ns/user") {
                let parent_ns = parent_link.to_string_lossy().into_owned();
                assert!(
                    !child_ns_line.contains(parent_ns.as_str()),
                    "child user ns identical to parent — unshare reported Applied but ns inode unchanged"
                );
            }
        }
        PrimitiveStatus::Failed(errno) => {
            eprintln!(
                "unshare returned errno={errno} (likely unprivileged_userns_clone=0); \
                 accepting Partial level"
            );
            assert!(matches!(
                outcome.level(),
                HardeningLevel::Partial | HardeningLevel::None
            ));
        }
        PrimitiveStatus::Skipped => panic!("unshare must not be Skipped under Strict profile"),
    }
}
|
||||
|
||||
/// With a statically linked probe under the Strict profile, an `Applied`
/// chroot must leave `/etc/passwd` unopenable, so the probe prints
/// `chroot:blocked`. A `Failed` chroot is tolerated as long as the overall
/// level is not `Full`.
#[test]
fn chroot_blocks_etc_passwd() {
    if probe_path().is_none() {
        return;
    }
    if !probe_is_static() {
        eprintln!("SKIP: probe is dynamically linked — chroot would block its loader before main()");
        return;
    }
    let scratch = workdir();
    let harness = build_harness_with_probe(scratch.path(), &[]);
    reset_last_hardening_outcome();
    let run = sandbox::run(&harness, b"", &strict_opts()).expect("sandbox::run");
    let stdout = stdout_string(&run);
    let outcome = last_hardening_outcome().expect("hardening outcome recorded");

    match outcome.chroot {
        PrimitiveStatus::Applied => {
            let blocked = stdout.contains("chroot:blocked");
            assert!(
                blocked,
                "chroot reported Applied but /etc/passwd was readable; full stdout:\n{stdout}"
            );
        }
        PrimitiveStatus::Failed(errno) => {
            // Common failure: EPERM when the kernel blocks chroot
            // for unprivileged callers without CAP_SYS_CHROOT, or
            // EINVAL when the workdir doesn't satisfy the
            // canonicalisation precondition. Accept Partial.
            eprintln!("chroot returned errno={errno}; recorded as Partial");
            assert_ne!(outcome.level(), HardeningLevel::Full);
        }
        PrimitiveStatus::Skipped => panic!("chroot must not be Skipped under Strict profile"),
    }
}
|
||||
|
||||
/// Path-traversal acceptance case from the phase deliverables.
///
/// Runs the probe with the `traverse` argument. When the chroot status is
/// `Applied`, the run must exit with code 7, report no sink hit, and
/// contain one of the blocked markers in stdout. When chroot did not
/// apply, the test only logs a skip notice.
#[test]
fn path_traversal_returns_not_confirmed_when_chroot_holds() {
    if probe_path().is_none() {
        return;
    }
    if !probe_is_static() {
        eprintln!("SKIP: probe is dynamically linked — chroot test requires static link");
        return;
    }
    let scratch = workdir();
    let harness = build_harness_with_probe(scratch.path(), &["traverse"]);
    reset_last_hardening_outcome();
    let run = sandbox::run(&harness, b"", &strict_opts()).expect("sandbox::run");
    let stdout = stdout_string(&run);
    let outcome = last_hardening_outcome().expect("hardening outcome recorded");

    if !matches!(outcome.chroot, PrimitiveStatus::Applied) {
        eprintln!(
            "SKIP: chroot did not apply (status={:?}); cannot assert traversal blocked",
            outcome.chroot,
        );
        return;
    }

    // NotConfirmed shape: the verifier maps a non-zero exit + no
    // sink-hit sentinel to NotConfirmed. We assert the two
    // structural pieces here directly.
    assert_eq!(
        run.exit_code,
        Some(7),
        "probe exit code mismatch — full stdout:\n{stdout}"
    );
    assert!(
        !run.sink_hit,
        "sink hit should be absent on a traversal-blocked run"
    );
    let blocked_marker_seen = ["chroot blocked", "chroot:blocked", "traverse:blocked"]
        .iter()
        .any(|marker| stdout.contains(marker));
    assert!(
        blocked_marker_seen,
        "expected `chroot blocked` marker in probe stdout; got:\n{stdout}"
    );
}
|
||||
|
||||
/// When the recorded seccomp status is `Applied`, the probe's
/// `/proc/self/status` must show `Seccomp:` as `2` (filter mode). A
/// `Failed` status is tolerated as long as the overall level is not
/// `Full`; `Skipped` is never acceptable under the Strict profile.
#[test]
fn seccomp_filter_installed_under_strict() {
    if probe_path().is_none() {
        return;
    }
    let scratch = workdir();
    let harness = build_harness_with_probe(scratch.path(), &[]);
    reset_last_hardening_outcome();
    let run = sandbox::run(&harness, b"", &strict_opts()).expect("sandbox::run");
    let stdout = stdout_string(&run);
    let outcome = last_hardening_outcome().expect("hardening outcome recorded");

    match outcome.seccomp {
        PrimitiveStatus::Applied => {
            let filter_active = stdout.contains("Seccomp:\t2");
            assert!(
                filter_active,
                "Seccomp:2 missing — filter not active in /proc/self/status; stdout:\n{stdout}"
            );
        }
        PrimitiveStatus::Failed(errno) => {
            eprintln!(
                "SKIP: seccomp prctl returned errno={errno} (typical when running under \
                 a sandbox that already locked the syscall down); accepting Partial level"
            );
            assert_ne!(outcome.level(), HardeningLevel::Full);
        }
        PrimitiveStatus::Skipped => panic!("seccomp must not be Skipped under Strict profile"),
    }
}
|
||||
|
||||
/// The Standard profile must record the `Baseline` level: `no_new_privs`
/// and `rlimit_as` applied, with `chroot`, `seccomp` and `unshare` all
/// skipped. Only the recorded hardening outcome is asserted; the probe's
/// output and exit code are not inspected.
#[test]
fn standard_profile_skips_chroot_and_seccomp() {
    if probe_path().is_none() {
        return;
    }
    let scratch = workdir();
    let harness = build_harness_with_probe(scratch.path(), &[]);
    reset_last_hardening_outcome();
    // The run must succeed, but what the probe printed (`chroot:blocked`
    // or `chroot:escaped`) is deliberately not part of this test.
    let _run = sandbox::run(&harness, b"", &standard_opts()).expect("sandbox::run");
    let outcome = last_hardening_outcome().expect("hardening outcome recorded");

    assert_eq!(outcome.level(), HardeningLevel::Baseline);

    // Baseline primitives.
    assert!(matches!(outcome.no_new_privs, PrimitiveStatus::Applied));
    assert!(matches!(outcome.rlimit_as, PrimitiveStatus::Applied));

    // None of the strict-only primitives should have been attempted.
    assert!(matches!(outcome.chroot, PrimitiveStatus::Skipped));
    assert!(matches!(outcome.seccomp, PrimitiveStatus::Skipped));
    assert!(matches!(outcome.unshare, PrimitiveStatus::Skipped));
}
|
||||
|
||||
/// The allowlist returned for `seccomp_caps = 0` must contain `read`,
/// `write`, `close`, `openat`, `exit_group` and `fstat`, and each of those
/// names must resolve in the per-arch syscall table. No sandbox is run.
#[test]
fn seccomp_policy_includes_essential_syscalls() {
    const ESSENTIAL: [&str; 6] = ["read", "write", "close", "openat", "exit_group", "fstat"];

    let allowed = seccomp::allowed_syscall_numbers(0);
    for name in ESSENTIAL {
        let Some(nr) = seccomp::syscalls::syscall_number(name) else {
            panic!("syscall {name} missing from per-arch table");
        };
        assert!(
            allowed.contains(&nr),
            "BASE seccomp allowlist missing essential syscall {name} (nr={nr})"
        );
    }
}
|
||||
}
|
||||
|
||||
// Non-Linux placeholder so `cargo nextest run --test sandbox_hardening_linux`
|
||||
// doesn't fail with "no tests to run" on macOS / Windows CI rows. The real
|
||||
// suite gates every test on `target_os = "linux"`.
|
||||
#[cfg(not(all(feature = "dynamic", target_os = "linux")))]
|
||||
mod non_linux_placeholder {
    /// Stand-in test compiled when the real suite is configured out; it
    /// only logs why nothing ran.
    #[test]
    fn linux_only_suite_skipped_on_this_target() {
        const NOTICE: &str =
            "SKIP: tests/sandbox_hardening_linux.rs requires `--features dynamic` and target_os = linux";
        eprintln!("{NOTICE}");
    }
}
|
||||
|
||||
Loading…
Add table
Add a link
Reference in a new issue