[pitboss] phase 17: Track E.1 — Linux process backend hardening

This commit is contained in:
pitboss 2026-05-15 09:44:20 -05:00
parent a4f890797a
commit dbad78fafa
10 changed files with 2414 additions and 68 deletions

View file

@ -29,6 +29,14 @@ use std::path::Path;
use std::sync::{Arc, OnceLock};
use std::time::{Duration, Instant};
#[cfg(target_os = "linux")]
pub mod process_linux;
#[cfg(target_os = "linux")]
pub mod seccomp;
#[cfg(target_os = "linux")]
pub use process_linux::{HardeningLevel, HardeningOutcome};
// ── Harness interpretation probe ──────────────────────────────────────────────
/// Returns true when the harness is driven by an interpreter (Python, Node, …)
@ -159,6 +167,40 @@ pub struct SandboxOptions {
/// into [`crate::dynamic::oracle::oracle_fired_with_stubs`].
/// `None` when the spec's `stubs_required` is empty.
pub stub_harness: Option<Arc<crate::dynamic::stubs::StubHarness>>,
/// Phase 17 (Track E.1): cap bits used to minimise the seccomp-bpf
/// allowlist applied to the Linux process backend. When `0`, the
/// process backend installs only the cap-independent `base` allowlist
/// from [`seccomp::seccomp_policy.toml`]; when non-zero, every cap bit
/// set adds its allowlisted syscalls on top. Other backends ignore
/// this field.
pub seccomp_caps: u32,
/// Phase 17 (Track E.1): hardening profile applied by the Linux
/// process backend. See [`ProcessHardeningProfile`] for the per-
/// variant primitive matrix.
pub process_hardening: ProcessHardeningProfile,
}
/// Phase 17 (Track E.1): selects which subset of the Linux process-
/// backend hardening primitives is applied.
///
/// - [`ProcessHardeningProfile::Standard`] — the historical baseline:
///   `prctl(PR_SET_NO_NEW_PRIVS)` + `setrlimit(RLIMIT_AS)` only. No
///   namespaces, no chroot, no seccomp. Default for back-compat.
/// - [`ProcessHardeningProfile::Strict`] — full Phase 17 sequence:
///   no-new-privs, all rlimits, namespace unshare, chroot to workdir,
///   default-deny seccomp filter scoped to [`SandboxOptions::seccomp_caps`].
///   Each primitive is best-effort; failures degrade to
///   [`HardeningLevel::Partial`] without aborting the run.
///
/// `Default` is derived; the `#[default]` marker on `Standard` keeps the
/// default value identical to the former hand-written impl.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum ProcessHardeningProfile {
    /// Historical baseline: no-new-privs + `RLIMIT_AS` only.
    #[default]
    Standard,
    /// Full Phase 17 hardening sequence.
    Strict,
}
impl SandboxOptions {
@ -186,6 +228,8 @@ impl Default for SandboxOptions {
probe_channel: None,
extra_env: Vec::new(),
stub_harness: None,
seccomp_caps: 0,
process_hardening: ProcessHardeningProfile::Standard,
}
}
}
@ -1207,25 +1251,35 @@ fn run_process(
cmd.env("NYX_PAYLOAD", std::ffi::OsStr::from_bytes(payload_bytes));
}
// Enforce memory cap before exec on Linux via RLIMIT_AS + PR_SET_NO_NEW_PRIVS.
// RLIMIT_AS limits total virtual address space. Python uses significantly
// more virtual AS than RSS (shared libs, mmap arenas), so the enforced
// limit is memory_mib * 8 with a floor of 4 GiB.
// Phase 17 (Track E.1): install the Linux process-backend hardening
// sequence — `prctl(PR_SET_NO_NEW_PRIVS)`, `setrlimit` (CPU/NOFILE/AS),
// `unshare(CLONE_NEWPID|CLONE_NEWNS|CLONE_NEWUSER)`, `chroot` to the
// workdir, and a default-deny seccomp-bpf filter scoped to
// `opts.seccomp_caps`. Each primitive is best-effort: failures
// downgrade to `HardeningLevel::Partial` instead of aborting the run.
#[cfg(target_os = "linux")]
{
use std::os::unix::process::CommandExt;
let memory_mib = opts.memory_mib;
// Safety: called in the child after fork but before exec; no allocator use.
unsafe {
cmd.pre_exec(move || {
rlimit_as_linux(memory_mib)?;
prctl_no_new_privs()
});
}
}
let collector = process_linux::install_pre_exec(&mut cmd, opts, &harness.workdir);
let start = Instant::now();
let mut child = cmd.spawn().map_err(SandboxError::Spawn)?;
let child_result = cmd.spawn();
#[cfg(target_os = "linux")]
let outcome_joiner;
let mut child = match child_result {
Ok(c) => {
#[cfg(target_os = "linux")]
{
outcome_joiner = collector.map(|c| c.after_spawn());
}
c
}
Err(e) => {
#[cfg(target_os = "linux")]
if let Some(c) = collector {
c.forget();
}
return Err(SandboxError::Spawn(e));
}
};
let timeout = opts.timeout;
let timed_out = std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false));
@ -1270,6 +1324,14 @@ fn run_process(
let status = child.wait().map_err(SandboxError::Io)?;
// Phase 17 (Track E.1): wait for the per-primitive HardeningOutcome
// drain thread before returning so callers (tests + telemetry) read
// a settled value via `process_linux::last_hardening_outcome()`.
#[cfg(target_os = "linux")]
if let Some(joiner) = outcome_joiner {
joiner.await_outcome();
}
let stdout_buf = stdout_handle
.and_then(|h| h.join().ok())
.and_then(|r| r.ok())
@ -1337,52 +1399,9 @@ fn base64_encode(data: &[u8]) -> String {
// ── Linux-specific syscall wrappers ──────────────────────────────────────────
/// Cap the calling process's virtual address space (`RLIMIT_AS`); meant
/// to run inside a `pre_exec` hook on Linux.
///
/// `memory_mib` is the configured cap. The enforced value is
/// `max(memory_mib * 8, 4096)` MiB, written to both the soft and the hard
/// limit, so mmap-heavy interpreters keep enough headroom while runaway
/// allocations are still bounded. Both multiplications saturate rather
/// than overflow.
///
/// # Errors
/// Returns the last OS error when `setrlimit` reports failure.
#[cfg(target_os = "linux")]
fn rlimit_as_linux(memory_mib: u64) -> std::io::Result<()> {
    /// Mirror of C `struct rlimit` (assumes a 64-bit `rlim_t` — TODO confirm
    /// for every supported target).
    #[repr(C)]
    struct Rlimit {
        cur: u64,
        max: u64,
    }
    unsafe extern "C" {
        fn setrlimit(resource: i32, rlim: *const Rlimit) -> i32;
    }
    const RLIMIT_AS: i32 = 9;
    const BYTES_PER_MIB: u64 = 1024 * 1024;
    const FLOOR_MIB: u64 = 4096;

    let limit_bytes = memory_mib
        .saturating_mul(8)
        .max(FLOOR_MIB)
        .saturating_mul(BYTES_PER_MIB);
    let limit = Rlimit { cur: limit_bytes, max: limit_bytes };

    // SAFETY: `limit` is a live `#[repr(C)]` value for the whole call and
    // `setrlimit` only reads through the pointer.
    match unsafe { setrlimit(RLIMIT_AS, &limit) } {
        0 => Ok(()),
        _ => Err(std::io::Error::last_os_error()),
    }
}
/// Request `PR_SET_NO_NEW_PRIVS = 1` for the calling process; meant to run
/// inside a `pre_exec` hook on Linux.
///
/// Best-effort by design: the `prctl` return value is discarded and the
/// function always returns `Ok(())`, so a restricted environment that
/// rejects the call does not abort the child.
#[cfg(target_os = "linux")]
fn prctl_no_new_privs() -> std::io::Result<()> {
    unsafe extern "C" {
        fn prctl(option: i32, arg2: u64, arg3: u64, arg4: u64, arg5: u64) -> i32;
    }
    const PR_SET_NO_NEW_PRIVS: i32 = 38;

    // SAFETY: plain integer arguments; no pointers are passed to the kernel.
    // The result is intentionally ignored (see the function docs).
    let _ignored = unsafe { prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
    Ok(())
}
// `rlimit_as_linux`, `prctl_no_new_privs`, and the rest of the Linux process
// backend hardening sequence now live in [`process_linux`]. See
// [`process_linux::install_pre_exec`] for the call-site.
#[cfg(unix)]
fn libc_kill(pid: i32, sig: i32) -> i32 {

View file

@ -0,0 +1,657 @@
//! Phase 17 (Track E.1) — Linux process backend hardening.
//!
//! Owns the `pre_exec` sequence applied to every harness child started by
//! [`super::run_process`] on Linux:
//!
//! 1. `prctl(PR_SET_NO_NEW_PRIVS)` — block setuid / file-cap escalation.
//! 2. `setrlimit(RLIMIT_CPU)` — cap CPU time so a runaway payload exits.
//! 3. `setrlimit(RLIMIT_NOFILE)` — cap open fds; the harness receives only
//! a small number of stdio + probe fds from the parent.
//! 4. `setrlimit(RLIMIT_AS)` — cap virtual address space; multiplied by 8
//! with a 4 GiB floor so interpreted runtimes still start.
//! 5. `unshare(CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS)` — drop the
//! host PID, mount, and user namespace views.
//! 6. `chroot(workdir)` + `chdir("/")` — isolate filesystem reach to the
//! harness workdir; payloads that try to read `/etc/passwd` see the
//! harness root, not the host one.
//! 7. seccomp-bpf default-deny filter scoped to the cap bits the spec
//! actually exercises (see [`super::seccomp`]).
//!
//! Each primitive is best-effort: failures are recorded into the per-
//! child [`HardeningOutcome`] record that the parent reads back over the
//! status pipe after exec, so the verifier can downgrade to
//! [`HardeningLevel::Partial`] without aborting the harness run.
//!
//! The pre_exec callback runs in the child between fork(2) and execve(2)
//! — no Rust allocator use, no heap-borrowing closures. Anything the
//! parent needs to know is shipped through an `O_CLOEXEC` pipe the
//! parent owns the read end of: the child writes one [`HardeningOutcome`]
//! record into it, execve(2) drops the write end, and the parent's
//! drain thread sees EOF and records the outcome.
use crate::dynamic::sandbox::seccomp;
use crate::dynamic::sandbox::seccomp::bpf::SockFilter;
use crate::dynamic::sandbox::{ProcessHardeningProfile, SandboxOptions};
use std::io::Read;
use std::os::unix::io::{FromRawFd, RawFd};
use std::os::unix::process::CommandExt;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::sync::{Arc, Mutex, OnceLock};
// ── HardeningLevel reporting ─────────────────────────────────────────────────
/// Coarse summary of which Phase 17 primitives applied successfully.
///
/// Produced by [`HardeningOutcome::level`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum HardeningLevel {
    /// Standard profile selected — only no-new-privs + RLIMIT_AS were
    /// installed (no Phase 17 hardening attempted).
    Baseline,
    /// At least one primitive applied and none failed.
    Full,
    /// At least one primitive applied and at least one failed (typically
    /// because the process is already inside a sandbox that disallows
    /// e.g. `unshare`).
    Partial,
    /// No primitive applied: every attempted primitive failed, or none
    /// was attempted at all. The harness ran with no Phase 17 hardening.
    None,
}

/// Per-primitive outcome captured by the child and reported back to the
/// parent.
///
/// The derived `Default` marks every primitive as
/// [`PrimitiveStatus::Skipped`] under the `Standard` profile tag.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct HardeningOutcome {
    pub no_new_privs: PrimitiveStatus,
    pub rlimit_cpu: PrimitiveStatus,
    pub rlimit_nofile: PrimitiveStatus,
    pub rlimit_as: PrimitiveStatus,
    pub unshare: PrimitiveStatus,
    pub chroot: PrimitiveStatus,
    pub seccomp: PrimitiveStatus,
    pub profile: ProcessHardeningProfileTag,
}

/// Result of attempting a single hardening primitive.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum PrimitiveStatus {
    /// Primitive was not requested by the active profile.
    #[default]
    Skipped,
    /// Primitive applied successfully.
    Applied,
    /// Primitive call returned an error; the payload is the errno value
    /// reported for the failing call.
    Failed(i32),
}

/// Profile marker carried inside a [`HardeningOutcome`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum ProcessHardeningProfileTag {
    #[default]
    Standard,
    Strict,
}

impl HardeningOutcome {
    /// Coarse summary used for the `HardeningLevel` column.
    ///
    /// - `Standard` profile: always [`HardeningLevel::Baseline`].
    /// - `Strict` profile:
    ///   - nothing applied → [`HardeningLevel::None`]. This includes the
    ///     degenerate "nothing applied, nothing failed" record, which must
    ///     never be reported as fully hardened.
    ///   - something applied, nothing failed → [`HardeningLevel::Full`].
    ///   - something applied, something failed → [`HardeningLevel::Partial`].
    pub fn level(&self) -> HardeningLevel {
        if self.profile == ProcessHardeningProfileTag::Standard {
            return HardeningLevel::Baseline;
        }
        let primitives = [
            self.no_new_privs,
            self.rlimit_cpu,
            self.rlimit_nofile,
            self.rlimit_as,
            self.unshare,
            self.chroot,
            self.seccomp,
        ];
        let any_applied = primitives
            .iter()
            .any(|status| matches!(status, PrimitiveStatus::Applied));
        let any_failed = primitives
            .iter()
            .any(|status| matches!(status, PrimitiveStatus::Failed(_)));
        match (any_applied, any_failed) {
            (false, _) => HardeningLevel::None,
            (true, false) => HardeningLevel::Full,
            (true, true) => HardeningLevel::Partial,
        }
    }
}
// ── Last outcome registry (read back by tests + telemetry) ───────────────────
/// Process-wide slot holding the most recently recorded hardening outcome.
static LAST_OUTCOME: OnceLock<Mutex<Option<HardeningOutcome>>> = OnceLock::new();

/// Lazily initialise and return the registry mutex. The slot starts out
/// empty (`None`).
fn outcome_cell() -> &'static Mutex<Option<HardeningOutcome>> {
    LAST_OUTCOME.get_or_init(Mutex::default)
}
/// Store `outcome` as the most recent hardening outcome, replacing any
/// previous value. If the registry mutex is poisoned the outcome is
/// dropped without being recorded.
fn record_outcome(outcome: HardeningOutcome) {
    let Ok(mut slot) = outcome_cell().lock() else {
        return;
    };
    *slot = Some(outcome);
}
/// Copy of the most recently recorded hardening outcome.
///
/// Returns `None` while nothing has been recorded (or after
/// [`reset_last_hardening_outcome`]), and also when the registry mutex
/// is poisoned. Tests and telemetry should read this only once the
/// outcome has settled, i.e. after [`OutcomeJoiner::await_outcome`].
pub fn last_hardening_outcome() -> Option<HardeningOutcome> {
    match outcome_cell().lock() {
        Ok(slot) => *slot,
        Err(_) => None,
    }
}
/// Clear the last-outcome slot so a value recorded by an earlier spawn
/// cannot leak into a later assertion. Does nothing if the registry
/// mutex is poisoned.
pub fn reset_last_hardening_outcome() {
    let Ok(mut slot) = outcome_cell().lock() else {
        return;
    };
    let _previous = slot.take();
}
// ── Status pipe between parent and child ─────────────────────────────────────
/// Raw fds of the pipe that carries the encoded hardening outcome from
/// the child back to the parent.
struct StatusPipe {
    /// End the child writes the outcome record into.
    write_fd: RawFd,
    /// End the parent drains.
    read_fd: RawFd,
}

impl StatusPipe {
    /// Create the pipe via `pipe2(2)` with `O_CLOEXEC` set on both ends.
    ///
    /// # Errors
    /// Returns the last OS error when `pipe2` fails.
    fn new() -> std::io::Result<Self> {
        unsafe extern "C" {
            fn pipe2(pipefd: *mut i32, flags: i32) -> i32;
        }
        const O_CLOEXEC: i32 = 0o2_000_000;

        let mut ends: [i32; 2] = [-1, -1];
        // SAFETY: `ends` is a writable two-element array, which is exactly
        // what `pipe2` fills in.
        match unsafe { pipe2(ends.as_mut_ptr(), O_CLOEXEC) } {
            0 => {
                // pipe2 stores the read end first, the write end second.
                let [read_fd, write_fd] = ends;
                Ok(Self { write_fd, read_fd })
            }
            _ => Err(std::io::Error::last_os_error()),
        }
    }
}
/// Close `fd` with `close(2)`. The return value is deliberately ignored.
fn close_fd(fd: RawFd) {
    unsafe extern "C" {
        fn close(fd: i32) -> i32;
    }
    // SAFETY: `close` takes a plain integer; an invalid descriptor only
    // makes the call fail, and that failure is ignored here.
    let _status = unsafe { close(fd) };
}
/// Read `read_fd` to EOF and decode the bytes as a [`HardeningOutcome`].
///
/// Takes ownership of `read_fd`: it is wrapped in a `File` and therefore
/// closed when this function returns. The expected payload is the
/// fixed-width record produced by [`encode_outcome`]. Returns `None` on a
/// read error or when the payload does not decode.
fn drain_outcome(read_fd: RawFd) -> Option<HardeningOutcome> {
    // SAFETY: the caller hands over ownership of `read_fd`; nothing else
    // closes it, so the `File` is its sole owner from here on.
    let mut pipe = unsafe { std::fs::File::from_raw_fd(read_fd) };
    let mut raw = Vec::with_capacity(64);
    pipe.read_to_end(&mut raw).ok()?;
    decode_outcome(&raw)
}
const OUTCOME_LEN: usize = 1 + 7 * 2;

/// Decode the fixed-width hardening outcome record.
///
/// Layout: one profile byte (`1` = Strict, anything else = Standard)
/// followed by seven `(tag, errno_low_byte)` pairs in the order
/// no_new_privs, rlimit_cpu, rlimit_nofile, rlimit_as, unshare, chroot,
/// seccomp. Tag `0` is Skipped, `1` is Applied, and any other value is
/// Failed. A Failed entry whose errno byte is `0` decodes as `Failed(-1)`.
///
/// Returns `None` when `buf` is shorter than [`OUTCOME_LEN`]; bytes past
/// the record are ignored.
fn decode_outcome(buf: &[u8]) -> Option<HardeningOutcome> {
    let record = buf.get(..OUTCOME_LEN)?;
    let (&profile_byte, pairs) = record.split_first()?;

    let profile = if profile_byte == 1 {
        ProcessHardeningProfileTag::Strict
    } else {
        ProcessHardeningProfileTag::Standard
    };

    let mut statuses = [PrimitiveStatus::Skipped; 7];
    for (slot, pair) in statuses.iter_mut().zip(pairs.chunks_exact(2)) {
        *slot = match pair[0] {
            0 => PrimitiveStatus::Skipped,
            1 => PrimitiveStatus::Applied,
            _ => PrimitiveStatus::Failed(match i32::from(pair[1]) {
                0 => -1,
                errno => errno,
            }),
        };
    }

    let [no_new_privs, rlimit_cpu, rlimit_nofile, rlimit_as, unshare, chroot, seccomp] = statuses;
    Some(HardeningOutcome {
        no_new_privs,
        rlimit_cpu,
        rlimit_nofile,
        rlimit_as,
        unshare,
        chroot,
        seccomp,
        profile,
    })
}
/// Serialise `out` into the fixed-width wire record read by
/// [`decode_outcome`].
///
/// Byte 0 is the profile (`0` = Standard, `1` = Strict). Each primitive
/// then contributes a `(tag, errno)` pair: `(0, 0)` for Skipped, `(1, 0)`
/// for Applied, and `(2, low byte of |errno|)` for Failed. Works purely on
/// a stack array, so it performs no heap allocation.
fn encode_outcome(out: &HardeningOutcome) -> [u8; OUTCOME_LEN] {
    let statuses = [
        out.no_new_privs,
        out.rlimit_cpu,
        out.rlimit_nofile,
        out.rlimit_as,
        out.unshare,
        out.chroot,
        out.seccomp,
    ];

    let mut record = [0_u8; OUTCOME_LEN];
    record[0] = match out.profile {
        ProcessHardeningProfileTag::Standard => 0,
        ProcessHardeningProfileTag::Strict => 1,
    };
    for (slot, status) in record[1..].chunks_exact_mut(2).zip(statuses) {
        let pair: [u8; 2] = match status {
            PrimitiveStatus::Skipped => [0, 0],
            PrimitiveStatus::Applied => [1, 0],
            // Only the low byte of the errno magnitude travels on the wire.
            PrimitiveStatus::Failed(e) => [2, (e.unsigned_abs() & 0xff) as u8],
        };
        slot.copy_from_slice(&pair);
    }
    record
}
// ── Primitive wrappers (called from the child's pre_exec) ────────────────────
// Resource selectors passed to `setrlimit`.
// NOTE(review): these look like the asm-generic Linux values; confirm
// they hold for every architecture this backend is built for.
const RLIMIT_CPU: i32 = 0;
const RLIMIT_NOFILE: i32 = 7;
const RLIMIT_AS: i32 = 9;
// `prctl` option used by `apply_no_new_privs`.
const PR_SET_NO_NEW_PRIVS: i32 = 38;
// Namespace flags OR-ed together by `apply_unshare`.
const CLONE_NEWNS: i32 = 0x0002_0000;
const CLONE_NEWUSER: i32 = 0x1000_0000;
const CLONE_NEWPID: i32 = 0x2000_0000;
/// Rust-side mirror of the C `struct rlimit` handed to `setrlimit`.
/// Assumes `rlim_t` is 64 bits wide — TODO confirm for all targets.
#[repr(C)]
struct Rlimit {
// Soft limit.
cur: u64,
// Hard limit.
max: u64,
}
// Raw libc entry points used by the primitive wrappers below. They are
// declared by hand rather than pulled from a bindings crate.
unsafe extern "C" {
fn setrlimit(resource: i32, rlim: *const Rlimit) -> i32;
fn prctl(option: i32, arg2: u64, arg3: u64, arg4: u64, arg5: u64) -> i32;
fn unshare(flags: i32) -> i32;
// NOTE(review): path pointers are typed `*const i8`; C `char` is unsigned
// on some targets (e.g. aarch64) — presumably ABI-compatible, confirm.
fn chroot(path: *const i8) -> i32;
fn chdir(path: *const i8) -> i32;
fn write(fd: i32, buf: *const u8, count: usize) -> isize;
// Returns the address of the calling thread's `errno`; read by `last_errno`.
fn __errno_location() -> *mut i32;
}
/// Read the calling thread's current `errno` value.
fn last_errno() -> i32 {
    // SAFETY: `__errno_location` is declared above as returning a pointer
    // to this thread's errno slot; it is dereferenced once, immediately.
    unsafe {
        let slot = __errno_location();
        *slot
    }
}
/// Set both the soft and the hard limit of `resource` to `bytes` via
/// `setrlimit`.
///
/// Despite the parameter name, the value is in whatever unit `resource`
/// uses: the strict profile also passes CPU seconds and a descriptor
/// count through here.
///
/// Returns [`PrimitiveStatus::Applied`] on success, otherwise
/// [`PrimitiveStatus::Failed`] carrying the errno.
fn apply_rlimit(resource: i32, bytes: u64) -> PrimitiveStatus {
    let limit = Rlimit { cur: bytes, max: bytes };
    // SAFETY: `limit` outlives the call and `setrlimit` only reads it.
    match unsafe { setrlimit(resource, &limit) } {
        0 => PrimitiveStatus::Applied,
        _ => PrimitiveStatus::Failed(last_errno()),
    }
}
/// Issue `prctl(PR_SET_NO_NEW_PRIVS, 1)` and report whether it succeeded.
///
/// Returns [`PrimitiveStatus::Applied`] on success, otherwise
/// [`PrimitiveStatus::Failed`] carrying the errno.
fn apply_no_new_privs() -> PrimitiveStatus {
    // SAFETY: only plain integer arguments are passed.
    match unsafe { prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) } {
        0 => PrimitiveStatus::Applied,
        _ => PrimitiveStatus::Failed(last_errno()),
    }
}
/// Detach from the user, PID and mount namespaces with one `unshare(2)`
/// call.
///
/// The three `CLONE_*` flags are OR-ed into a single bitmask, so there is
/// no ordering between them at this call site; including `CLONE_NEWUSER`
/// is what lets an unprivileged caller request the other two.
///
/// NOTE(review): per unshare(2), `CLONE_NEWPID` appears to affect only
/// children created *after* the call, not the caller itself — confirm the
/// exec'd harness ends up with the intended PID-namespace isolation.
///
/// Returns [`PrimitiveStatus::Applied`] on success, otherwise
/// [`PrimitiveStatus::Failed`] carrying the errno.
fn apply_unshare() -> PrimitiveStatus {
    const NAMESPACES: i32 = CLONE_NEWUSER | CLONE_NEWPID | CLONE_NEWNS;
    // SAFETY: `unshare` takes a plain integer bitmask.
    match unsafe { unshare(NAMESPACES) } {
        0 => PrimitiveStatus::Applied,
        _ => PrimitiveStatus::Failed(last_errno()),
    }
}
/// `chroot(2)` into `workdir`, then `chdir("/")` so the working directory
/// lies inside the new root.
///
/// `workdir` must be a NUL-terminated byte string (built by
/// `canonicalize_workdir`); it is handed to the kernel as-is, so nothing
/// is allocated here.
///
/// NOTE(review): this runs before execve, so the program path is
/// presumably resolved inside the new root — confirm the harness binary
/// is reachable from `workdir`.
///
/// Returns [`PrimitiveStatus::Failed`] with the errno of whichever call
/// failed first, otherwise [`PrimitiveStatus::Applied`].
fn apply_chroot(workdir: &[u8]) -> PrimitiveStatus {
    const NEW_ROOT: &[u8] = b"/\0";

    // SAFETY: `workdir` is NUL-terminated by contract and outlives the call.
    if unsafe { chroot(workdir.as_ptr().cast::<i8>()) } != 0 {
        return PrimitiveStatus::Failed(last_errno());
    }
    // SAFETY: `NEW_ROOT` is a static NUL-terminated literal.
    if unsafe { chdir(NEW_ROOT.as_ptr().cast::<i8>()) } != 0 {
        return PrimitiveStatus::Failed(last_errno());
    }
    PrimitiveStatus::Applied
}
/// Install an already-compiled seccomp BPF program through
/// [`seccomp::install_compiled_filter`].
///
/// `program` is built ahead of time in the parent (see `build_plan`) so
/// that this function does not have to assemble it inside `pre_exec`.
///
/// Returns [`PrimitiveStatus::Applied`] on success. On failure returns
/// [`PrimitiveStatus::Failed`] with the error's raw OS code, or `-1` when
/// the error carries none.
fn apply_seccomp(program: &[SockFilter]) -> PrimitiveStatus {
    let Err(err) = seccomp::install_compiled_filter(program) else {
        return PrimitiveStatus::Applied;
    };
    PrimitiveStatus::Failed(err.raw_os_error().unwrap_or(-1))
}
// ── Pre-exec installer ───────────────────────────────────────────────────────
/// Everything the child's `pre_exec` hook needs, computed up front in
/// the parent by `build_plan`.
#[derive(Clone)]
struct PreExecPlan {
// Value passed to `setrlimit(RLIMIT_CPU)`; `build_plan` derives it from
// the sandbox timeout.
rlimit_cpu_seconds: u64,
// Value passed to `setrlimit(RLIMIT_NOFILE)`.
rlimit_nofile: u64,
// Value passed to `setrlimit(RLIMIT_AS)`; `build_plan` derives it from
// `memory_mib`.
rlimit_as_bytes: u64,
// NUL-terminated workdir path bytes, handed straight to `chroot(2)`.
workdir_nul: Vec<u8>,
/// Pre-compiled BPF program for the requested cap-bits. Built in
/// the parent so the child's pre_exec callback never touches the
/// allocator.
seccomp_program: Arc<Vec<SockFilter>>,
// Selects between the Standard (baseline only) and Strict sequences.
profile: ProcessHardeningProfileTag,
}
/// Returned by [`install_pre_exec`]. The caller MUST invoke either
/// [`OutcomeCollector::after_spawn`] or [`OutcomeCollector::forget`]
/// after `cmd.spawn()` returns — the parent's write-fd has to close so
/// the read end sees EOF and the drain thread terminates.
///
/// The struct has no `Drop` impl, so dropping it without calling either
/// method leaves both descriptors open.
pub struct OutcomeCollector {
// Parent's copy of the pipe's write end; closed by `after_spawn` / `forget`.
write_fd: RawFd,
// Pipe read end; drained by the thread `after_spawn` starts, or closed by `forget`.
read_fd: RawFd,
}
/// Handle on the background drain thread started by
/// [`OutcomeCollector::after_spawn`].
///
/// `run_process` awaits it after `child.wait()` so the outcome is in the
/// registry before it returns; code that bypasses `run_process` can call
/// [`OutcomeJoiner::await_outcome`] directly. Dropping the handle without
/// awaiting it joins the thread as well.
pub struct OutcomeJoiner {
    handle: Option<std::thread::JoinHandle<()>>,
}

impl OutcomeJoiner {
    /// Block until the drain thread finishes recording the outcome.
    pub fn await_outcome(self) {
        // Consuming `self` runs the `Drop` impl below, which does the join.
        drop(self);
    }
}

impl Drop for OutcomeJoiner {
    /// Join the drain thread if it has not been joined yet; a panic inside
    /// the thread is swallowed.
    fn drop(&mut self) {
        let Some(worker) = self.handle.take() else {
            return;
        };
        let _ = worker.join();
    }
}
impl OutcomeCollector {
    /// Call after `cmd.spawn()` returned `Ok`.
    ///
    /// Closes the parent's copy of the write end, then starts a thread
    /// that drains the read end and stores the decoded outcome through
    /// [`record_outcome`]. The read end reaches EOF once every copy of the
    /// write end is closed; the child's copy is `O_CLOEXEC`, so execve(2)
    /// closes it. Await the returned joiner to know the outcome is settled.
    pub fn after_spawn(self) -> OutcomeJoiner {
        let Self { write_fd, read_fd } = self;
        close_fd(write_fd);

        let drain = move || {
            let Some(outcome) = drain_outcome(read_fd) else {
                return;
            };
            record_outcome(outcome);
        };
        OutcomeJoiner { handle: Some(std::thread::spawn(drain)) }
    }

    /// Call when `cmd.spawn()` failed. Closes the write end and then the
    /// read end so neither descriptor leaks; no outcome is recorded.
    pub fn forget(self) {
        for fd in [self.write_fd, self.read_fd] {
            close_fd(fd);
        }
    }
}
/// Install the Phase 17 hardening sequence on `cmd`.
///
/// Returns `Some(collector)` when the status pipe was successfully
/// created; the caller must invoke
/// [`OutcomeCollector::after_spawn`] after a successful `cmd.spawn()`.
/// Returns `None` when pipe creation itself failed (rare:
/// `EMFILE`/`ENFILE`). In that case the pre_exec hook is still
/// installed — the child still gets the full hardening sequence — but
/// the per-primitive outcome cannot be reported back to the parent.
pub fn install_pre_exec(
cmd: &mut Command,
opts: &SandboxOptions,
workdir: &Path,
) -> Option<OutcomeCollector> {
let plan = build_plan(opts, workdir);
let pipe = StatusPipe::new().ok();
let write_fd = pipe.as_ref().map(|p| p.write_fd).unwrap_or(-1);
let read_fd = pipe.as_ref().map(|p| p.read_fd);
let plan_for_child = plan.clone();
// Safety: pre_exec runs after fork(2) and before execve(2). We must
// not allocate, take any locks, or call into the Rust runtime. The
// captured `plan_for_child` is moved in; reading its already-allocated
// fields is safe because no allocator call is needed.
unsafe {
cmd.pre_exec(move || {
let outcome = run_pre_exec_in_child(&plan_for_child);
if write_fd >= 0 {
let bytes = encode_outcome(&outcome);
let _ = write(write_fd, bytes.as_ptr(), bytes.len());
// execve(2) closes write_fd via O_CLOEXEC; no manual
// close needed here.
}
Ok(())
});
}
read_fd.map(|read_fd| OutcomeCollector { write_fd, read_fd })
}
fn run_pre_exec_in_child(plan: &PreExecPlan) -> HardeningOutcome {
let mut outcome = HardeningOutcome::default();
outcome.profile = plan.profile;
// ── Always-on: PR_SET_NO_NEW_PRIVS + RLIMIT_AS ───────────────────────
outcome.no_new_privs = apply_no_new_privs();
outcome.rlimit_as = apply_rlimit(RLIMIT_AS, plan.rlimit_as_bytes);
if matches!(plan.profile, ProcessHardeningProfileTag::Standard) {
return outcome;
}
// ── Strict profile: rlimits, unshare, chroot, seccomp ────────────────
outcome.rlimit_cpu = apply_rlimit(RLIMIT_CPU, plan.rlimit_cpu_seconds);
outcome.rlimit_nofile = apply_rlimit(RLIMIT_NOFILE, plan.rlimit_nofile);
outcome.unshare = apply_unshare();
outcome.chroot = apply_chroot(&plan.workdir_nul);
// seccomp is applied last so the filter does not block any of the
// earlier syscalls (setrlimit, prctl, unshare, chroot, chdir).
outcome.seccomp = apply_seccomp(plan.seccomp_program.as_slice());
outcome
}
/// Compute, in the parent, everything the `pre_exec` hook will need.
///
/// - `RLIMIT_AS`: `max(memory_mib * 8, 4096)` MiB, in bytes (saturating).
/// - `RLIMIT_CPU`: twice the timeout in whole seconds, with the timeout
///   treated as at least 1 s.
/// - `RLIMIT_NOFILE`: fixed at 256.
/// - workdir: canonicalised, NUL-terminated bytes for `chroot(2)`.
/// - seccomp: the BPF program for `opts.seccomp_caps`, compiled here so
///   the hook (which must not allocate) can install it directly.
fn build_plan(opts: &SandboxOptions, workdir: &Path) -> PreExecPlan {
    const BYTES_PER_MIB: u64 = 1024 * 1024;

    let rlimit_as_bytes = opts
        .memory_mib
        .saturating_mul(8)
        .max(4096)
        .saturating_mul(BYTES_PER_MIB);
    let rlimit_cpu_seconds = opts.timeout.as_secs().max(1).saturating_mul(2).max(2);
    let workdir_nul = canonicalize_workdir(workdir);

    let allowed = seccomp::allowed_syscall_numbers(opts.seccomp_caps);
    let seccomp_program = Arc::new(seccomp::bpf::compile(&allowed, seccomp::syscalls::AUDIT_ARCH));

    let profile = match opts.process_hardening {
        ProcessHardeningProfile::Standard => ProcessHardeningProfileTag::Standard,
        ProcessHardeningProfile::Strict => ProcessHardeningProfileTag::Strict,
    };

    PreExecPlan {
        rlimit_cpu_seconds,
        rlimit_nofile: 256,
        rlimit_as_bytes,
        workdir_nul,
        seccomp_program,
        profile,
    }
}
/// Turn `workdir` into NUL-terminated path bytes suitable for `chroot(2)`.
///
/// The path is canonicalised when possible; if that fails (for example
/// because the path does not exist) the path is used as given. A trailing
/// NUL byte is appended unless the bytes already end in one.
fn canonicalize_workdir(workdir: &Path) -> Vec<u8> {
    let resolved: PathBuf = match std::fs::canonicalize(workdir) {
        Ok(path) => path,
        Err(_) => workdir.to_path_buf(),
    };
    let mut raw = resolved.into_os_string().into_encoded_bytes();
    if raw.last() != Some(&0) {
        raw.push(0);
    }
    raw
}
// ── Tests ────────────────────────────────────────────────────────────────────
// Unit tests for the outcome wire codec, the level summary, plan
// construction, and the last-outcome registry. None of them spawns a child.
#[cfg(test)]
mod tests {
use super::*;
// An all-Applied Strict outcome survives encode → decode and is `Full`.
#[test]
fn outcome_codec_round_trip_strict_full() {
let out = HardeningOutcome {
no_new_privs: PrimitiveStatus::Applied,
rlimit_cpu: PrimitiveStatus::Applied,
rlimit_nofile: PrimitiveStatus::Applied,
rlimit_as: PrimitiveStatus::Applied,
unshare: PrimitiveStatus::Applied,
chroot: PrimitiveStatus::Applied,
seccomp: PrimitiveStatus::Applied,
profile: ProcessHardeningProfileTag::Strict,
};
let bytes = encode_outcome(&out);
let decoded = decode_outcome(&bytes).expect("decode");
assert_eq!(decoded, out);
assert_eq!(decoded.level(), HardeningLevel::Full);
}
// Failed entries keep their (single-byte) errno across the codec, and a
// mix of Applied and Failed summarises as `Partial`.
#[test]
fn outcome_codec_round_trip_partial() {
let out = HardeningOutcome {
no_new_privs: PrimitiveStatus::Applied,
rlimit_cpu: PrimitiveStatus::Applied,
rlimit_nofile: PrimitiveStatus::Failed(13),
rlimit_as: PrimitiveStatus::Applied,
unshare: PrimitiveStatus::Failed(1),
chroot: PrimitiveStatus::Failed(13),
seccomp: PrimitiveStatus::Applied,
profile: ProcessHardeningProfileTag::Strict,
};
let bytes = encode_outcome(&out);
let decoded = decode_outcome(&bytes).expect("decode");
assert_eq!(decoded, out);
assert_eq!(decoded.level(), HardeningLevel::Partial);
}
// The Standard profile always reports `Baseline`.
#[test]
fn standard_profile_reports_baseline_level() {
let out = HardeningOutcome {
no_new_privs: PrimitiveStatus::Applied,
rlimit_as: PrimitiveStatus::Applied,
profile: ProcessHardeningProfileTag::Standard,
..HardeningOutcome::default()
};
assert_eq!(out.level(), HardeningLevel::Baseline);
}
// The workdir bytes end in NUL and default options map to Standard.
#[test]
fn build_plan_pads_workdir_with_nul() {
let opts = SandboxOptions::default();
let plan = build_plan(&opts, std::path::Path::new("/tmp"));
assert!(plan.workdir_nul.ends_with(&[0]));
assert_eq!(plan.profile, ProcessHardeningProfileTag::Standard);
}
// A Strict plan carries a non-trivial pre-compiled seccomp program.
#[test]
fn build_plan_strict_compiles_seccomp_program() {
let opts = SandboxOptions {
seccomp_caps: 0xff,
process_hardening: ProcessHardeningProfile::Strict,
..SandboxOptions::default()
};
let plan = build_plan(&opts, std::path::Path::new("/tmp"));
// The arch check + ld nr + KILL + ALLOW alone are 5 instructions;
// the BASE allowlist adds dozens more.
assert!(plan.seccomp_program.len() > 5, "BPF program too small: {}", plan.seccomp_program.len());
assert_eq!(plan.profile, ProcessHardeningProfileTag::Strict);
}
// A tiny memory cap is raised to the 4096 MiB floor.
#[test]
fn rlimit_as_bytes_floors_at_4_gib() {
let opts = SandboxOptions { memory_mib: 1, ..SandboxOptions::default() };
let plan = build_plan(&opts, std::path::Path::new("/tmp"));
assert_eq!(plan.rlimit_as_bytes, 4096_u64 * 1024 * 1024);
}
// Above the floor the limit is eight times the configured cap.
#[test]
fn rlimit_as_bytes_scales_with_memory_mib() {
let opts = SandboxOptions { memory_mib: 1024, ..SandboxOptions::default() };
let plan = build_plan(&opts, std::path::Path::new("/tmp"));
// 1024 MiB * 8 = 8192 MiB
assert_eq!(plan.rlimit_as_bytes, 8192_u64 * 1024 * 1024);
}
// Buffers shorter than OUTCOME_LEN are rejected rather than partially decoded.
#[test]
fn truncated_buffer_decodes_to_none() {
assert!(decode_outcome(&[]).is_none());
assert!(decode_outcome(&[0_u8; OUTCOME_LEN - 1]).is_none());
}
// Exercises record → read → reset on the registry. It saves whatever
// value was there before and puts it back at the end.
#[test]
fn record_and_reset_round_trip() {
let original = last_hardening_outcome();
let probe = HardeningOutcome {
no_new_privs: PrimitiveStatus::Applied,
profile: ProcessHardeningProfileTag::Strict,
..HardeningOutcome::default()
};
record_outcome(probe);
assert_eq!(last_hardening_outcome(), Some(probe));
reset_last_hardening_outcome();
assert!(last_hardening_outcome().is_none());
if let Some(prev) = original {
record_outcome(prev);
}
}
}

View file

@ -0,0 +1,173 @@
//! Hand-rolled BPF program emitter for seccomp filters.
//!
//! BPF instruction format from `<linux/filter.h>`:
//!
//! ```text
//! struct sock_filter { u16 code; u8 jt; u8 jf; u32 k; }
//! ```
//!
//! Only the ops Nyx needs to implement an AUDIT_ARCH check + per-syscall
//! allowlist are defined. The output array is fed straight into
//! `prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &program)`.
/// One classic-BPF instruction; mirrors `struct sock_filter` from
/// `<linux/filter.h>`.
#[repr(C)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SockFilter {
    /// Opcode.
    pub code: u16,
    /// Instructions to skip when a conditional jump is taken.
    pub jt: u8,
    /// Instructions to skip when a conditional jump is not taken.
    pub jf: u8,
    /// Immediate operand (load offset, comparison value, or return action).
    pub k: u32,
}

/// Program descriptor passed to the kernel; mirrors `struct sock_fprog`.
#[repr(C)]
pub struct SockFprog {
    /// Number of instructions in `filter`.
    pub len: u16,
    /// Pointer to the first instruction.
    pub filter: *const SockFilter,
}

// BPF opcode constants — see `linux/bpf_common.h`.
pub const BPF_LD: u16 = 0x00;
pub const BPF_W: u16 = 0x00;
pub const BPF_ABS: u16 = 0x20;
pub const BPF_JMP: u16 = 0x05;
pub const BPF_JEQ: u16 = 0x10;
pub const BPF_K: u16 = 0x00;
pub const BPF_RET: u16 = 0x06;

// seccomp action constants — see `linux/seccomp.h`.
pub const SECCOMP_RET_KILL_PROCESS: u32 = 0x8000_0000;
pub const SECCOMP_RET_KILL: u32 = 0x0000_0000;
pub const SECCOMP_RET_ALLOW: u32 = 0x7fff_0000;
pub const SECCOMP_RET_ERRNO: u32 = 0x0005_0000;

// Offsets into `struct seccomp_data` from `linux/seccomp.h`:
//   nr   (s32) at offset 0
//   arch (u32) at offset 4
pub const SECCOMP_DATA_NR: u32 = 0;
pub const SECCOMP_DATA_ARCH: u32 = 4;

/// Emit a BPF program implementing:
///
/// 1. Load `arch` from `seccomp_data`; if it does not match
///    `audit_arch`, return KILL_PROCESS.
/// 2. Load `nr` from `seccomp_data`.
/// 3. For each entry of `allowed_nrs`, return ALLOW on a match.
/// 4. Default: return KILL_PROCESS.
///
/// # Layout
///
/// Conditional jumps carry 8-bit offsets, so a jump can skip at most 255
/// instructions. When every offset fits (up to 254 allowed syscalls) the
/// program is:
///
/// ```text
/// ld arch; jeq arch ? next : KILL; ld nr; jeq nr.. ? ALLOW : next; ret KILL; ret ALLOW
/// ```
///
/// which is `5 + allowed_nrs.len()` instructions. For longer lists the
/// offsets would overflow, so:
///
/// - the arch check gets its own adjacent `ret KILL`, and
/// - every check that is too far from the shared `ret ALLOW` (all but the
///   last 255) is followed by its own `ret ALLOW`.
///
/// The filter's decisions are the same in both layouts. Linux caps
/// seccomp programs at 4096 instructions; this function does not enforce
/// that limit.
pub fn compile(allowed_nrs: &[u32], audit_arch: u32) -> Vec<SockFilter> {
    /// Largest skip an 8-bit `jt`/`jf` field can express.
    const MAX_SKIP: usize = u8::MAX as usize;

    let load = |offset: u32| SockFilter { code: BPF_LD | BPF_W | BPF_ABS, jt: 0, jf: 0, k: offset };
    let jeq = |k: u32, jt: u8, jf: u8| SockFilter { code: BPF_JMP | BPF_JEQ | BPF_K, jt, jf, k };
    let ret = |action: u32| SockFilter { code: BPF_RET | BPF_K, jt: 0, jf: 0, k: action };

    // The trailing `near` checks can reach the shared ALLOW return with an
    // 8-bit jump; the leading `far` checks cannot and get a local one.
    let near_len = allowed_nrs.len().min(MAX_SKIP);
    let (far, near) = allowed_nrs.split_at(allowed_nrs.len() - near_len);

    let mut program: Vec<SockFilter> = Vec::with_capacity(far.len() * 2 + near.len() + 6);

    // ld [arch]
    program.push(load(SECCOMP_DATA_ARCH));

    // jeq audit_arch ? continue : KILL
    // On mismatch we must skip `ld nr` plus every check to reach the shared
    // KILL return. If that distance does not fit in `jf`, emit a local KILL
    // right after the check instead of silently truncating the offset.
    let skip_to_kill = 1 + far.len() * 2 + near.len();
    match u8::try_from(skip_to_kill) {
        Ok(jf) => program.push(jeq(audit_arch, 0, jf)),
        Err(_) => {
            program.push(jeq(audit_arch, 1, 0));
            program.push(ret(SECCOMP_RET_KILL_PROCESS));
        }
    }

    // ld [nr]
    program.push(load(SECCOMP_DATA_NR));

    // Far checks: `jeq nr ? fallthrough-to-ALLOW : skip-ALLOW`.
    for &nr in far {
        program.push(jeq(nr, 0, 1));
        program.push(ret(SECCOMP_RET_ALLOW));
    }

    // Near checks: on match, skip the remaining checks and the KILL return
    // to land on the shared ALLOW return.
    for (remaining, &nr) in (1..=near.len()).rev().zip(near) {
        let jt = u8::try_from(remaining).expect("near checks are capped at 255 entries");
        program.push(jeq(nr, jt, 0));
    }

    // Shared tail: default-deny, then the ALLOW target.
    program.push(ret(SECCOMP_RET_KILL_PROCESS));
    program.push(ret(SECCOMP_RET_ALLOW));
    program
}
// Layout tests for `compile`: they pin instruction counts and jump
// offsets for small allowlists. `0xc000_003e` is the audit-arch value
// used throughout.
#[cfg(test)]
mod tests {
use super::*;
// With no allowed syscalls the program is just the fixed five instructions.
#[test]
fn empty_allowlist_emits_arch_check_and_kill() {
let prog = compile(&[], 0xc000_003e);
// ld arch, jeq audit_arch, ld nr, ret KILL, ret ALLOW
assert_eq!(prog.len(), 5);
assert_eq!(prog[0].k, SECCOMP_DATA_ARCH);
assert_eq!(prog[1].k, 0xc000_003e);
assert_eq!(prog[2].k, SECCOMP_DATA_NR);
assert_eq!(prog[3].k, SECCOMP_RET_KILL_PROCESS);
assert_eq!(prog[4].k, SECCOMP_RET_ALLOW);
}
// One allowed syscall adds a single jeq whose taken branch skips KILL.
#[test]
fn single_syscall_allows_its_nr() {
let prog = compile(&[42], 0xc000_003e);
// ld arch, jeq audit_arch, ld nr, jeq 42, ret KILL, ret ALLOW
assert_eq!(prog.len(), 6);
let jeq = prog[3];
assert_eq!(jeq.code, BPF_JMP | BPF_JEQ | BPF_K);
assert_eq!(jeq.k, 42);
// jt jumps over the KILL ret (1 inst) to land on ALLOW
assert_eq!(jeq.jt, 1);
assert_eq!(prog[4].k, SECCOMP_RET_KILL_PROCESS);
assert_eq!(prog[5].k, SECCOMP_RET_ALLOW);
}
// Each successive check is one instruction closer to ALLOW.
#[test]
fn multi_syscall_jt_offsets_chain_to_allow() {
let prog = compile(&[1, 2, 3], 0xc000_003e);
// ld arch, jeq audit_arch, ld nr, jeq 1, jeq 2, jeq 3, KILL, ALLOW
assert_eq!(prog.len(), 8);
// jeq 1 at idx 3 → ALLOW at idx 7 → jt=7-3-1=3
assert_eq!(prog[3].jt, 3);
// jeq 2 at idx 4 → jt=7-4-1=2
assert_eq!(prog[4].jt, 2);
// jeq 3 at idx 5 → jt=7-5-1=1
assert_eq!(prog[5].jt, 1);
}
// The arch check's not-taken branch must land on the KILL return.
#[test]
fn arch_mismatch_jumps_to_kill() {
let prog = compile(&[1, 2], 0xc000_003e);
// ld arch (0), jeq arch (1), ld nr (2), jeq 1 (3), jeq 2 (4), KILL (5), ALLOW (6)
// arch jeq jf must point to KILL → jf=5-1-1=3
assert_eq!(prog[1].jf, 3);
assert_eq!(prog[5].k, SECCOMP_RET_KILL_PROCESS);
}
}

View file

@ -0,0 +1,179 @@
//! Phase 17 (Track E.1) — seccomp-bpf default-deny filter.
//!
//! [`apply_for_caps`] composes the cap-tagged allowlist baked from
//! `seccomp_policy.toml` (via `build.rs`) into a BPF program and installs
//! it via `prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &sock_fprog)`, where
//! the `SockFprog` header points at the compiled instructions. The
//! filter applies to the calling thread and is preserved across `execve`,
//! so the harness runs under it from the very first instruction of its
//! image.
//!
//! Layout
//! ------
//! - `seccomp_policy.toml` — declarative cap → syscall table (the source
//! of truth). `build.rs` parses it and emits an inline-includable Rust
//! table to `OUT_DIR/seccomp_policy.rs`.
//! - `bpf.rs` — minimal BPF instruction emitter (`compile()` returns a
//! `Vec<SockFilter>`).
//! - `syscalls.rs` — name → number map, x86_64 / aarch64.
//!
//! Design choices
//! --------------
//! - Default action is `SECCOMP_RET_KILL_PROCESS` so a denied syscall
//! takes the whole harness down (loud failure, easy to tell apart from
//! a normal sink hit).
//! - Unknown syscall names from the policy are silently dropped — they
//! can't be filtered without a number, and any kernel that recognises
//! the name has the number too. Tests assert the policy round-trips.
pub mod bpf;
pub mod syscalls;
use std::collections::BTreeSet;
use crate::dynamic::sandbox::seccomp::bpf::{compile, SockFilter, SockFprog};
use crate::dynamic::sandbox::seccomp::syscalls::{syscall_number, AUDIT_ARCH};
include!(concat!(env!("OUT_DIR"), "/seccomp_policy.rs"));
/// `prctl` option that sets the caller's no-new-privileges bit; passed
/// with argument `1` before the filter is installed.
const PR_SET_NO_NEW_PRIVS: i32 = 38;
/// `prctl` option that selects the seccomp mode of the calling thread.
const PR_SET_SECCOMP: i32 = 22;
/// Mode argument for [`PR_SET_SECCOMP`] requesting a BPF filter program.
const SECCOMP_MODE_FILTER: u64 = 2;
// Raw libc symbols, declared here instead of pulling them in through a
// bindings crate.
// NOTE(review): C declares `prctl` as variadic; this fixed five-argument
// form relies on the calling conventions matching — confirm for any newly
// supported target.
unsafe extern "C" {
// Process-control entry point used for both prctl calls in this module.
fn prctl(option: i32, arg2: u64, arg3: u64, arg4: u64, arg5: u64) -> i32;
// Returns a pointer to the calling thread's `errno`; read after a failed
// `prctl` to build the `io::Error`.
fn __errno_location() -> *mut i32;
}
/// Build the sorted, duplicate-free list of syscall numbers permitted for
/// the given cap bits.
///
/// Every name in `BASE` is always included; the names of a `CAP` entry are
/// added when that entry's bit is set in `caps`. Names are collected into
/// a `BTreeSet` first and then translated with [`syscall_number`]; names
/// the per-arch table does not know are silently skipped. The resulting
/// numbers are sorted and deduplicated once more, since distinct names may
/// resolve to the same number.
pub fn allowed_syscall_numbers(caps: u32) -> Vec<u32> {
    // Names contributed by every cap whose bit is present in `caps`.
    let granted = CAP
        .iter()
        .filter(|entry| caps & entry.0 != 0)
        .flat_map(|entry| entry.1.iter().copied());

    let names: BTreeSet<&'static str> = BASE.iter().copied().chain(granted).collect();

    let mut numbers: Vec<u32> = names.into_iter().filter_map(syscall_number).collect();
    numbers.sort_unstable();
    numbers.dedup();
    numbers
}
/// Install a pre-compiled seccomp filter on the calling thread.
///
/// `program` MUST come from [`bpf::compile`]. Calls
/// `prctl(PR_SET_NO_NEW_PRIVS)` first (a kernel prerequisite for
/// unprivileged seccomp filter install) then
/// `prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog)`.
///
/// Returns `Ok(())` without installing anything when the target has no
/// audit arch (`AUDIT_ARCH == 0`) or `program` is empty.
///
/// # Errors
///
/// - `ErrorKind::InvalidInput` when `program` has more instructions than
///   the `u16` length field of `SockFprog` can express. Previously the
///   length was truncated with `as u16`, which would have handed the
///   kernel a shorter filter than the one that was compiled.
/// - The OS error reported through `errno` when the
///   `prctl(PR_SET_SECCOMP, ...)` call fails.
///
/// Allocator-free: the function only borrows `program` and every error
/// it constructs is a plain kind / raw OS code, so the hardening
/// pre_exec callback can use it without violating the post-fork
/// allocator ban.
pub fn install_compiled_filter(program: &[SockFilter]) -> std::io::Result<()> {
    if AUDIT_ARCH == 0 || program.is_empty() {
        return Ok(());
    }

    // Refuse oversized programs instead of wrapping the length: a wrapped
    // `len` would install only a prefix of the intended filter.
    let Ok(len) = u16::try_from(program.len()) else {
        return Err(std::io::Error::from(std::io::ErrorKind::InvalidInput));
    };

    // PR_SET_NO_NEW_PRIVS = 1 is a kernel prerequisite for unprivileged
    // seccomp filter install. The Phase 17 hardening sequence already
    // calls it earlier, but installing here too is idempotent and
    // protects direct callers. The result is deliberately ignored: if the
    // bit could not be set, the filter install below reports the failure.
    //
    // SAFETY: `prctl` is called with integer arguments only; no memory is
    // passed to the kernel.
    let _ = unsafe { prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };

    let prog = SockFprog {
        len,
        filter: program.as_ptr(),
    };

    // SAFETY: `prog` lives on this stack frame for the duration of the
    // call and `prog.filter` points at `len` initialised `SockFilter`
    // values borrowed from `program`.
    let ret = unsafe {
        prctl(
            PR_SET_SECCOMP,
            SECCOMP_MODE_FILTER,
            &prog as *const SockFprog as u64,
            0,
            0,
        )
    };

    if ret == 0 {
        Ok(())
    } else {
        // SAFETY: `__errno_location` returns a pointer to the calling
        // thread's `errno`, which is valid to read right after the
        // failed call above.
        Err(std::io::Error::from_raw_os_error(unsafe {
            *__errno_location()
        }))
    }
}
/// One-shot helper for callers that have no pre-compiled program: resolve
/// the allowlist for `caps` with [`allowed_syscall_numbers`], compile it
/// for [`AUDIT_ARCH`] and hand the result to [`install_compiled_filter`].
///
/// On targets where [`AUDIT_ARCH`] is `0` nothing is compiled or
/// installed and `Ok(())` is returned.
pub fn apply_for_caps(caps: u32) -> std::io::Result<()> {
    if AUDIT_ARCH == 0 {
        // No audit arch for this target: nothing to filter on.
        Ok(())
    } else {
        let allowed = allowed_syscall_numbers(caps);
        let filter: Vec<SockFilter> = compile(&allowed, AUDIT_ARCH);
        install_compiled_filter(&filter)
    }
}
// ── Tests ────────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;

    /// The generated `BASE` table must never be empty.
    #[test]
    fn base_table_is_non_empty() {
        assert!(!BASE.is_empty(), "seccomp BASE allowlist must include stdio + startup syscalls");
    }

    /// The generated `CAP` table must carry the per-cap entries.
    #[test]
    fn cap_table_includes_known_caps() {
        // We declared SQL_QUERY, FILE_IO, SSRF, CODE_EXEC, HTML_ESCAPE,
        // DESERIALIZE, HEADER_INJECTION, OPEN_REDIRECT in the toml; the
        // build script emits one entry per `[cap.X]` table. The exact
        // count can grow as the policy grows; assert ≥ 4 so a future
        // accidental empty-policy regression is loud.
        assert!(CAP.len() >= 4, "CAP table emitted: {:?}", CAP.len());
    }

    /// Caps whose allowlists overlap must not produce duplicate numbers.
    #[test]
    fn allowlist_deduplicates_overlapping_caps() {
        // SSRF and HEADER_INJECTION both allow `socket`. Request every cap
        // bit so the overlapping tables are actually merged (with `0` only
        // `BASE` is consulted and no overlap is exercised).
        let nrs = allowed_syscall_numbers(u32::MAX);
        let mut sorted = nrs.clone();
        sorted.sort_unstable();
        sorted.dedup();
        assert_eq!(nrs.len(), sorted.len(), "allowlist contains duplicate syscall numbers");

        // The deduped set must contain `socket` exactly once. Skipped on
        // targets without a syscall table.
        if let Some(socket) = syscall_number("socket") {
            let hits = nrs.iter().filter(|&&nr| nr == socket).count();
            assert_eq!(hits, 1, "`socket` must appear exactly once");
        }
    }

    /// With no cap bits set the allowlist is exactly the resolved `BASE`
    /// table, and setting cap bits can only ever add to it.
    #[test]
    fn caps_zero_returns_only_base() {
        let base = allowed_syscall_numbers(0);
        let with_caps = allowed_syscall_numbers(0xffff_ffff);
        assert!(base.len() <= with_caps.len());
        assert!(
            base.iter().all(|nr| with_caps.contains(nr)),
            "every base syscall must stay allowed when caps are set"
        );

        let mut expected: Vec<u32> = BASE.iter().copied().filter_map(syscall_number).collect();
        expected.sort_unstable();
        expected.dedup();
        assert_eq!(base, expected, "caps == 0 must resolve to BASE only");
    }

    /// `BASE` includes `read` / `write` / `close` — the minimum the
    /// harness needs to print to stdout and exit cleanly.
    #[test]
    fn base_allows_stdio() {
        let nrs = allowed_syscall_numbers(0);
        let read = syscall_number("read").expect("read in syscall map");
        let write = syscall_number("write").expect("write in syscall map");
        let close = syscall_number("close").expect("close in syscall map");
        assert!(nrs.contains(&read));
        assert!(nrs.contains(&write));
        assert!(nrs.contains(&close));
    }
}

View file

@ -0,0 +1,216 @@
# Phase 17 (Track E.1) — seccomp-bpf default-deny allowlist.
#
# Format
# ------
# Each `[base]` syscall is allowed unconditionally (every harness needs
# them for stdio + interpreter / runtime startup). Each `[cap.<NAME>]`
# table adds syscalls allowed only when that `Cap` bit is set in
# `SandboxOptions::seccomp_caps`. Unknown / unset caps fall back to the
# base list, so a finding with no cap-aware needs runs with the strictest
# possible filter.
#
# `<NAME>` must match a `Cap::*` const declared in `src/labels/mod.rs`.
# The list of known names is mirrored in `build.rs::CAP_BIT_FOR_NAME`;
# add the bit value alongside the const when extending [`Cap`].
#
# Build-time codegen
# ------------------
# `build.rs` reads this file and emits `OUT_DIR/seccomp_policy.rs`
# containing two static tables: `BASE` (a list of syscall names) and
# `CAP` (a list of `(cap bit, syscall names)` pairs, one per `[cap.X]`
# table). Runtime then maps the syscall names to x86_64 / aarch64 numbers
# via `syscalls.rs` and compiles one BPF program for the requested cap
# bits.
[base]
allow = [
"read",
"write",
"writev",
"readv",
"close",
"fstat",
"lseek",
"lstat",
"stat",
"newfstatat",
"statx",
"mmap",
"mremap",
"munmap",
"brk",
"rt_sigaction",
"rt_sigreturn",
"rt_sigprocmask",
"sigaltstack",
"exit",
"exit_group",
"futex",
"set_robust_list",
"get_robust_list",
"getrandom",
"getpid",
"gettid",
"getuid",
"geteuid",
"getgid",
"getegid",
"clock_gettime",
"clock_getres",
"clock_nanosleep",
"nanosleep",
"ioctl",
"fcntl",
"dup",
"dup2",
"dup3",
"pipe",
"pipe2",
"uname",
"arch_prctl",
"prlimit64",
"getrlimit",
"set_tid_address",
"rseq",
"madvise",
"mprotect",
"epoll_create1",
"epoll_ctl",
"epoll_wait",
"epoll_pwait",
"poll",
"ppoll",
"select",
"pselect6",
"wait4",
"waitid",
"tgkill",
"kill",
"openat",
"open",
"access",
"faccessat",
"faccessat2",
"readlink",
"readlinkat",
"getcwd",
"getdents",
"getdents64",
"sched_getaffinity",
"sched_setaffinity",
"sched_yield",
"prctl",
"membarrier",
]
[cap.SQL_QUERY]
# SQLite / driver paths use lock + truncate + sync ops on top of the base
# openat / read / write set.
allow = [
"fdatasync",
"fsync",
"fallocate",
"ftruncate",
"flock",
"pread64",
"pwrite64",
]
[cap.FILE_IO]
# File reads + directory walks need the dirfd / xattr / link family on
# top of the base set.
allow = [
"pread64",
"pwrite64",
"readlinkat",
"linkat",
"symlinkat",
"unlinkat",
"mkdirat",
"renameat",
"renameat2",
"utimensat",
"fchmod",
"fchown",
"fchmodat",
"fchownat",
"getxattr",
"fgetxattr",
"lgetxattr",
"listxattr",
"flistxattr",
"llistxattr",
"copy_file_range",
"sendfile",
]
[cap.SSRF]
# Outbound HTTP needs the socket / connect / TLS handshake set.
allow = [
"socket",
"connect",
"sendto",
"recvfrom",
"sendmsg",
"recvmsg",
"shutdown",
"getsockname",
"getpeername",
"getsockopt",
"setsockopt",
"bind",
"listen",
"accept",
"accept4",
]
[cap.CODE_EXEC]
# `subprocess.run(...)` / `os.system(...)` payloads need fork + exec.
allow = [
"clone",
"clone3",
"fork",
"vfork",
"execve",
"execveat",
"wait4",
"waitid",
]
[cap.HTML_ESCAPE]
# Pure-CPU sanitizer paths need only the base set; this entry exists so
# the build-time codegen sees the cap and emits an explicit table even
# when the allowlist is empty.
allow = []
[cap.DESERIALIZE]
# pickle / Marshal / unserialize paths typically only need the base I/O
# set; codegen-only entry.
allow = []
[cap.HEADER_INJECTION]
# CRLF-sensitive header sinks share the SSRF socket family.
allow = [
"socket",
"connect",
"sendto",
"recvfrom",
"sendmsg",
"recvmsg",
"getsockname",
"getpeername",
"getsockopt",
"setsockopt",
]
[cap.OPEN_REDIRECT]
# Same client-side socket family as HEADER_INJECTION: create / connect /
# send / receive plus socket introspection. No `shutdown`, `bind`,
# `listen` or `accept`.
allow = [
"socket",
"connect",
"sendto",
"recvfrom",
"sendmsg",
"recvmsg",
"getsockname",
"getpeername",
"getsockopt",
"setsockopt",
]

View file

@ -0,0 +1,291 @@
//! Syscall name → number map for the architectures Nyx's Linux process
//! backend supports. Only the names referenced by
//! `seccomp_policy.toml` need to be present; unknown names are silently
//! dropped from the BPF allowlist (they cannot be filtered if they have
//! no number).
//!
//! Numbers are pulled from `<asm/unistd_64.h>` (x86_64) and
//! `<asm-generic/unistd.h>` (aarch64). When a syscall exists on one
//! arch but not the other (e.g. `arch_prctl` on aarch64), the entry is
//! omitted on the missing arch and the seccomp filter naturally falls
//! through to the deny rule there.
/// Resolve a syscall name to its x86_64 number.
///
/// Returns `None` for any name that is not listed in the table below;
/// matching is exact and case-sensitive.
#[cfg(target_arch = "x86_64")]
pub fn syscall_number(name: &str) -> Option<u32> {
    // (name, number) pairs, kept in ascending numeric order.
    const TABLE: &[(&str, u32)] = &[
        ("read", 0),
        ("write", 1),
        ("open", 2),
        ("close", 3),
        ("stat", 4),
        ("fstat", 5),
        ("lstat", 6),
        ("poll", 7),
        ("lseek", 8),
        ("mmap", 9),
        ("mprotect", 10),
        ("munmap", 11),
        ("brk", 12),
        ("rt_sigaction", 13),
        ("rt_sigprocmask", 14),
        ("rt_sigreturn", 15),
        ("ioctl", 16),
        ("pread64", 17),
        ("pwrite64", 18),
        ("readv", 19),
        ("writev", 20),
        ("access", 21),
        ("pipe", 22),
        ("select", 23),
        ("sched_yield", 24),
        ("mremap", 25),
        ("madvise", 28),
        ("dup", 32),
        ("dup2", 33),
        ("nanosleep", 35),
        ("getpid", 39),
        ("sendfile", 40),
        ("socket", 41),
        ("connect", 42),
        ("accept", 43),
        ("sendto", 44),
        ("recvfrom", 45),
        ("sendmsg", 46),
        ("recvmsg", 47),
        ("shutdown", 48),
        ("bind", 49),
        ("listen", 50),
        ("getsockname", 51),
        ("getpeername", 52),
        ("setsockopt", 54),
        ("getsockopt", 55),
        ("clone", 56),
        ("fork", 57),
        ("vfork", 58),
        ("execve", 59),
        ("exit", 60),
        ("wait4", 61),
        ("kill", 62),
        ("uname", 63),
        ("fcntl", 72),
        ("flock", 73),
        ("fsync", 74),
        ("fdatasync", 75),
        ("ftruncate", 77),
        ("getdents", 78),
        ("getcwd", 79),
        ("readlink", 89),
        ("fchmod", 91),
        ("fchown", 93),
        ("getrlimit", 97),
        ("getuid", 102),
        ("getgid", 104),
        ("geteuid", 107),
        ("getegid", 108),
        ("sigaltstack", 131),
        ("prctl", 157),
        ("arch_prctl", 158),
        ("gettid", 186),
        ("getxattr", 191),
        ("lgetxattr", 192),
        ("fgetxattr", 193),
        ("listxattr", 194),
        ("llistxattr", 195),
        ("flistxattr", 196),
        ("futex", 202),
        ("sched_setaffinity", 203),
        ("sched_getaffinity", 204),
        ("epoll_create", 213),
        ("getdents64", 217),
        ("set_tid_address", 218),
        ("fadvise64", 221),
        ("clock_gettime", 228),
        ("clock_getres", 229),
        ("clock_nanosleep", 230),
        ("exit_group", 231),
        ("epoll_wait", 232),
        ("epoll_ctl", 233),
        ("tgkill", 234),
        ("waitid", 247),
        ("openat", 257),
        ("mkdirat", 258),
        ("fchownat", 260),
        ("newfstatat", 262),
        ("unlinkat", 263),
        ("renameat", 264),
        ("linkat", 265),
        ("symlinkat", 266),
        ("readlinkat", 267),
        ("fchmodat", 268),
        ("faccessat", 269),
        ("pselect6", 270),
        ("ppoll", 271),
        ("set_robust_list", 273),
        ("get_robust_list", 274),
        ("utimensat", 280),
        ("epoll_pwait", 281),
        ("fallocate", 285),
        ("accept4", 288),
        ("epoll_create1", 291),
        ("dup3", 292),
        ("pipe2", 293),
        ("prlimit64", 302),
        ("renameat2", 316),
        ("getrandom", 318),
        ("execveat", 322),
        ("membarrier", 324),
        ("copy_file_range", 326),
        ("statx", 332),
        ("rseq", 334),
        ("clone3", 435),
        ("faccessat2", 439),
    ];

    TABLE
        .iter()
        .find(|entry| entry.0 == name)
        .map(|entry| entry.1)
}
/// Resolve a syscall name to its aarch64 number (generic numbering from
/// `asm-generic/unistd.h`).
///
/// Returns `None` for any name that is not listed in the table below;
/// matching is exact and case-sensitive. `epoll_create` and `epoll_wait`
/// are listed as aliases of `epoll_create1` and `epoll_pwait`.
#[cfg(target_arch = "aarch64")]
pub fn syscall_number(name: &str) -> Option<u32> {
    // (name, number) pairs, kept in ascending numeric order.
    const TABLE: &[(&str, u32)] = &[
        ("io_setup", 0),
        ("getxattr", 8),
        ("lgetxattr", 9),
        ("fgetxattr", 10),
        ("listxattr", 11),
        ("llistxattr", 12),
        ("flistxattr", 13),
        ("getcwd", 17),
        ("lookup_dcookie", 18),
        ("eventfd2", 19),
        ("epoll_create1", 20),
        ("epoll_create", 20), // alias to epoll_create1 on generic
        ("epoll_ctl", 21),
        ("epoll_pwait", 22),
        ("epoll_wait", 22), // alias to epoll_pwait on generic
        ("dup", 23),
        ("dup3", 24),
        ("fcntl", 25),
        ("ioctl", 29),
        ("flock", 32),
        ("mkdirat", 34),
        ("unlinkat", 35),
        ("symlinkat", 36),
        ("linkat", 37),
        ("renameat", 38),
        ("ftruncate", 46),
        ("fallocate", 47),
        ("faccessat", 48),
        ("chdir", 49),
        ("fchmod", 52),
        ("fchmodat", 53),
        ("fchownat", 54),
        ("fchown", 55),
        ("openat", 56),
        ("close", 57),
        ("pipe2", 59),
        ("getdents64", 61),
        ("lseek", 62),
        ("read", 63),
        ("write", 64),
        ("readv", 65),
        ("writev", 66),
        ("pread64", 67),
        ("pwrite64", 68),
        ("sendfile", 71),
        ("pselect6", 72),
        ("ppoll", 73),
        ("readlinkat", 78),
        ("newfstatat", 79),
        ("fstat", 80),
        ("fsync", 82),
        ("fdatasync", 83),
        ("utimensat", 88),
        ("exit", 93),
        ("exit_group", 94),
        ("waitid", 95),
        ("set_tid_address", 96),
        ("futex", 98),
        ("set_robust_list", 99),
        ("get_robust_list", 100),
        ("nanosleep", 101),
        ("clock_gettime", 113),
        ("clock_getres", 114),
        ("clock_nanosleep", 115),
        ("sched_setaffinity", 122),
        ("sched_getaffinity", 123),
        ("sched_yield", 124),
        ("kill", 129),
        ("tgkill", 131),
        ("sigaltstack", 132),
        ("rt_sigsuspend", 133),
        ("rt_sigaction", 134),
        ("rt_sigprocmask", 135),
        ("rt_sigtimedwait", 137),
        ("rt_sigreturn", 139),
        ("uname", 160),
        ("getrlimit", 163),
        ("prctl", 167),
        ("getpid", 172),
        ("getuid", 174),
        ("geteuid", 175),
        ("getgid", 176),
        ("getegid", 177),
        ("gettid", 178),
        ("socket", 198),
        ("bind", 200),
        ("listen", 201),
        ("accept", 202),
        ("connect", 203),
        ("getsockname", 204),
        ("getpeername", 205),
        ("sendto", 206),
        ("recvfrom", 207),
        ("setsockopt", 208),
        ("getsockopt", 209),
        ("shutdown", 210),
        ("sendmsg", 211),
        ("recvmsg", 212),
        ("brk", 214),
        ("munmap", 215),
        ("mremap", 216),
        ("clone", 220),
        ("execve", 221),
        ("mmap", 222),
        ("fadvise64", 223),
        ("mprotect", 226),
        ("msync", 227),
        ("mlock", 228),
        ("munlock", 229),
        ("madvise", 233),
        ("accept4", 242),
        ("wait4", 260),
        ("prlimit64", 261),
        ("renameat2", 276),
        ("getrandom", 278),
        ("execveat", 281),
        ("membarrier", 283),
        ("copy_file_range", 285),
        ("statx", 291),
        ("rseq", 293),
        ("clone3", 435),
        ("openat2", 437),
        ("faccessat2", 439),
        ("epoll_pwait2", 441),
    ];

    TABLE
        .iter()
        .find(|entry| entry.0 == name)
        .map(|entry| entry.1)
}
/// Fallback for targets without a syscall table (anything other than
/// x86_64 / aarch64): no name resolves, so every lookup yields `None`.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
pub fn syscall_number(_name: &str) -> Option<u32> {
None
}
/// Audit architecture identifier for the compilation target; this is the
/// value the compiled BPF program's arch check is built with.
///
/// x86_64 value (`AUDIT_ARCH_X86_64`).
#[cfg(target_arch = "x86_64")]
pub const AUDIT_ARCH: u32 = 0xc000_003e;
/// aarch64 value (`AUDIT_ARCH_AARCH64`).
#[cfg(target_arch = "aarch64")]
pub const AUDIT_ARCH: u32 = 0xc000_00b7;
/// Sentinel for unsupported targets: the seccomp install helpers treat
/// `0` as "no filter can be built" and return without installing one.
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
pub const AUDIT_ARCH: u32 = 0;