From 22e54a6ca990fd463651fd78e61782acace2d11b Mon Sep 17 00:00:00 2001
From: Adrien Langou
Date: Thu, 25 Jun 2026 15:11:29 +0200
Subject: [PATCH] fix(supervisor): drop sandbox child capability bounding set

Reduce the Linux capability bounding set in the common privilege-drop
path before executing sandbox workloads or connect shells.

Capability numbers that the running kernel rejects with EINVAL are
skipped, so the cap_last_cap fallback cannot abort child setup on
kernels that know fewer capabilities.

Signed-off-by: Adrien Langou
---
 architecture/sandbox.md                       |   5 +-
 .../src/process.rs                            | 197 ++++++++++++++++++
 2 files changed, 201 insertions(+), 1 deletion(-)

diff --git a/architecture/sandbox.md b/architecture/sandbox.md
index 2552304e1..e735f0db1 100644
--- a/architecture/sandbox.md
+++ b/architecture/sandbox.md
@@ -14,7 +14,10 @@ Each sandbox workload has two trust levels:
 | Agent child | Runs as an unprivileged user with filesystem, process, and network restrictions applied. |
 
 The supervisor keeps enough privilege to manage the sandbox, but the agent child
-loses that privilege before user code runs.
+loses that privilege before user code runs. On Linux, child setup also empties
+the capability bounding set before executing the workload or SSH shell so later
+execs cannot regain container-granted capabilities. This step is skipped with a
+warning when `CAP_SETPCAP` is not available to the child setup.
 
 ## Startup Flow
 
diff --git a/crates/openshell-supervisor-process/src/process.rs b/crates/openshell-supervisor-process/src/process.rs
index 9f9fe1822..adfc9b9c4 100644
--- a/crates/openshell-supervisor-process/src/process.rs
+++ b/crates/openshell-supervisor-process/src/process.rs
@@ -79,6 +79,16 @@ pub fn harden_child_process() -> Result<()> {
 
 #[cfg(target_os = "linux")]
 const CGROUP_PIDS_MAX_PATH: &str = "/sys/fs/cgroup/pids.max";
+#[cfg(target_os = "linux")]
+const PROC_CAP_LAST_CAP_PATH: &str = "/proc/sys/kernel/cap_last_cap";
+#[cfg(target_os = "linux")]
+const PROC_SELF_STATUS_PATH: &str = "/proc/self/status";
+// `CAP_SETPCAP` capability number from `linux/capability.h`.
+#[cfg(target_os = "linux")]
+const LINUX_CAP_SETPCAP: u32 = 8;
+// Highest capability number assumed when `cap_last_cap` cannot be read.
+#[cfg(target_os = "linux")]
+const LINUX_CAP_LAST_CAP_FALLBACK: u32 = 40;
 
 #[cfg(target_os = "linux")]
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -155,6 +165,112 @@ fn parse_pids_max(contents: &str) -> RuntimePidLimitStatus {
     }
 }
 
+/// Parses the contents of `/proc/sys/kernel/cap_last_cap`.
+#[cfg(target_os = "linux")]
+fn parse_cap_last_cap(contents: &str) -> Option<u32> {
+    contents.trim().parse::<u32>().ok()
+}
+
+/// Returns the highest capability number reported by the kernel, or
+/// `LINUX_CAP_LAST_CAP_FALLBACK` when procfs cannot be read or parsed.
+#[cfg(target_os = "linux")]
+fn kernel_cap_last_cap() -> u32 {
+    std::fs::read_to_string(PROC_CAP_LAST_CAP_PATH)
+        .ok()
+        .and_then(|contents| parse_cap_last_cap(&contents))
+        .unwrap_or(LINUX_CAP_LAST_CAP_FALLBACK)
+}
+
+/// Lists capabilities `0..=cap_last_cap` with `CAP_SETPCAP` placed last, so the
+/// capability that authorizes `PR_CAPBSET_DROP` is the final one removed.
+#[cfg(target_os = "linux")]
+fn capability_drop_order(cap_last_cap: u32) -> Vec<u32> {
+    let mut caps = (0..=cap_last_cap)
+        .filter(|cap| *cap != LINUX_CAP_SETPCAP)
+        .collect::<Vec<_>>();
+
+    if LINUX_CAP_SETPCAP <= cap_last_cap {
+        caps.push(LINUX_CAP_SETPCAP);
+    }
+
+    caps
+}
+
+/// Extracts the hexadecimal mask of a `Cap*` field (for example `CapEff`) from
+/// `/proc/<pid>/status` text.
+#[cfg(target_os = "linux")]
+fn parse_proc_status_cap(status: &str, field: &str) -> Option<u64> {
+    status.lines().find_map(|line| {
+        let value = line.strip_prefix(field)?.strip_prefix(':')?.trim();
+        u64::from_str_radix(value, 16).ok()
+    })
+}
+
+/// Reads one capability mask of the current process from `/proc/self/status`.
+#[cfg(target_os = "linux")]
+fn proc_self_status_cap(field: &str) -> Option<u64> {
+    std::fs::read_to_string(PROC_SELF_STATUS_PATH)
+        .ok()
+        .and_then(|status| parse_proc_status_cap(&status, field))
+}
+
+/// Reports whether `cap` is in the effective set of the current process, or
+/// `None` when `CapEff` cannot be read from `/proc/self/status`.
+#[cfg(target_os = "linux")]
+fn effective_capability_is_set(cap: u32) -> Option<bool> {
+    let mask = proc_self_status_cap("CapEff")?;
+    if cap >= u64::BITS {
+        return Some(false);
+    }
+    Some((mask & (1_u64 << cap)) != 0)
+}
+
+/// Drops every capability from the bounding set of the calling process.
+///
+/// Leaves the bounding set unchanged, with a warning, when `CAP_SETPCAP` is
+/// unavailable. Capability numbers the kernel rejects with `EINVAL` are skipped.
+///
+/// # Errors
+///
+/// Returns an error when `PR_CAPBSET_DROP` fails for any other reason.
+#[cfg(target_os = "linux")]
+fn drop_capability_bounding_set() -> Result<()> {
+    if matches!(effective_capability_is_set(LINUX_CAP_SETPCAP), Some(false)) {
+        tracing::warn!(
+            "CAP_SETPCAP is not effective; leaving sandbox child capability bounding set unchanged"
+        );
+        return Ok(());
+    }
+
+    let mut dropped_any = false;
+    for cap in capability_drop_order(kernel_cap_last_cap()) {
+        // SAFETY: PR_CAPBSET_DROP takes integer arguments only and reads no memory through them.
+        #[allow(unsafe_code)]
+        let rc = unsafe { libc::prctl(libc::PR_CAPBSET_DROP, libc::c_ulong::from(cap), 0, 0, 0) };
+        if rc != 0 {
+            let err = std::io::Error::last_os_error();
+            // The fallback upper bound can overshoot an older kernel, which
+            // rejects unknown capability numbers with EINVAL; nothing to drop.
+            if err.raw_os_error() == Some(libc::EINVAL) {
+                continue;
+            }
+            if !dropped_any && err.raw_os_error() == Some(libc::EPERM) {
+                tracing::warn!(
+                    cap,
+                    "CAP_SETPCAP is unavailable; leaving sandbox child capability bounding set unchanged"
+                );
+                return Ok(());
+            }
+            return Err(miette::miette!(
+                "Failed to drop capability {cap} from child bounding set: {err}"
+            ));
+        }
+        dropped_any = true;
+    }
+
+    Ok(())
+}
+
 // Pins the pre-seccomp child mount namespace where supervisor identity sockets
 // are shadowed. Children enter it with setns before dropping privileges.
 #[cfg(target_os = "linux")]
@@ -969,6 +1085,9 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> {
         ));
     }
 
+    #[cfg(target_os = "linux")]
+    drop_capability_bounding_set()?;
+
     if user_name.is_some() {
         nix::unistd::setuid(user.uid).into_diagnostic()?;
 
@@ -1083,6 +1202,39 @@ mod tests {
         );
     }
 
+    #[test]
+    #[cfg(target_os = "linux")]
+    fn parse_cap_last_cap_accepts_kernel_value() {
+        assert_eq!(parse_cap_last_cap("40\n"), Some(40));
+        assert_eq!(parse_cap_last_cap("not-a-number\n"), None);
+    }
+
+    #[test]
+    #[cfg(target_os = "linux")]
+    fn capability_drop_order_places_cap_setpcap_last() {
+        let order = capability_drop_order(LINUX_CAP_SETPCAP + 2);
+
+        assert_eq!(order.last(), Some(&LINUX_CAP_SETPCAP));
+        assert_eq!(order.len(), usize::try_from(LINUX_CAP_SETPCAP + 3).unwrap());
+        assert_eq!(
+            order
+                .iter()
+                .filter(|cap| **cap == LINUX_CAP_SETPCAP)
+                .count(),
+            1
+        );
+    }
+
+    #[test]
+    #[cfg(target_os = "linux")]
+    fn parse_proc_status_cap_reads_hex_value() {
+        let status = "Name:\ttest\nCapEff:\t0000000000000100\nCapBnd:\t0000000000000000\n";
+
+        assert_eq!(parse_proc_status_cap(status, "CapEff"), Some(0x100));
+        assert_eq!(parse_proc_status_cap(status, "CapBnd"), Some(0));
+        assert_eq!(parse_proc_status_cap(status, "CapAmb"), None);
+    }
+
     #[test]
     fn drop_privileges_noop_when_no_user_or_group() {
         let policy = policy_with_process(ProcessPolicy {
@@ -1133,6 +1285,51 @@ mod tests {
         assert!(drop_privileges(&policy).is_ok());
     }
 
+    #[test]
+    #[cfg(target_os = "linux")]
+    #[allow(unsafe_code)]
+    fn drop_privileges_clears_bounding_set_for_spawned_child_when_permitted() {
+        use std::os::unix::process::CommandExt;
+
+        if !matches!(effective_capability_is_set(LINUX_CAP_SETPCAP), Some(true)) {
+            eprintln!("skipping: CAP_SETPCAP is not effective in this test environment");
+            return;
+        }
+
+        let current_group = Group::from_gid(nix::unistd::getegid())
+            .expect("getgrgid")
+            .expect("current group entry");
+
+        let policy = policy_with_process(ProcessPolicy {
+            run_as_user: None,
+            run_as_group: Some(current_group.name),
+        });
+
+        let mut cmd = std::process::Command::new("/bin/sh");
+        cmd.arg("-c")
+            .arg("grep '^CapBnd:' /proc/self/status")
+            .stdin(StdStdio::null())
+            .stdout(StdStdio::piped())
+            .stderr(StdStdio::piped());
+
+        unsafe {
+            cmd.pre_exec(move || {
+                drop_privileges(&policy).map_err(|err| std::io::Error::other(err.to_string()))
+            });
+        }
+
+        let output = cmd.output().expect("spawn child status probe");
+        assert!(
+            output.status.success(),
+            "status probe failed: {}",
+            String::from_utf8_lossy(&output.stderr)
+        );
+        let stdout = String::from_utf8(output.stdout).expect("utf8 stdout");
+        let cap_bnd = parse_proc_status_cap(&stdout, "CapBnd").expect("CapBnd in child status");
+
+        assert_eq!(cap_bnd, 0, "child CapBnd should be empty after exec");
+    }
+
     #[test]
     #[ignore = "initgroups(3) requires CAP_SETGID; run as root: sudo cargo test -- --ignored"]
     fn drop_privileges_succeeds_for_current_user() {