NVIDIA · alangou · Jun 25, 2026 · elezar · Jun 25, 2026 · alangou
@@ -14,7 +14,9 @@ Each sandbox workload has two trust levels:
 | Agent child | Runs as an unprivileged user with filesystem, process, and network restrictions applied. |
 
 The supervisor keeps enough privilege to manage the sandbox, but the agent child
-loses that privilege before user code runs.
+loses that privilege before user code runs. On Linux, child setup also removes
+the capability bounding set before executing the workload or SSH shell so later
+execs cannot regain container-granted capabilities.
 
 ## Startup Flow
 

@@ -79,6 +79,14 @@ pub fn harden_child_process() -> Result<()> {
 
 #[cfg(target_os = "linux")]
 const CGROUP_PIDS_MAX_PATH: &str = "/sys/fs/cgroup/pids.max";
+#[cfg(target_os = "linux")]
+const PROC_CAP_LAST_CAP_PATH: &str = "/proc/sys/kernel/cap_last_cap";
+#[cfg(target_os = "linux")]
+const PROC_SELF_STATUS_PATH: &str = "/proc/self/status";
+#[cfg(target_os = "linux")]
+const LINUX_CAP_SETPCAP: u32 = 8;
+#[cfg(target_os = "linux")]
+const LINUX_CAP_LAST_CAP_FALLBACK: u32 = 40;
 
 #[cfg(target_os = "linux")]
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -155,6 +163,88 @@ fn parse_pids_max(contents: &str) -> RuntimePidLimitStatus {
     }
 }
 
+#[cfg(target_os = "linux")]
+fn parse_cap_last_cap(contents: &str) -> Option<u32> {
+    contents.trim().parse::<u32>().ok()
+}
+
+#[cfg(target_os = "linux")]
+fn kernel_cap_last_cap() -> u32 {
+    std::fs::read_to_string(PROC_CAP_LAST_CAP_PATH)
+        .ok()
+        .and_then(|contents| parse_cap_last_cap(&contents))
+        .unwrap_or(LINUX_CAP_LAST_CAP_FALLBACK)
+}
+
+#[cfg(target_os = "linux")]
+fn capability_drop_order(cap_last_cap: u32) -> Vec<u32> {
+    let mut caps = (0..=cap_last_cap)
+        .filter(|cap| *cap != LINUX_CAP_SETPCAP)
+        .collect::<Vec<_>>();
+
+    if LINUX_CAP_SETPCAP <= cap_last_cap {
+        caps.push(LINUX_CAP_SETPCAP);
+    }
+
+    caps
+}
+
+#[cfg(target_os = "linux")]
+fn parse_proc_status_cap(status: &str, field: &str) -> Option<u64> {
+    status.lines().find_map(|line| {
+        let value = line.strip_prefix(field)?.strip_prefix(':')?.trim();
+        u64::from_str_radix(value, 16).ok()
+    })
+}
+
+#[cfg(target_os = "linux")]
+fn proc_self_status_cap(field: &str) -> Option<u64> {
+    std::fs::read_to_string(PROC_SELF_STATUS_PATH)
+        .ok()
+        .and_then(|status| parse_proc_status_cap(&status, field))
+}
+
+#[cfg(target_os = "linux")]
+fn effective_capability_is_set(cap: u32) -> Option<bool> {
+    let mask = proc_self_status_cap("CapEff")?;
+    if cap >= u64::BITS {
+        return Some(false);
+    }
+    Some((mask & (1_u64 << cap)) != 0)
+}
+
+#[cfg(target_os = "linux")]
+fn drop_capability_bounding_set() -> Result<()> {
+    if matches!(effective_capability_is_set(LINUX_CAP_SETPCAP), Some(false)) {
+        tracing::warn!(
+            "CAP_SETPCAP is not effective; leaving sandbox child capability bounding set unchanged"
+        );
+        return Ok(());
+    }
+
+    let mut dropped_any = false;
+    for cap in capability_drop_order(kernel_cap_last_cap()) {
+        #[allow(unsafe_code)]
+        let rc = unsafe { libc::prctl(libc::PR_CAPBSET_DROP, libc::c_ulong::from(cap), 0, 0, 0) };
+        if rc != 0 {
+            let err = std::io::Error::last_os_error();
+            if !dropped_any && err.raw_os_error() == Some(libc::EPERM) {
+                tracing::warn!(
+                    cap,
+                    "CAP_SETPCAP is unavailable; leaving sandbox child capability bounding set unchanged"
+                );
+                return Ok(());
+            }
+            return Err(miette::miette!(
+                "Failed to drop capability {cap} from child bounding set: {err}"
+            ));
+        }
+        dropped_any = true;
+    }
+
+    Ok(())
+}
+
 // Pins the pre-seccomp child mount namespace where supervisor identity sockets
 // are shadowed. Children enter it with setns before dropping privileges.
 #[cfg(target_os = "linux")]
@@ -969,6 +1059,9 @@ pub fn drop_privileges(policy: &SandboxPolicy) -> Result<()> {
         ));
     }
 
+    #[cfg(target_os = "linux")]
+    drop_capability_bounding_set()?;
+
     if user_name.is_some() {
         nix::unistd::setuid(user.uid).into_diagnostic()?;
 
@@ -1083,6 +1176,39 @@ mod tests {
         );
     }
 
+    #[test]
+    #[cfg(target_os = "linux")]
+    fn parse_cap_last_cap_accepts_kernel_value() {
+        assert_eq!(parse_cap_last_cap("40\n"), Some(40));
+        assert_eq!(parse_cap_last_cap("not-a-number\n"), None);
+    }
+
+    #[test]
+    #[cfg(target_os = "linux")]
+    fn capability_drop_order_places_cap_setpcap_last() {
+        let order = capability_drop_order(LINUX_CAP_SETPCAP + 2);
+
+        assert_eq!(order.last(), Some(&LINUX_CAP_SETPCAP));
+        assert_eq!(order.len(), usize::try_from(LINUX_CAP_SETPCAP + 3).unwrap());
+        assert_eq!(
+            order
+                .iter()
+                .filter(|cap| **cap == LINUX_CAP_SETPCAP)
+                .count(),
+            1
+        );
+    }
+
+    #[test]
+    #[cfg(target_os = "linux")]
+    fn parse_proc_status_cap_reads_hex_value() {
+        let status = "Name:\ttest\nCapEff:\t0000000000000100\nCapBnd:\t0000000000000000\n";
+
+        assert_eq!(parse_proc_status_cap(status, "CapEff"), Some(0x100));
+        assert_eq!(parse_proc_status_cap(status, "CapBnd"), Some(0));
+        assert_eq!(parse_proc_status_cap(status, "CapAmb"), None);
+    }
+
     #[test]
     fn drop_privileges_noop_when_no_user_or_group() {
         let policy = policy_with_process(ProcessPolicy {
@@ -1133,6 +1259,51 @@ mod tests {
         assert!(drop_privileges(&policy).is_ok());
     }
 
+    #[test]
+    #[cfg(target_os = "linux")]
+    #[allow(unsafe_code)]
+    fn drop_privileges_clears_bounding_set_for_spawned_child_when_permitted() {
+        use std::os::unix::process::CommandExt;
+
+        if !matches!(effective_capability_is_set(LINUX_CAP_SETPCAP), Some(true)) {
+            eprintln!("skipping: CAP_SETPCAP is not effective in this test environment");
+            return;
+        }
+
+        let current_group = Group::from_gid(nix::unistd::getegid())
+            .expect("getgrgid")
+            .expect("current group entry");
+
+        let policy = policy_with_process(ProcessPolicy {
+            run_as_user: None,
+            run_as_group: Some(current_group.name),
+        });
+
+        let mut cmd = std::process::Command::new("/bin/sh");
+        cmd.arg("-c")
+            .arg("grep '^CapBnd:' /proc/self/status")
+            .stdin(StdStdio::null())
+            .stdout(StdStdio::piped())
+            .stderr(StdStdio::piped());
+
+        unsafe {
+            cmd.pre_exec(move || {
+                drop_privileges(&policy).map_err(|err| std::io::Error::other(err.to_string()))
+            });
+        }
+
+        let output = cmd.output().expect("spawn child status probe");
+        assert!(
+            output.status.success(),
+            "status probe failed: {}",
+            String::from_utf8_lossy(&output.stderr)
+        );
+        let stdout = String::from_utf8(output.stdout).expect("utf8 stdout");
+        let cap_bnd = parse_proc_status_cap(&stdout, "CapBnd").expect("CapBnd in child status");
+
+        assert_eq!(cap_bnd, 0, "child CapBnd should be empty after exec");
+    }
+
     #[test]
     #[ignore = "initgroups(3) requires CAP_SETGID; run as root: sudo cargo test -- --ignored"]
     fn drop_privileges_succeeds_for_current_user() {