diff --git a/crates/sandlock-core/src/arch.rs b/crates/sandlock-core/src/arch.rs index 89969e4..c18492e 100644 --- a/crates/sandlock-core/src/arch.rs +++ b/crates/sandlock-core/src/arch.rs @@ -8,6 +8,7 @@ mod imp { pub const SYS_MEMFD_CREATE: i64 = 319; pub const SYS_PIDFD_OPEN: i64 = 434; pub const SYS_PIDFD_GETFD: i64 = 438; + pub const SYS_OPENAT2: i64 = 437; pub const SYS_OPEN: Option = Some(libc::SYS_open); pub const SYS_STAT: Option = Some(libc::SYS_stat); @@ -51,6 +52,7 @@ mod imp { pub const SYS_MEMFD_CREATE: i64 = 279; pub const SYS_PIDFD_OPEN: i64 = 434; pub const SYS_PIDFD_GETFD: i64 = 438; + pub const SYS_OPENAT2: i64 = 437; pub const SYS_OPEN: Option = None; pub const SYS_STAT: Option = None; diff --git a/crates/sandlock-core/src/context.rs b/crates/sandlock-core/src/context.rs index c03282d..a623fa4 100644 --- a/crates/sandlock-core/src/context.rs +++ b/crates/sandlock-core/src/context.rs @@ -308,8 +308,18 @@ pub fn notif_syscalls(policy: &Sandbox, sandbox_name: Option<&str>) -> Vec nrs.push(libc::SYS_openat as u32); } - // /proc virtualization (always on: PID filtering, sensitive path blocking) + // /proc virtualization + /etc/hosts virtualization (always on). + // + // `openat` carries the simple `(AT_FDCWD, "/proc/...")` and + // `(AT_FDCWD, "/etc/hosts")` spellings; `openat2` is the same shape + // on newer libcs; legacy `open(path, ...)` is the same path without a + // dirfd. The handlers normalize all three into a single absolute path + // check, so we have to put every variant on the notif list — otherwise + // a caller that picks `open` or `openat2` slips past virtualization + // and reads the real on-disk file. nrs.push(libc::SYS_openat as u32); + nrs.push(arch::SYS_OPENAT2 as u32); + arch::push_optional_syscall(&mut nrs, arch::SYS_OPEN); nrs.push(libc::SYS_getdents64 as u32); arch::push_optional_syscall(&mut nrs, arch::SYS_GETDENTS); diff --git a/crates/sandlock-core/src/network.rs b/crates/sandlock-core/src/network.rs index c0bdaa6..2310895 100644 --- a/crates/sandlock-core/src/network.rs +++ b/crates/sandlock-core/src/network.rs @@ -1037,9 +1037,12 @@ pub struct ResolvedNetAllowSet { pub tcp: ResolvedNetAllow, pub udp: ResolvedNetAllow, pub icmp: ResolvedNetAllow, - /// Synthetic `/etc/hosts` content combining every concrete host - /// across all protocols. `None` when no concrete hostnames appear. - pub etc_hosts: Option, + /// ` \n` lines from every concrete-host rule across + /// every protocol, in resolution order. Empty when no concrete-host + /// rules are present. Combined with the loopback base (or, in chroot + /// mode, the image's `/etc/hosts`) by [`compose_virtual_etc_hosts`] + /// to build the synthetic file served to the sandbox. + pub concrete_host_entries: String, } /// Resolve `--net-allow` rules into per-protocol runtime allowlists. @@ -1053,18 +1056,12 @@ pub struct ResolvedNetAllowSet { pub async fn resolve_net_allow( rules: &[NetAllow], ) -> io::Result { - // Single shared etc_hosts for all protocols. Every concrete host - // (regardless of protocol) ends up resolvable in the sandbox. - let mut etc_hosts = String::from("127.0.0.1 localhost\n::1 localhost\n"); - let mut has_concrete_host = false; - let per_proto = |target: Protocol| async move { let mut per_ip: HashMap> = HashMap::new(); let mut per_ip_all_ports: HashSet = HashSet::new(); let mut any_ip_ports: HashSet = HashSet::new(); let mut any_ip_all_ports = false; let mut local_etc_hosts = String::new(); - let mut local_has_concrete = false; for rule in rules.iter().filter(|r| r.protocol == target) { match &rule.host { @@ -1080,7 +1077,6 @@ pub async fn resolve_net_allow( } } Some(host) => { - local_has_concrete = true; let addr = format!("{}:0", host); let resolved = tokio::net::lookup_host(addr.as_str()).await.map_err(|e| { io::Error::new( @@ -1113,27 +1109,85 @@ pub async fn resolve_net_allow( any_ip_all_ports, }, local_etc_hosts, - local_has_concrete, )) }; - let (tcp, tcp_eh, tcp_concrete) = per_proto(Protocol::Tcp).await?; - let (udp, udp_eh, udp_concrete) = per_proto(Protocol::Udp).await?; - let (icmp, icmp_eh, icmp_concrete) = per_proto(Protocol::Icmp).await?; + let (tcp, tcp_eh) = per_proto(Protocol::Tcp).await?; + let (udp, udp_eh) = per_proto(Protocol::Udp).await?; + let (icmp, icmp_eh) = per_proto(Protocol::Icmp).await?; + let mut concrete_host_entries = String::new(); for chunk in [tcp_eh, udp_eh, icmp_eh] { - etc_hosts.push_str(&chunk); + concrete_host_entries.push_str(&chunk); } - has_concrete_host |= tcp_concrete || udp_concrete || icmp_concrete; Ok(ResolvedNetAllowSet { tcp, udp, icmp, - etc_hosts: if has_concrete_host { Some(etc_hosts) } else { None }, + concrete_host_entries, }) } +/// Compose the synthetic `/etc/hosts` served to the sandbox. +/// +/// - **No chroot**: emit the fixed loopback base +/// (`127.0.0.1 localhost\n::1 localhost\n`) followed by the +/// concrete-host entries from [`resolve_net_allow`]. The sandbox sees +/// the same baseline regardless of what the host's on-disk file says. +/// - **With chroot**: read `/etc/hosts` and use it as the base +/// (an image that bakes in private-registry entries or similar keeps +/// them). Inject loopback entries only for any localhost family the +/// image doesn't already cover — never both, so we don't duplicate +/// what the image already has. Concrete-host entries are still +/// appended on top. +/// +/// If a chroot is set but `/etc/hosts` is unreadable (absent, +/// permission denied, etc.), fall back to the bare loopback base — the +/// sandbox always sees a usable hosts file. +pub fn compose_virtual_etc_hosts( + chroot_root: Option<&std::path::Path>, + concrete_host_entries: &str, +) -> String { + let mut out = String::new(); + let mut has_v4_localhost = false; + let mut has_v6_localhost = false; + + if let Some(root) = chroot_root { + if let Ok(image) = std::fs::read_to_string(root.join("etc").join("hosts")) { + for line in image.lines() { + // Strip an inline `#` comment before tokenizing — the + // hosts(5) format treats everything after `#` as a comment. + let stripped = line.split('#').next().unwrap_or(""); + let mut parts = stripped.split_whitespace(); + let Some(ip) = parts.next() else { continue }; + for name in parts { + if name == "localhost" { + if ip == "127.0.0.1" { + has_v4_localhost = true; + } else if ip == "::1" { + has_v6_localhost = true; + } + } + } + } + out.push_str(&image); + if !out.is_empty() && !out.ends_with('\n') { + out.push('\n'); + } + } + } + + if !has_v4_localhost { + out.push_str("127.0.0.1 localhost\n"); + } + if !has_v6_localhost { + out.push_str("::1 localhost\n"); + } + out.push_str(concrete_host_entries); + out +} + // ============================================================ // Tests // ============================================================ @@ -1318,7 +1372,8 @@ mod tests { assert!(resolved.tcp.any_ip_ports.is_empty()); assert!(resolved.udp.per_ip.is_empty()); assert!(resolved.icmp.per_ip.is_empty()); - assert!(resolved.etc_hosts.is_none()); + // No concrete-host rules → no resolved-entry lines. + assert!(resolved.concrete_host_entries.is_empty()); } #[tokio::test] @@ -1339,7 +1394,8 @@ mod tests { } assert!(resolved.udp.per_ip.is_empty()); assert!(resolved.icmp.per_ip.is_empty()); - assert!(resolved.etc_hosts.as_deref().unwrap_or("").contains("localhost")); + // The resolved entry (` localhost`) surfaces in concrete_host_entries. + assert!(resolved.concrete_host_entries.contains("127.0.0.1 localhost")); } #[tokio::test] @@ -1354,7 +1410,8 @@ mod tests { assert!(resolved.tcp.per_ip.is_empty()); assert!(resolved.tcp.any_ip_ports.contains(&8080)); assert!(!resolved.tcp.any_ip_all_ports); - assert!(resolved.etc_hosts.is_none()); + // Any-IP rule has no concrete host, so no resolved-entry line. + assert!(resolved.concrete_host_entries.is_empty()); } #[tokio::test] @@ -1394,7 +1451,7 @@ mod tests { for ip in resolved.tcp.per_ip_all_ports.iter() { assert!(resolved.tcp.per_ip.contains_key(ip)); } - assert!(resolved.etc_hosts.is_some()); + assert!(resolved.concrete_host_entries.contains("localhost")); } #[tokio::test] @@ -1499,4 +1556,108 @@ mod tests { assert!(resolved.icmp.any_ip_all_ports); assert!(!resolved.tcp.any_ip_all_ports); } + + // ============================================================ + // compose_virtual_etc_hosts — synthetic /etc/hosts assembly + // ============================================================ + + use std::io::Write; + + fn temp_rootfs_with_hosts(name: &str, hosts_content: Option<&str>) -> std::path::PathBuf { + let dir = std::env::temp_dir().join(format!( + "sandlock-test-compose-hosts-{}-{}", + name, std::process::id() + )); + let _ = std::fs::create_dir_all(dir.join("etc")); + if let Some(content) = hosts_content { + let mut f = std::fs::File::create(dir.join("etc").join("hosts")).unwrap(); + f.write_all(content.as_bytes()).unwrap(); + } + dir + } + + #[test] + fn compose_no_chroot_emits_loopback_base() { + // Default path — no chroot, no concrete-host rules → the same + // fixed loopback view we promise every sandbox. + let out = compose_virtual_etc_hosts(None, ""); + assert_eq!(out, "127.0.0.1 localhost\n::1 localhost\n"); + } + + #[test] + fn compose_no_chroot_appends_concrete_entries() { + let out = compose_virtual_etc_hosts(None, "10.0.0.1 api\n"); + assert_eq!(out, "127.0.0.1 localhost\n::1 localhost\n10.0.0.1 api\n"); + } + + #[test] + fn compose_chroot_seeds_from_image_and_injects_missing_loopback() { + // Image ships an entry of its own but no localhost mapping; the + // shim must keep the image's content and inject both loopback + // entries on top so the always-on guarantee still holds. + let rootfs = temp_rootfs_with_hosts( + "no-localhost", + Some("10.0.0.5 myimage.local\n"), + ); + let out = compose_virtual_etc_hosts(Some(&rootfs), ""); + assert!(out.contains("10.0.0.5 myimage.local"), "image entry missing: {out}"); + assert!(out.contains("127.0.0.1 localhost"), "v4 loopback missing: {out}"); + assert!(out.contains("::1 localhost"), "v6 loopback missing: {out}"); + let _ = std::fs::remove_dir_all(&rootfs); + } + + #[test] + fn compose_chroot_does_not_duplicate_existing_loopback() { + // Image already has both loopback entries — don't append duplicates. + let rootfs = temp_rootfs_with_hosts( + "both-localhost", + Some("127.0.0.1 localhost\n::1 localhost\n10.0.0.5 myimage.local\n"), + ); + let out = compose_virtual_etc_hosts(Some(&rootfs), ""); + assert_eq!(out.matches("127.0.0.1 localhost").count(), 1, "v4 dup'd: {out}"); + assert_eq!(out.matches("::1 localhost").count(), 1, "v6 dup'd: {out}"); + assert!(out.contains("10.0.0.5 myimage.local")); + let _ = std::fs::remove_dir_all(&rootfs); + } + + #[test] + fn compose_chroot_injects_only_missing_family() { + // Image has v4 but no v6 localhost — inject only v6, leave v4 alone. + let rootfs = temp_rootfs_with_hosts( + "only-v4-localhost", + Some("127.0.0.1 localhost myimage\n"), + ); + let out = compose_virtual_etc_hosts(Some(&rootfs), ""); + assert_eq!(out.matches("127.0.0.1 localhost").count(), 1); + assert!(out.contains("::1 localhost"), "v6 loopback should be injected: {out}"); + let _ = std::fs::remove_dir_all(&rootfs); + } + + #[test] + fn compose_chroot_missing_file_falls_back_to_loopback() { + // Chroot exists but has no /etc/hosts — fall back to the bare + // loopback base so the sandbox always sees a usable file. + let rootfs = temp_rootfs_with_hosts("no-file", None); + let out = compose_virtual_etc_hosts(Some(&rootfs), "10.0.0.1 api\n"); + assert_eq!(out, "127.0.0.1 localhost\n::1 localhost\n10.0.0.1 api\n"); + let _ = std::fs::remove_dir_all(&rootfs); + } + + #[test] + fn compose_chroot_strips_inline_comments_when_detecting_loopback() { + // hosts(5) treats `#` as a comment-start; the loopback-presence + // check must respect it (otherwise an image line like + // `127.0.0.1 # localhost` would be falsely treated as covering v4). + let rootfs = temp_rootfs_with_hosts( + "with-comments", + Some("127.0.0.1 # localhost is a comment here\n"), + ); + let out = compose_virtual_etc_hosts(Some(&rootfs), ""); + // Real `127.0.0.1 localhost` line must still be injected. + assert!( + out.lines().any(|l| l.trim() == "127.0.0.1 localhost"), + "v4 loopback should still be injected: {out}" + ); + let _ = std::fs::remove_dir_all(&rootfs); + } } diff --git a/crates/sandlock-core/src/procfs.rs b/crates/sandlock-core/src/procfs.rs index 9907aa5..09b2ded 100644 --- a/crates/sandlock-core/src/procfs.rs +++ b/crates/sandlock-core/src/procfs.rs @@ -411,16 +411,21 @@ pub(crate) async fn handle_proc_open( policy: &NotifPolicy, notif_fd: RawFd, ) -> NotifAction { - // openat(dirfd, pathname, flags, mode) - // args[0] = dirfd, args[1] = pathname pointer - let path_ptr = notif.data.args[1]; - let path = match read_path(notif, path_ptr, notif_fd) { + // Resolve open/openat/openat2 to a normalized absolute path so the + // sensitive-path deny, the per-PID filter, and the virtualization + // string-matches below all see the same canonical form regardless of + // how the caller spelled it (dirfd-relative, `..`-laden, etc.). + let resolved = match resolve_open_target(notif, notif_fd) { + Some(p) => p, + None => return NotifAction::Continue, + }; + let path = match resolved.to_str() { Some(p) => p, None => return NotifAction::Continue, }; // Block sensitive paths. - if is_sensitive_proc(&path) { + if is_sensitive_proc(path) { return NotifAction::Errno(EACCES); } @@ -428,7 +433,7 @@ pub(crate) async fn handle_proc_open( // This complements the getdents64 PID filtering — directory listings // already hide non-sandbox PIDs, but without this check a process // could still open /proc/{ppid}/cmdline (or any guessed PID) directly. - if let Some(pid) = extract_proc_pid(&path) { + if let Some(pid) = extract_proc_pid(path) { if !processes.contains(pid) { return NotifAction::Errno(EACCES); } @@ -586,44 +591,127 @@ pub(crate) fn handle_uname( } } -/// Handle openat targeting /etc/hostname — return a memfd with the virtual hostname. +/// Handle open/openat/openat2 targeting /etc/hostname — return a memfd +/// with the virtual hostname. Path is resolved and lexically normalized +/// via [`resolve_open_target`] so dirfd-relative and non-canonical +/// spellings all hit the shim. pub(crate) fn handle_hostname_open( notif: &SeccompNotif, hostname: &str, notif_fd: RawFd, ) -> Option { - let path_ptr = notif.data.args[1]; - let path = read_path(notif, path_ptr, notif_fd)?; - - if path != "/etc/hostname" { + let resolved = resolve_open_target(notif, notif_fd)?; + if resolved != std::path::Path::new("/etc/hostname") { return None; } - let content = format!("{}\n", hostname); Some(inject_memfd(content.as_bytes())) } -/// Intercept `openat("/etc/hosts")` and return a memfd with virtual content. +/// Intercept any `open`/`openat`/`openat2` of `/etc/hosts` and return a memfd +/// with virtual content. /// -/// When `net_allow` contains concrete hostnames, the supervisor pre-resolves -/// them and builds a synthetic `/etc/hosts`. This lets sandboxed processes -/// resolve allowed hostnames via glibc's `files` NSS backend without contacting -/// a DNS server. +/// Every sandbox gets a fixed loopback view (`127.0.0.1 localhost` / +/// `::1 localhost`) plus any concrete hostnames pre-resolved from +/// `net_allow`, so the host's on-disk `/etc/hosts` never leaks in and +/// glibc's `files` NSS backend resolves allowed hostnames without DNS. pub(crate) fn handle_etc_hosts_open( notif: &SeccompNotif, etc_hosts_content: &str, notif_fd: RawFd, ) -> Option { - let path_ptr = notif.data.args[1]; - let path = read_path(notif, path_ptr, notif_fd)?; - - if path != "/etc/hosts" { + let resolved = resolve_open_target(notif, notif_fd)?; + if resolved != std::path::Path::new("/etc/hosts") { return None; } - Some(inject_memfd(etc_hosts_content.as_bytes())) } +/// Resolve the path argument of an open-family syscall (`open`, `openat`, +/// or `openat2`) to a lexically-normalized absolute host-side path. +/// +/// Used by every `openat`-shaped handler so the security and +/// virtualization checks operate on the same canonical form regardless +/// of how the caller spelled the path. The literal-string compare used +/// before this helper missed four bypass shapes: legacy `open`, +/// `openat2`, dirfd-relative spellings like `openat(open("/etc"), +/// "hosts", ...)`, and non-canonical absolutes like `/etc/../etc/hosts` +/// or `//etc/hosts`. +/// +/// Returns `None` if the notif isn't an open variant, the path can't be +/// read from child memory, the dirfd can't be resolved, or the path +/// walks above `/`. Callers treat `None` as "fall through to the kernel" +/// (`NotifAction::Continue`). +pub(crate) fn resolve_open_target( + notif: &SeccompNotif, + notif_fd: RawFd, +) -> Option { + let nr = notif.data.nr as i64; + let (dirfd, path_ptr): (i64, u64) = if Some(nr) == crate::arch::SYS_OPEN { + // open(path, flags, mode) — no dirfd, behaves as AT_FDCWD. + (libc::AT_FDCWD as i64, notif.data.args[0]) + } else if nr == libc::SYS_openat || nr == crate::arch::SYS_OPENAT2 { + // openat(dirfd, path, ...) and openat2(dirfd, path, ...) share + // the same first two argument slots. + (notif.data.args[0] as i64, notif.data.args[1]) + } else { + return None; + }; + let path = read_path(notif, path_ptr, notif_fd)?; + resolve_to_normalized_absolute(notif.pid, dirfd, &path) +} + +/// Lexical normalization of `(pid, dirfd, path)`: +/// +/// - Absolute `path`: used as-is. +/// - Relative `path` with `dirfd == AT_FDCWD`: prefixed with the child's +/// cwd from `/proc//cwd`. +/// - Relative `path` with explicit `dirfd`: prefixed with the symlink +/// target of `/proc//fd/` (the host kernel's view of the +/// directory the dirfd points to). +/// +/// Then collapses `.`, `..`, and redundant `/` components. Returns +/// `None` if the dirfd cannot be resolved or the path walks above `/`. +fn resolve_to_normalized_absolute( + pid: u32, + dirfd: i64, + path: &str, +) -> Option { + use std::path::{Component, Path, PathBuf}; + + let joined: PathBuf = if Path::new(path).is_absolute() { + PathBuf::from(path) + } else { + let base = if dirfd as i32 == libc::AT_FDCWD { + std::fs::read_link(format!("/proc/{}/cwd", pid)).ok()? + } else { + std::fs::read_link(format!("/proc/{}/fd/{}", pid, dirfd as i32)).ok()? + }; + base.join(path) + }; + + let mut out = PathBuf::new(); + for comp in joined.components() { + match comp { + Component::Prefix(p) => out.push(p.as_os_str()), + Component::RootDir => out.push("/"), + Component::CurDir => {} + Component::ParentDir => { + // pop the last regular component; refuse to walk above + // the root (out becomes empty after popping "/"). + if !out.pop() { + return None; + } + if out.as_os_str().is_empty() { + return None; + } + } + Component::Normal(c) => out.push(c), + } + } + Some(out) +} + // ============================================================ // Deterministic directory listing // ============================================================ diff --git a/crates/sandlock-core/src/random.rs b/crates/sandlock-core/src/random.rs index 05988c1..b84f199 100644 --- a/crates/sandlock-core/src/random.rs +++ b/crates/sandlock-core/src/random.rs @@ -15,7 +15,7 @@ use std::io::{Seek, SeekFrom, Write}; use std::os::fd::RawFd; use std::os::unix::io::{AsRawFd, FromRawFd}; -use crate::seccomp::notif::{read_child_cstr, write_child_mem, NotifAction}; +use crate::seccomp::notif::{write_child_mem, NotifAction}; use crate::sys::structs::SeccompNotif; use crate::sys::syscall; @@ -57,15 +57,12 @@ pub(crate) fn handle_random_open( rng: &mut ChaCha8Rng, notif_fd: RawFd, ) -> Option { - // openat(dirfd, pathname, flags, mode): args[1] = pathname pointer - let path_ptr = notif.data.args[1]; - if path_ptr == 0 { - return None; - } - - let path = read_child_cstr(notif_fd, notif.id, notif.pid, path_ptr, 4096)?; - - if path.as_str() != "/dev/urandom" && path.as_str() != "/dev/random" { + // Resolve the open path so dirfd-relative or non-canonical spellings + // (`/dev/../dev/urandom`, `openat(open("/dev"), "urandom", ...)`) + // can't sidestep the seed and read real kernel entropy. + let resolved = crate::procfs::resolve_open_target(notif, notif_fd)?; + let path = resolved.to_str()?; + if path != "/dev/urandom" && path != "/dev/random" { return None; } diff --git a/crates/sandlock-core/src/resource.rs b/crates/sandlock-core/src/resource.rs index d5ba0de..52a635c 100644 --- a/crates/sandlock-core/src/resource.rs +++ b/crates/sandlock-core/src/resource.rs @@ -653,7 +653,7 @@ mod tests { deterministic_dirs: false, virtual_hostname: None, has_http_acl: false, - virtual_etc_hosts: None, + virtual_etc_hosts: String::new(), } } diff --git a/crates/sandlock-core/src/sandbox.rs b/crates/sandlock-core/src/sandbox.rs index 0d4ceec..0f32d4a 100644 --- a/crates/sandlock-core/src/sandbox.rs +++ b/crates/sandlock-core/src/sandbox.rs @@ -1174,7 +1174,16 @@ impl Sandbox { let resolved_net_allow = network::resolve_net_allow(&self.net_allow) .await .map_err(SandboxRuntimeError::Io)?; - let virtual_etc_hosts = resolved_net_allow.etc_hosts.clone(); + // In chroot/image mode, seed the synthetic /etc/hosts from the + // rootfs's own file so entries baked into the image (private + // registries, internal hostnames, etc.) survive virtualization. + // Without a chroot, the helper returns the fixed loopback base. + // Either way, concrete-host rules from `net_allow` are appended + // on top. + let virtual_etc_hosts = network::compose_virtual_etc_hosts( + self.chroot.as_deref(), + &resolved_net_allow.concrete_host_entries, + ); if !self.http_allow.is_empty() || !self.http_deny.is_empty() { let handle = crate::http_acl::spawn_http_acl_proxy( diff --git a/crates/sandlock-core/src/seccomp/dispatch.rs b/crates/sandlock-core/src/seccomp/dispatch.rs index adb3c02..8cf5740 100644 --- a/crates/sandlock-core/src/seccomp/dispatch.rs +++ b/crates/sandlock-core/src/seccomp/dispatch.rs @@ -152,6 +152,20 @@ pub enum HandlerError { /// The blocklist is whatever [`crate::context::blocklist_syscall_numbers`] /// resolves from Sandlock's default syscall blocklist plus policy extras. /// +/// Every open-family syscall a path-keyed handler must intercept to be +/// leak-proof: `open` (legacy), `openat`, and `openat2`. A handler that +/// only registers `openat` is bypassed by any libc that picks one of +/// the others. The corresponding BPF notif list (in `context::notif_syscalls`) +/// must register the same set so the kernel actually produces a +/// notification — otherwise `RET_ALLOW` makes the handler unreachable. +fn open_family_syscalls() -> Vec { + let mut v = vec![libc::SYS_openat, arch::SYS_OPENAT2]; + if let Some(legacy_open) = arch::SYS_OPEN { + v.push(legacy_open); + } + v +} + /// Takes only the syscall numbers because that's all it needs to check. /// Called from the `run_with_handlers` entry points before any /// handler is registered against the dispatch table. @@ -352,24 +366,29 @@ pub(crate) fn build_dispatch_table( } // ------------------------------------------------------------------ - // Deterministic random — /dev/urandom opens (openat) + // Deterministic random — /dev/urandom and /dev/random opens. + // Registered for every open-family syscall so dirfd-relative and + // legacy `open` spellings can't slip past the seed and read the + // kernel's real entropy. // ------------------------------------------------------------------ if policy.has_random_seed { - let __sup = Arc::clone(ctx); - table.register(libc::SYS_openat, move |cx: &HandlerCtx| { - let notif = cx.notif; - let sup = Arc::clone(&__sup); - let notif_fd = cx.notif_fd; - async move { - let mut tr = sup.time_random.lock().await; - if let Some(ref mut rng) = tr.random_state { - if let Some(action) = crate::random::handle_random_open(¬if, rng, notif_fd) { - return action; + for nr in open_family_syscalls() { + let __sup = Arc::clone(ctx); + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + async move { + let mut tr = sup.time_random.lock().await; + if let Some(ref mut rng) = tr.random_state { + if let Some(action) = crate::random::handle_random_open(¬if, rng, notif_fd) { + return action; + } } + NotifAction::Continue } - NotifAction::Continue - } - }); + }); + } } // ------------------------------------------------------------------ @@ -392,6 +411,38 @@ pub(crate) fn build_dispatch_table( } } + // ------------------------------------------------------------------ + // /etc/hosts virtualization: always on. The synthetic file contains + // the loopback base (or, in chroot/image mode, the image's own + // `/etc/hosts` merged with a loopback fallback) plus any concrete + // hostnames resolved from `net_allow`, so the host's on-disk + // `/etc/hosts` never leaks in and image-baked entries are preserved. + // + // Registered for every open-family syscall (see `open_family_syscalls`). + // Must run *before* the chroot handler so that in chroot mode the + // synthetic memfd wins over a direct open of `/etc/hosts` — + // the chroot handler always intercepts opens within the chroot and + // would otherwise serve the raw image file, defeating the merge. + // ------------------------------------------------------------------ + { + let etc_hosts = policy.virtual_etc_hosts.clone(); + for nr in open_family_syscalls() { + let etc_hosts = etc_hosts.clone(); + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let notif_fd = cx.notif_fd; + let etc_hosts = etc_hosts.clone(); + async move { + if let Some(action) = crate::procfs::handle_etc_hosts_open(¬if, &etc_hosts, notif_fd) { + action + } else { + NotifAction::Continue + } + } + }); + } + } + // ------------------------------------------------------------------ // Chroot path interception (before COW) // ------------------------------------------------------------------ @@ -407,13 +458,15 @@ pub(crate) fn build_dispatch_table( } // ------------------------------------------------------------------ - // /proc virtualization (always on) + // /proc virtualization (always on). The handler does both + // sensitive-path blocking and per-PID filtering, both of which are + // security boundaries, so it has to catch every open-family spelling. // ------------------------------------------------------------------ - { + for nr in open_family_syscalls() { let policy_for_proc_open = Arc::clone(policy); let resource_for_proc_open = Arc::clone(resource); let __sup = Arc::clone(ctx); - table.register(libc::SYS_openat, move |cx: &HandlerCtx| { + table.register(nr, move |cx: &HandlerCtx| { let notif = cx.notif; let sup = Arc::clone(&__sup); let notif_fd = cx.notif_fd; @@ -459,7 +512,9 @@ pub(crate) fn build_dispatch_table( } // ------------------------------------------------------------------ - // Hostname virtualization + // Hostname virtualization. The `/etc/hostname` shim is registered + // for every open-family syscall so dirfd-relative and legacy `open` + // spellings can't leak the host's real hostname. // ------------------------------------------------------------------ if let Some(ref hostname) = policy.virtual_hostname { let hostname_for_uname = hostname.clone(); @@ -472,38 +527,24 @@ pub(crate) fn build_dispatch_table( crate::procfs::handle_uname(¬if, &hostname, notif_fd) } }); - table.register(libc::SYS_openat, move |cx: &HandlerCtx| { - let notif = cx.notif; - let notif_fd = cx.notif_fd; + for nr in open_family_syscalls() { let hostname = hostname_for_open.clone(); - async move { - if let Some(action) = crate::procfs::handle_hostname_open(¬if, &hostname, notif_fd) { - action - } else { - NotifAction::Continue + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let notif_fd = cx.notif_fd; + let hostname = hostname.clone(); + async move { + if let Some(action) = crate::procfs::handle_hostname_open(¬if, &hostname, notif_fd) { + action + } else { + NotifAction::Continue + } } - } - }); + }); + } } - // ------------------------------------------------------------------ - // /etc/hosts virtualization (for concrete-host entries in net_allow) - // ------------------------------------------------------------------ - if let Some(ref etc_hosts) = policy.virtual_etc_hosts { - let etc_hosts_for_open = etc_hosts.clone(); - table.register(libc::SYS_openat, move |cx: &HandlerCtx| { - let notif = cx.notif; - let notif_fd = cx.notif_fd; - let etc_hosts = etc_hosts_for_open.clone(); - async move { - if let Some(action) = crate::procfs::handle_etc_hosts_open(¬if, &etc_hosts, notif_fd) { - action - } else { - NotifAction::Continue - } - } - }); - } + // /etc/hosts is registered above the chroot block — see the comment there. // ------------------------------------------------------------------ // Deterministic directory listing @@ -1019,7 +1060,7 @@ mod handler_tests { deterministic_dirs: false, virtual_hostname: None, has_http_acl: false, - virtual_etc_hosts: None, + virtual_etc_hosts: String::new(), }), child_pidfd: None, notif_fd: -1, diff --git a/crates/sandlock-core/src/seccomp/notif.rs b/crates/sandlock-core/src/seccomp/notif.rs index 9db0b1c..cde8604 100644 --- a/crates/sandlock-core/src/seccomp/notif.rs +++ b/crates/sandlock-core/src/seccomp/notif.rs @@ -237,10 +237,11 @@ pub struct NotifPolicy { pub deterministic_dirs: bool, pub virtual_hostname: Option, pub has_http_acl: bool, - /// Synthetic `/etc/hosts` content for `net_allow` host-rule virtualization. - /// When set, `openat("/etc/hosts")` returns a memfd with this content - /// so sandboxed processes can resolve allowed hostnames without DNS. - pub virtual_etc_hosts: Option, + /// Synthetic `/etc/hosts` served to the sandbox. Always populated: + /// `openat("/etc/hosts")` returns a memfd with this content so the + /// host's on-disk `/etc/hosts` never leaks in. The content is the + /// loopback base plus any concrete hostnames resolved from `net_allow`. + pub virtual_etc_hosts: String, } // ============================================================ diff --git a/crates/sandlock-core/tests/integration/test_chroot.rs b/crates/sandlock-core/tests/integration/test_chroot.rs index 62f58ba..7fda877 100644 --- a/crates/sandlock-core/tests/integration/test_chroot.rs +++ b/crates/sandlock-core/tests/integration/test_chroot.rs @@ -537,3 +537,92 @@ async fn test_fs_mount_read_write() { let _ = fs::remove_dir_all(&rootfs); let _ = fs::remove_dir_all(&work_dir); } + +/// When the chroot image ships its own `/etc/hosts`, the synthetic file +/// the sandbox sees should be seeded from the image's content (so any +/// private-registry / internal-service entries the image baked in +/// survive virtualization) — with loopback entries added only if the +/// image didn't already provide them. +#[tokio::test] +async fn test_chroot_etc_hosts_seeded_from_image() { + let rootfs = build_test_rootfs("etc-hosts-image-seed"); + // Put a hosts file in the image that has its own entry but is + // missing both loopback families on purpose. + fs::write( + rootfs.join("etc/hosts"), + "10.0.0.5 internal.registry.local\n", + ) + .unwrap(); + + let policy = minimal_exec_policy(&rootfs) + .fs_read("/etc") + .build() + .unwrap(); + + let result = policy.clone().with_name("test").run(&["rootfs-helper", "cat", "/etc/hosts"]).await; + match result { + Ok(r) => { + assert!( + r.success(), + "cat /etc/hosts should succeed inside chroot, stderr: {}", + r.stderr_str().unwrap_or("") + ); + let stdout = r.stdout_str().unwrap_or("").to_string(); + assert!( + stdout.contains("10.0.0.5 internal.registry.local"), + "image's /etc/hosts entry missing: {stdout}" + ); + assert!( + stdout.contains("127.0.0.1 localhost"), + "v4 loopback should be injected for the image: {stdout}" + ); + assert!( + stdout.contains("::1 localhost"), + "v6 loopback should be injected for the image: {stdout}" + ); + } + Err(e) => eprintln!("Chroot test skipped: {}", e), + } + + cleanup_rootfs(&rootfs); +} + +/// When the image already ships loopback entries, the synthesizer must +/// not duplicate them — the sandbox should see exactly the image's view +/// of localhost, not two copies of it. +#[tokio::test] +async fn test_chroot_etc_hosts_no_duplicate_loopback() { + let rootfs = build_test_rootfs("etc-hosts-no-dup"); + fs::write( + rootfs.join("etc/hosts"), + "127.0.0.1 localhost\n::1 localhost\n10.0.0.5 svc.local\n", + ) + .unwrap(); + + let policy = minimal_exec_policy(&rootfs) + .fs_read("/etc") + .build() + .unwrap(); + + let result = policy.clone().with_name("test").run(&["rootfs-helper", "cat", "/etc/hosts"]).await; + match result { + Ok(r) => { + assert!(r.success(), "cat /etc/hosts should succeed"); + let stdout = r.stdout_str().unwrap_or("").to_string(); + assert_eq!( + stdout.matches("127.0.0.1 localhost").count(), + 1, + "v4 loopback duplicated: {stdout}" + ); + assert_eq!( + stdout.matches("::1 localhost").count(), + 1, + "v6 loopback duplicated: {stdout}" + ); + assert!(stdout.contains("10.0.0.5 svc.local"), "image entry missing: {stdout}"); + } + Err(e) => eprintln!("Chroot test skipped: {}", e), + } + + cleanup_rootfs(&rootfs); +} diff --git a/crates/sandlock-core/tests/integration/test_determinism.rs b/crates/sandlock-core/tests/integration/test_determinism.rs index ae20c26..fb4b54b 100644 --- a/crates/sandlock-core/tests/integration/test_determinism.rs +++ b/crates/sandlock-core/tests/integration/test_determinism.rs @@ -222,3 +222,42 @@ async fn test_hostname_virtualization() { let stdout = String::from_utf8_lossy(result.stdout.as_deref().unwrap_or_default()); assert_eq!(stdout.trim(), "mybox", "Expected /etc/hostname 'mybox', got: {:?}", stdout.trim()); } + +/// The /etc/hostname shim used to do a literal `path == "/etc/hostname"` +/// match, so dirfd-relative and non-canonical spellings leaked the host's +/// real hostname. Exercise each bypass shape and assert the virtual +/// hostname comes back. +#[tokio::test] +async fn test_hostname_virtualization_resists_path_bypasses() { + let policy = Sandbox::builder() + .fs_read("/usr") + .fs_read("/lib") + .fs_read_if_exists("/lib64") + .fs_read("/bin") + .fs_read("/etc") + .build() + .unwrap(); + + let script = concat!( + "import os\n", + "results = {}\n", + "etcfd = os.open('/etc', os.O_DIRECTORY | os.O_RDONLY)\n", + "fd = os.open('hostname', os.O_RDONLY, dir_fd=etcfd)\n", + "results['dirfd'] = os.read(fd, 4096).decode().strip()\n", + "os.close(fd); os.close(etcfd)\n", + "results['dotdot'] = open('/etc/../etc/hostname').read().strip()\n", + "results['curdir'] = open('/etc/./hostname').read().strip()\n", + "results['slash2'] = open('//etc/hostname').read().strip()\n", + "print(results)\n", + ); + + let result = policy.clone().with_name("mybox").run(&["python3", "-c", script]).await.unwrap(); + let stdout = String::from_utf8_lossy(result.stdout.as_deref().unwrap_or_default()); + for label in ["dirfd", "dotdot", "curdir", "slash2"] { + let needle = format!("'{label}': 'mybox'"); + assert!( + stdout.contains(&needle), + "{label}: host /etc/hostname leaked. stdout: {stdout}" + ); + } +} diff --git a/crates/sandlock-core/tests/integration/test_netlink_virt.rs b/crates/sandlock-core/tests/integration/test_netlink_virt.rs index 0fb41fb..1cfed8b 100644 --- a/crates/sandlock-core/tests/integration/test_netlink_virt.rs +++ b/crates/sandlock-core/tests/integration/test_netlink_virt.rs @@ -66,15 +66,25 @@ async fn loopback_bind_succeeds() { /// Exercises `RTM_GETADDR` via glibc's `__check_pf`. With `AI_ADDRCONFIG`, /// glibc opens a NETLINK_ROUTE socket and dumps addresses to decide which /// families (v4/v6) the host supports. Our synthesized dump advertises -/// both 127.0.0.1 and ::1, so getaddrinfo must return entries for both -/// families for `localhost`. +/// both 127.0.0.1 and ::1, so AI_ADDRCONFIG must accept both families. +/// +/// We use `getaddrinfo(None, port, AI_PASSIVE | AI_ADDRCONFIG)` instead of +/// looking up "localhost". glibc fast-paths "localhost" through a +/// hard-coded check that uses `__check_pf` directly and filters out v6 +/// when no non-loopback IPv6 address is configured (loopback-only is +/// exactly what our sandbox shows), so a localhost lookup never returns +/// v6 inside the sandbox regardless of `/etc/hosts`. AI_PASSIVE with a +/// null node name returns the wildcard address for every family that +/// `__check_pf` says is configured, so it exercises the netlink dump +/// path directly. #[tokio::test] async fn getaddrinfo_ai_addrconfig_returns_v4_and_v6() { let out = temp_out("getaddrinfo"); let script = format!(concat!( "import socket\n", "fams = sorted({{i[0].name for i in socket.getaddrinfo(", - "'localhost', 443, type=socket.SOCK_STREAM, flags=socket.AI_ADDRCONFIG)}})\n", + "None, 443, type=socket.SOCK_STREAM, ", + "flags=socket.AI_PASSIVE | socket.AI_ADDRCONFIG)}})\n", "open('{out}', 'w').write(','.join(fams))\n", ), out = out.display()); @@ -87,7 +97,7 @@ async fn getaddrinfo_ai_addrconfig_returns_v4_and_v6() { assert_eq!( contents.trim(), "AF_INET,AF_INET6", - "AI_ADDRCONFIG should return both families for localhost, got: {}", + "AI_ADDRCONFIG should consider both families configured, got: {}", contents ); assert!(result.success()); @@ -329,3 +339,74 @@ async fn non_route_netlink_still_blocked() { ); assert!(result.success()); } + +/// `/etc/hosts` is always virtualized, independent of whatever the host's +/// on-disk file says: the sandbox sees a fixed loopback-only view. The +/// loopback base (`127.0.0.1 localhost` / `::1 localhost`) is always +/// present, and concrete-host entries from `net_allow` get appended to +/// the same synthetic file. +#[tokio::test] +async fn etc_hosts_virtualized_with_loopback_base() { + let out = temp_out("etc-hosts"); + let script = format!( + "open('{out}', 'w').write(open('/etc/hosts').read())\n", + out = out.display(), + ); + + let policy = base_policy().build().unwrap(); + let result = policy.clone().with_name("test").run_interactive(&["python3", "-c", &script]) + .await.unwrap(); + + let contents = std::fs::read_to_string(&out).unwrap_or_default(); + let _ = std::fs::remove_file(&out); + // With no `net_allow` rules the sandbox sees exactly the loopback + // base; any deviation means the on-disk `/etc/hosts` leaked through. + assert_eq!( + contents, + "127.0.0.1 localhost\n::1 localhost\n", + "virtual /etc/hosts content mismatch" + ); + assert!(result.success()); +} + +/// The literal-path match used to be the only check, so any spelling +/// other than `"/etc/hosts"` reached the host's on-disk file. Hit each +/// known bypass and assert we see the synthetic content instead. +#[tokio::test] +async fn etc_hosts_virtualization_resists_path_bypasses() { + let out = temp_out("etc-hosts-bypass"); + // Python's builtin `open` goes through libc → `openat(AT_FDCWD, ...)`, + // so we cover the dirfd-relative case via os.open + os.openat-style + // file-descriptor reuse, and the non-canonical case directly. + let script = format!(concat!( + "import os\n", + "results = {{}}\n", + // 1. Dirfd-relative: open /etc, then read 'hosts' relative to it. + "etcfd = os.open('/etc', os.O_DIRECTORY | os.O_RDONLY)\n", + "fd = os.open('hosts', os.O_RDONLY, dir_fd=etcfd)\n", + "results['dirfd_relative'] = os.read(fd, 4096).decode()\n", + "os.close(fd); os.close(etcfd)\n", + // 2. Non-canonical absolute via redundant components. + "results['dotdot'] = open('/etc/../etc/hosts').read()\n", + "results['slash2'] = open('//etc/hosts').read()\n", + "results['curdir'] = open('/etc/./hosts').read()\n", + "open('{out}', 'w').write(repr(results))\n", + ), out = out.display()); + + let policy = base_policy().build().unwrap(); + let result = policy.clone().with_name("test").run_interactive(&["python3", "-c", &script]) + .await.unwrap(); + + let contents = std::fs::read_to_string(&out).unwrap_or_default(); + let _ = std::fs::remove_file(&out); + // Every spelling must hit the synthetic memfd, not the host file. + let expected = "127.0.0.1 localhost\\n::1 localhost\\n"; + for label in ["dirfd_relative", "dotdot", "slash2", "curdir"] { + let needle = format!("'{label}': '{expected}'"); + assert!( + contents.contains(&needle), + "{label}: host /etc/hosts leaked. got: {contents}" + ); + } + assert!(result.success()); +} diff --git a/crates/sandlock-core/tests/integration/test_procfs.rs b/crates/sandlock-core/tests/integration/test_procfs.rs index e0e81e8..1997777 100644 --- a/crates/sandlock-core/tests/integration/test_procfs.rs +++ b/crates/sandlock-core/tests/integration/test_procfs.rs @@ -72,6 +72,101 @@ async fn test_sensitive_proc_blocked() { assert!(!result.success(), "/proc/kcore should be denied"); } +/// The sensitive-path deny used to do a literal `path == "/proc/kcore"` +/// (and `starts_with("/proc/kcore/")`) match, which any non-canonical or +/// dirfd-relative spelling sidestepped. Exercise each known bypass shape +/// and assert the deny still fires. +#[tokio::test] +async fn test_sensitive_proc_resists_bypasses() { + let policy = Sandbox::builder() + .fs_read("/usr") + .fs_read("/lib") + .fs_read_if_exists("/lib64") + .fs_read("/bin") + .fs_read("/etc") + .fs_read("/proc") + .num_cpus(1) + .build() + .unwrap(); + + // EACCES (errno 13) is what the handler returns for sensitive paths. + // Each branch prints OK if the open was denied, FAIL otherwise. + let script = concat!( + "import os, errno\n", + "results = []\n", + "def must_deny(label, fn):\n", + " try:\n", + " fd = fn()\n", + " os.close(fd)\n", + " results.append(f'{label}:LEAKED')\n", + " except OSError as e:\n", + " results.append(f'{label}:DENIED' if e.errno == errno.EACCES else f'{label}:errno={e.errno}')\n", + // 1. dirfd-relative: open(/proc), then open 'kcore' relative to it + "procfd = os.open('/proc', os.O_DIRECTORY | os.O_RDONLY)\n", + "must_deny('dirfd', lambda: os.open('kcore', os.O_RDONLY, dir_fd=procfd))\n", + "os.close(procfd)\n", + // 2. non-canonical absolutes + "must_deny('dotdot', lambda: os.open('/proc/../proc/kcore', os.O_RDONLY))\n", + "must_deny('curdir', lambda: os.open('/proc/./kcore', os.O_RDONLY))\n", + "must_deny('slash2', lambda: os.open('//proc/kcore', os.O_RDONLY))\n", + "print('|'.join(results))\n", + ); + + let result = policy.clone().with_name("test").run(&["python3", "-c", script]).await.unwrap(); + let stdout = String::from_utf8_lossy(result.stdout.as_deref().unwrap_or_default()); + for label in ["dirfd", "dotdot", "curdir", "slash2"] { + let needle = format!("{label}:DENIED"); + assert!( + stdout.contains(&needle), + "{label}: /proc/kcore leaked via this spelling. stdout: {stdout}" + ); + } +} + +/// The /proc/cpuinfo virtualization used to do a literal +/// `path == "/proc/cpuinfo"` match, so non-canonical and dirfd-relative +/// spellings fell through to the host's real cpuinfo and leaked the host's +/// real CPU count. +#[tokio::test] +async fn test_proc_virt_resists_bypasses() { + let policy = Sandbox::builder() + .fs_read("/usr") + .fs_read("/lib") + .fs_read_if_exists("/lib64") + .fs_read("/bin") + .fs_read("/etc") + .fs_read("/proc") + .num_cpus(2) + .build() + .unwrap(); + + // Every spelling must see exactly 2 `^processor` lines, matching the + // synthetic cpuinfo. A leak to the host file would show this host's + // real CPU count (almost certainly != 2). + let script = concat!( + "import os\n", + "results = {}\n", + "procfd = os.open('/proc', os.O_DIRECTORY | os.O_RDONLY)\n", + "fd = os.open('cpuinfo', os.O_RDONLY, dir_fd=procfd)\n", + "results['dirfd'] = os.read(fd, 4096).decode().count('processor\\t')\n", + "os.close(fd); os.close(procfd)\n", + "results['dotdot'] = open('/proc/../proc/cpuinfo').read().count('processor\\t')\n", + "results['curdir'] = open('/proc/./cpuinfo').read().count('processor\\t')\n", + "results['slash2'] = open('//proc/cpuinfo').read().count('processor\\t')\n", + "print(results)\n", + ); + + let result = policy.clone().with_name("test").run(&["python3", "-c", script]).await.unwrap(); + let stdout = String::from_utf8_lossy(result.stdout.as_deref().unwrap_or_default()); + for label in ["dirfd", "dotdot", "curdir", "slash2"] { + let needle = format!("'{label}': 2"); + assert!( + stdout.contains(&needle), + "{label}: host cpuinfo leaked (expected 2 processors, virtualized). stdout: {stdout}" + ); + } +} + /// Test basic sandbox still works without /proc virtualization. #[tokio::test] async fn test_no_proc_virt_still_works() {