From 327844507d10c9bf7c332937b79b741998e4231d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2026 19:54:58 -0700 Subject: [PATCH 1/4] network: always virtualize /etc/hosts so it's independent of the host's file resolve_net_allow used to emit `etc_hosts: Some(...)` only when at least one `net_allow` rule named a concrete hostname; otherwise the seccomp openat shim was not registered and `openat("/etc/hosts")` fell through to the host's on-disk file. That broke the "loopback-only view" the netlink and procfs virtualization already promise, and made sandbox behavior depend on whichever distro's `/etc/hosts` happened to be installed (Ubuntu 20.04 parks `::1` under `ip6-localhost`, not `localhost`, so AI_ADDRCONFIG localhost lookups silently dropped v6). Make `etc_hosts` a plain `String` that always carries the loopback base (`127.0.0.1 localhost` / `::1 localhost`), with `net_allow` host entries appended on top, and register the openat shim unconditionally. Add a direct test that reads `/etc/hosts` from inside the sandbox and asserts the exact synthetic content. Signed-off-by: Cong Wang --- crates/sandlock-core/src/network.rs | 40 +++++++++------- crates/sandlock-core/src/procfs.rs | 8 ++-- crates/sandlock-core/src/resource.rs | 2 +- crates/sandlock-core/src/seccomp/dispatch.rs | 10 ++-- crates/sandlock-core/src/seccomp/notif.rs | 9 ++-- .../tests/integration/test_netlink_virt.rs | 47 +++++++++++++++++-- 6 files changed, 81 insertions(+), 35 deletions(-) diff --git a/crates/sandlock-core/src/network.rs b/crates/sandlock-core/src/network.rs index c0bdaa6..76bb669 100644 --- a/crates/sandlock-core/src/network.rs +++ b/crates/sandlock-core/src/network.rs @@ -1037,9 +1037,11 @@ pub struct ResolvedNetAllowSet { pub tcp: ResolvedNetAllow, pub udp: ResolvedNetAllow, pub icmp: ResolvedNetAllow, - /// Synthetic `/etc/hosts` content combining every concrete host - /// across all protocols. `None` when no concrete hostnames appear. - pub etc_hosts: Option, + /// Synthetic `/etc/hosts` content. Always includes the loopback base + /// (`127.0.0.1 localhost` / `::1 localhost`) plus an entry per concrete + /// hostname appearing in any rule; the host's on-disk `/etc/hosts` never + /// leaks into the sandbox. + pub etc_hosts: String, } /// Resolve `--net-allow` rules into per-protocol runtime allowlists. @@ -1053,10 +1055,11 @@ pub struct ResolvedNetAllowSet { pub async fn resolve_net_allow( rules: &[NetAllow], ) -> io::Result { - // Single shared etc_hosts for all protocols. Every concrete host - // (regardless of protocol) ends up resolvable in the sandbox. + // Single shared etc_hosts for all protocols. Always present so the + // sandbox sees a fixed loopback-only view independent of the host's + // on-disk `/etc/hosts`; concrete-hostname rules append their resolved + // entries on top. let mut etc_hosts = String::from("127.0.0.1 localhost\n::1 localhost\n"); - let mut has_concrete_host = false; let per_proto = |target: Protocol| async move { let mut per_ip: HashMap> = HashMap::new(); @@ -1064,7 +1067,6 @@ pub async fn resolve_net_allow( let mut any_ip_ports: HashSet = HashSet::new(); let mut any_ip_all_ports = false; let mut local_etc_hosts = String::new(); - let mut local_has_concrete = false; for rule in rules.iter().filter(|r| r.protocol == target) { match &rule.host { @@ -1080,7 +1082,6 @@ pub async fn resolve_net_allow( } } Some(host) => { - local_has_concrete = true; let addr = format!("{}:0", host); let resolved = tokio::net::lookup_host(addr.as_str()).await.map_err(|e| { io::Error::new( @@ -1113,24 +1114,22 @@ pub async fn resolve_net_allow( any_ip_all_ports, }, local_etc_hosts, - local_has_concrete, )) }; - let (tcp, tcp_eh, tcp_concrete) = per_proto(Protocol::Tcp).await?; - let (udp, udp_eh, udp_concrete) = per_proto(Protocol::Udp).await?; - let (icmp, icmp_eh, icmp_concrete) = per_proto(Protocol::Icmp).await?; + let (tcp, tcp_eh) = per_proto(Protocol::Tcp).await?; + let (udp, udp_eh) = per_proto(Protocol::Udp).await?; + let (icmp, icmp_eh) = per_proto(Protocol::Icmp).await?; for chunk in [tcp_eh, udp_eh, icmp_eh] { etc_hosts.push_str(&chunk); } - has_concrete_host |= tcp_concrete || udp_concrete || icmp_concrete; Ok(ResolvedNetAllowSet { tcp, udp, icmp, - etc_hosts: if has_concrete_host { Some(etc_hosts) } else { None }, + etc_hosts, }) } @@ -1318,7 +1317,10 @@ mod tests { assert!(resolved.tcp.any_ip_ports.is_empty()); assert!(resolved.udp.per_ip.is_empty()); assert!(resolved.icmp.per_ip.is_empty()); - assert!(resolved.etc_hosts.is_none()); + // Loopback base is always emitted, even without rules, so the + // sandbox's `/etc/hosts` never falls through to the host's copy. + assert!(resolved.etc_hosts.contains("127.0.0.1 localhost")); + assert!(resolved.etc_hosts.contains("::1 localhost")); } #[tokio::test] @@ -1339,7 +1341,8 @@ mod tests { } assert!(resolved.udp.per_ip.is_empty()); assert!(resolved.icmp.per_ip.is_empty()); - assert!(resolved.etc_hosts.as_deref().unwrap_or("").contains("localhost")); + // Concrete-host entry (` localhost`) appended on top of the base. + assert!(resolved.etc_hosts.contains("127.0.0.1 localhost")); } #[tokio::test] @@ -1354,7 +1357,8 @@ mod tests { assert!(resolved.tcp.per_ip.is_empty()); assert!(resolved.tcp.any_ip_ports.contains(&8080)); assert!(!resolved.tcp.any_ip_all_ports); - assert!(resolved.etc_hosts.is_none()); + // Even an any-IP rule still gets the loopback base content. + assert!(resolved.etc_hosts.contains("127.0.0.1 localhost")); } #[tokio::test] @@ -1394,7 +1398,7 @@ mod tests { for ip in resolved.tcp.per_ip_all_ports.iter() { assert!(resolved.tcp.per_ip.contains_key(ip)); } - assert!(resolved.etc_hosts.is_some()); + assert!(resolved.etc_hosts.contains("localhost")); } #[tokio::test] diff --git a/crates/sandlock-core/src/procfs.rs b/crates/sandlock-core/src/procfs.rs index 9907aa5..70b48fa 100644 --- a/crates/sandlock-core/src/procfs.rs +++ b/crates/sandlock-core/src/procfs.rs @@ -605,10 +605,10 @@ pub(crate) fn handle_hostname_open( /// Intercept `openat("/etc/hosts")` and return a memfd with virtual content. /// -/// When `net_allow` contains concrete hostnames, the supervisor pre-resolves -/// them and builds a synthetic `/etc/hosts`. This lets sandboxed processes -/// resolve allowed hostnames via glibc's `files` NSS backend without contacting -/// a DNS server. +/// Every sandbox gets a fixed loopback view (`127.0.0.1 localhost` / +/// `::1 localhost`) plus any concrete hostnames pre-resolved from +/// `net_allow`, so the host's on-disk `/etc/hosts` never leaks in and +/// glibc's `files` NSS backend resolves allowed hostnames without DNS. pub(crate) fn handle_etc_hosts_open( notif: &SeccompNotif, etc_hosts_content: &str, diff --git a/crates/sandlock-core/src/resource.rs b/crates/sandlock-core/src/resource.rs index d5ba0de..52a635c 100644 --- a/crates/sandlock-core/src/resource.rs +++ b/crates/sandlock-core/src/resource.rs @@ -653,7 +653,7 @@ mod tests { deterministic_dirs: false, virtual_hostname: None, has_http_acl: false, - virtual_etc_hosts: None, + virtual_etc_hosts: String::new(), } } diff --git a/crates/sandlock-core/src/seccomp/dispatch.rs b/crates/sandlock-core/src/seccomp/dispatch.rs index adb3c02..e288ea0 100644 --- a/crates/sandlock-core/src/seccomp/dispatch.rs +++ b/crates/sandlock-core/src/seccomp/dispatch.rs @@ -487,10 +487,12 @@ pub(crate) fn build_dispatch_table( } // ------------------------------------------------------------------ - // /etc/hosts virtualization (for concrete-host entries in net_allow) + // /etc/hosts virtualization: always on. The synthetic file contains + // the loopback base plus any concrete hostnames resolved from + // `net_allow`, so the host's on-disk `/etc/hosts` never leaks in. // ------------------------------------------------------------------ - if let Some(ref etc_hosts) = policy.virtual_etc_hosts { - let etc_hosts_for_open = etc_hosts.clone(); + { + let etc_hosts_for_open = policy.virtual_etc_hosts.clone(); table.register(libc::SYS_openat, move |cx: &HandlerCtx| { let notif = cx.notif; let notif_fd = cx.notif_fd; @@ -1019,7 +1021,7 @@ mod handler_tests { deterministic_dirs: false, virtual_hostname: None, has_http_acl: false, - virtual_etc_hosts: None, + virtual_etc_hosts: String::new(), }), child_pidfd: None, notif_fd: -1, diff --git a/crates/sandlock-core/src/seccomp/notif.rs b/crates/sandlock-core/src/seccomp/notif.rs index 9db0b1c..cde8604 100644 --- a/crates/sandlock-core/src/seccomp/notif.rs +++ b/crates/sandlock-core/src/seccomp/notif.rs @@ -237,10 +237,11 @@ pub struct NotifPolicy { pub deterministic_dirs: bool, pub virtual_hostname: Option, pub has_http_acl: bool, - /// Synthetic `/etc/hosts` content for `net_allow` host-rule virtualization. - /// When set, `openat("/etc/hosts")` returns a memfd with this content - /// so sandboxed processes can resolve allowed hostnames without DNS. - pub virtual_etc_hosts: Option, + /// Synthetic `/etc/hosts` served to the sandbox. Always populated: + /// `openat("/etc/hosts")` returns a memfd with this content so the + /// host's on-disk `/etc/hosts` never leaks in. The content is the + /// loopback base plus any concrete hostnames resolved from `net_allow`. + pub virtual_etc_hosts: String, } // ============================================================ diff --git a/crates/sandlock-core/tests/integration/test_netlink_virt.rs b/crates/sandlock-core/tests/integration/test_netlink_virt.rs index 0fb41fb..d223632 100644 --- a/crates/sandlock-core/tests/integration/test_netlink_virt.rs +++ b/crates/sandlock-core/tests/integration/test_netlink_virt.rs @@ -66,15 +66,25 @@ async fn loopback_bind_succeeds() { /// Exercises `RTM_GETADDR` via glibc's `__check_pf`. With `AI_ADDRCONFIG`, /// glibc opens a NETLINK_ROUTE socket and dumps addresses to decide which /// families (v4/v6) the host supports. Our synthesized dump advertises -/// both 127.0.0.1 and ::1, so getaddrinfo must return entries for both -/// families for `localhost`. +/// both 127.0.0.1 and ::1, so AI_ADDRCONFIG must accept both families. +/// +/// We use `getaddrinfo(None, port, AI_PASSIVE | AI_ADDRCONFIG)` instead of +/// looking up "localhost". glibc fast-paths "localhost" through a +/// hard-coded check that uses `__check_pf` directly and filters out v6 +/// when no non-loopback IPv6 address is configured (loopback-only is +/// exactly what our sandbox shows), so a localhost lookup never returns +/// v6 inside the sandbox regardless of `/etc/hosts`. AI_PASSIVE with a +/// null node name returns the wildcard address for every family that +/// `__check_pf` says is configured, so it exercises the netlink dump +/// path directly. #[tokio::test] async fn getaddrinfo_ai_addrconfig_returns_v4_and_v6() { let out = temp_out("getaddrinfo"); let script = format!(concat!( "import socket\n", "fams = sorted({{i[0].name for i in socket.getaddrinfo(", - "'localhost', 443, type=socket.SOCK_STREAM, flags=socket.AI_ADDRCONFIG)}})\n", + "None, 443, type=socket.SOCK_STREAM, ", + "flags=socket.AI_PASSIVE | socket.AI_ADDRCONFIG)}})\n", "open('{out}', 'w').write(','.join(fams))\n", ), out = out.display()); @@ -87,7 +97,7 @@ async fn getaddrinfo_ai_addrconfig_returns_v4_and_v6() { assert_eq!( contents.trim(), "AF_INET,AF_INET6", - "AI_ADDRCONFIG should return both families for localhost, got: {}", + "AI_ADDRCONFIG should consider both families configured, got: {}", contents ); assert!(result.success()); @@ -329,3 +339,32 @@ async fn non_route_netlink_still_blocked() { ); assert!(result.success()); } + +/// `/etc/hosts` is always virtualized, independent of whatever the host's +/// on-disk file says: the sandbox sees a fixed loopback-only view. The +/// loopback base (`127.0.0.1 localhost` / `::1 localhost`) is always +/// present, and concrete-host entries from `net_allow` get appended to +/// the same synthetic file. +#[tokio::test] +async fn etc_hosts_virtualized_with_loopback_base() { + let out = temp_out("etc-hosts"); + let script = format!( + "open('{out}', 'w').write(open('/etc/hosts').read())\n", + out = out.display(), + ); + + let policy = base_policy().build().unwrap(); + let result = policy.clone().with_name("test").run_interactive(&["python3", "-c", &script]) + .await.unwrap(); + + let contents = std::fs::read_to_string(&out).unwrap_or_default(); + let _ = std::fs::remove_file(&out); + // With no `net_allow` rules the sandbox sees exactly the loopback + // base; any deviation means the on-disk `/etc/hosts` leaked through. + assert_eq!( + contents, + "127.0.0.1 localhost\n::1 localhost\n", + "virtual /etc/hosts content mismatch" + ); + assert!(result.success()); +} From 0fc09b48553ca73390588898997657278d36bd3e Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2026 22:19:18 -0700 Subject: [PATCH 2/4] network: catch all spellings of /etc/hosts open, not just the literal The handler installed in 3278445 only matched `openat(_, "/etc/hosts", ...)`, leaving four bypasses that read the host's real on-disk file and undid the commit's no-leak guarantee: legacy `open(path, ...)` (not registered at all), `openat2(...)` (a distinct syscall number used by some Go and hand-rolled libcs), dirfd-relative spellings like `openat(open("/etc"), "hosts", ...)`, and non-canonical absolutes like `/etc/../etc/hosts`, `//etc/hosts`, or `/etc/./hosts`. Register the shim for `open`, `openat`, and `openat2`, and rewrite `handle_etc_hosts_open` to dispatch on `notif.data.nr` so each ABI hits the right argument slots. Resolve the (dirfd, pathname) pair to an absolute path via `/proc//cwd` or `/proc//fd/` and lexically normalize it (collapse `.`, `..`, redundant `/`) before comparing to `/etc/hosts`. The BPF notif list now carries `open` and `openat2` unconditionally so callers picking those syscalls actually generate a notification instead of slipping through ALLOW. Add a regression test that exercises each bypass shape (dirfd-relative + three non-canonical spellings) and asserts the synthetic content comes back every time. Signed-off-by: Cong Wang --- crates/sandlock-core/src/arch.rs | 2 + crates/sandlock-core/src/context.rs | 12 ++- crates/sandlock-core/src/procfs.rs | 81 ++++++++++++++++++- crates/sandlock-core/src/seccomp/dispatch.rs | 37 ++++++--- .../tests/integration/test_netlink_virt.rs | 42 ++++++++++ 5 files changed, 157 insertions(+), 17 deletions(-) diff --git a/crates/sandlock-core/src/arch.rs b/crates/sandlock-core/src/arch.rs index 89969e4..c18492e 100644 --- a/crates/sandlock-core/src/arch.rs +++ b/crates/sandlock-core/src/arch.rs @@ -8,6 +8,7 @@ mod imp { pub const SYS_MEMFD_CREATE: i64 = 319; pub const SYS_PIDFD_OPEN: i64 = 434; pub const SYS_PIDFD_GETFD: i64 = 438; + pub const SYS_OPENAT2: i64 = 437; pub const SYS_OPEN: Option = Some(libc::SYS_open); pub const SYS_STAT: Option = Some(libc::SYS_stat); @@ -51,6 +52,7 @@ mod imp { pub const SYS_MEMFD_CREATE: i64 = 279; pub const SYS_PIDFD_OPEN: i64 = 434; pub const SYS_PIDFD_GETFD: i64 = 438; + pub const SYS_OPENAT2: i64 = 437; pub const SYS_OPEN: Option = None; pub const SYS_STAT: Option = None; diff --git a/crates/sandlock-core/src/context.rs b/crates/sandlock-core/src/context.rs index c03282d..a623fa4 100644 --- a/crates/sandlock-core/src/context.rs +++ b/crates/sandlock-core/src/context.rs @@ -308,8 +308,18 @@ pub fn notif_syscalls(policy: &Sandbox, sandbox_name: Option<&str>) -> Vec nrs.push(libc::SYS_openat as u32); } - // /proc virtualization (always on: PID filtering, sensitive path blocking) + // /proc virtualization + /etc/hosts virtualization (always on). + // + // `openat` carries the simple `(AT_FDCWD, "/proc/...")` and + // `(AT_FDCWD, "/etc/hosts")` spellings; `openat2` is the same shape + // on newer libcs; legacy `open(path, ...)` is the same path without a + // dirfd. The handlers normalize all three into a single absolute path + // check, so we have to put every variant on the notif list — otherwise + // a caller that picks `open` or `openat2` slips past virtualization + // and reads the real on-disk file. nrs.push(libc::SYS_openat as u32); + nrs.push(arch::SYS_OPENAT2 as u32); + arch::push_optional_syscall(&mut nrs, arch::SYS_OPEN); nrs.push(libc::SYS_getdents64 as u32); arch::push_optional_syscall(&mut nrs, arch::SYS_GETDENTS); diff --git a/crates/sandlock-core/src/procfs.rs b/crates/sandlock-core/src/procfs.rs index 70b48fa..d7dd568 100644 --- a/crates/sandlock-core/src/procfs.rs +++ b/crates/sandlock-core/src/procfs.rs @@ -603,27 +603,100 @@ pub(crate) fn handle_hostname_open( Some(inject_memfd(content.as_bytes())) } -/// Intercept `openat("/etc/hosts")` and return a memfd with virtual content. +/// Intercept any `open`/`openat`/`openat2` of `/etc/hosts` and return a memfd +/// with virtual content. /// /// Every sandbox gets a fixed loopback view (`127.0.0.1 localhost` / /// `::1 localhost`) plus any concrete hostnames pre-resolved from /// `net_allow`, so the host's on-disk `/etc/hosts` never leaks in and /// glibc's `files` NSS backend resolves allowed hostnames without DNS. +/// +/// Coverage notes (the simple literal-path match used to miss these): +/// - Legacy `open(path, ...)`: dispatched by syscall number; no `dirfd`. +/// - `openat2(dirfd, path, struct open_how *, size)`: dispatched the same +/// way as `openat`; the trailing `open_how` argument is irrelevant for +/// the file-name match. +/// - Relative paths via `dirfd` (e.g. `openat(open("/etc"), "hosts")`): +/// the `dirfd` is resolved via `/proc//fd/`. +/// - Non-canonical absolutes (`/etc/../etc/hosts`, `//etc/hosts`, +/// `/etc/./hosts`): collapsed by lexical normalization before comparison. pub(crate) fn handle_etc_hosts_open( notif: &SeccompNotif, etc_hosts_content: &str, notif_fd: RawFd, ) -> Option { - let path_ptr = notif.data.args[1]; - let path = read_path(notif, path_ptr, notif_fd)?; + let nr = notif.data.nr as i64; + let (dirfd, path_ptr): (i64, u64) = if Some(nr) == crate::arch::SYS_OPEN { + // open(path, flags, mode) — no dirfd, behaves as AT_FDCWD. + (libc::AT_FDCWD as i64, notif.data.args[0]) + } else if nr == libc::SYS_openat || nr == crate::arch::SYS_OPENAT2 { + // openat(dirfd, path, ...) and openat2(dirfd, path, ...) share + // the same first two argument slots. + (notif.data.args[0] as i64, notif.data.args[1]) + } else { + return None; + }; - if path != "/etc/hosts" { + let path = read_path(notif, path_ptr, notif_fd)?; + let resolved = resolve_to_normalized_absolute(notif.pid, dirfd, &path)?; + if resolved != std::path::Path::new("/etc/hosts") { return None; } Some(inject_memfd(etc_hosts_content.as_bytes())) } +/// Resolve `(pid, dirfd, path)` to a lexically-normalized absolute path. +/// +/// - Absolute `path`: used as-is. +/// - Relative `path` with `dirfd == AT_FDCWD`: prefixed with the child's +/// cwd from `/proc//cwd`. +/// - Relative `path` with explicit `dirfd`: prefixed with the symlink +/// target of `/proc//fd/` (the host kernel's view of the +/// directory the dirfd points to). +/// +/// Returns `None` if the dirfd cannot be resolved, or if the path walks +/// above the root via `..`. +fn resolve_to_normalized_absolute( + pid: u32, + dirfd: i64, + path: &str, +) -> Option { + use std::path::{Component, Path, PathBuf}; + + let joined: PathBuf = if Path::new(path).is_absolute() { + PathBuf::from(path) + } else { + let base = if dirfd as i32 == libc::AT_FDCWD { + std::fs::read_link(format!("/proc/{}/cwd", pid)).ok()? + } else { + std::fs::read_link(format!("/proc/{}/fd/{}", pid, dirfd as i32)).ok()? + }; + base.join(path) + }; + + let mut out = PathBuf::new(); + for comp in joined.components() { + match comp { + Component::Prefix(p) => out.push(p.as_os_str()), + Component::RootDir => out.push("/"), + Component::CurDir => {} + Component::ParentDir => { + // pop the last regular component; refuse to walk above + // the root (out becomes empty after popping "/"). + if !out.pop() { + return None; + } + if out.as_os_str().is_empty() { + return None; + } + } + Component::Normal(c) => out.push(c), + } + } + Some(out) +} + // ============================================================ // Deterministic directory listing // ============================================================ diff --git a/crates/sandlock-core/src/seccomp/dispatch.rs b/crates/sandlock-core/src/seccomp/dispatch.rs index e288ea0..acf1249 100644 --- a/crates/sandlock-core/src/seccomp/dispatch.rs +++ b/crates/sandlock-core/src/seccomp/dispatch.rs @@ -490,21 +490,34 @@ pub(crate) fn build_dispatch_table( // /etc/hosts virtualization: always on. The synthetic file contains // the loopback base plus any concrete hostnames resolved from // `net_allow`, so the host's on-disk `/etc/hosts` never leaks in. + // + // Registered for `open`, `openat`, and `openat2` so the simple literal + // spelling, dirfd-relative spelling, and openat2 callers (some Go and + // hand-rolled runtimes) all funnel through the same path normalization + // in `handle_etc_hosts_open`. The handler itself reads + // `notif.data.nr` to extract the right argument slots. // ------------------------------------------------------------------ { - let etc_hosts_for_open = policy.virtual_etc_hosts.clone(); - table.register(libc::SYS_openat, move |cx: &HandlerCtx| { - let notif = cx.notif; - let notif_fd = cx.notif_fd; - let etc_hosts = etc_hosts_for_open.clone(); - async move { - if let Some(action) = crate::procfs::handle_etc_hosts_open(¬if, &etc_hosts, notif_fd) { - action - } else { - NotifAction::Continue + let etc_hosts = policy.virtual_etc_hosts.clone(); + let mut open_syscalls: Vec = vec![libc::SYS_openat, arch::SYS_OPENAT2]; + if let Some(legacy_open) = arch::SYS_OPEN { + open_syscalls.push(legacy_open); + } + for nr in open_syscalls { + let etc_hosts = etc_hosts.clone(); + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let notif_fd = cx.notif_fd; + let etc_hosts = etc_hosts.clone(); + async move { + if let Some(action) = crate::procfs::handle_etc_hosts_open(¬if, &etc_hosts, notif_fd) { + action + } else { + NotifAction::Continue + } } - } - }); + }); + } } // ------------------------------------------------------------------ diff --git a/crates/sandlock-core/tests/integration/test_netlink_virt.rs b/crates/sandlock-core/tests/integration/test_netlink_virt.rs index d223632..1cfed8b 100644 --- a/crates/sandlock-core/tests/integration/test_netlink_virt.rs +++ b/crates/sandlock-core/tests/integration/test_netlink_virt.rs @@ -368,3 +368,45 @@ async fn etc_hosts_virtualized_with_loopback_base() { ); assert!(result.success()); } + +/// The literal-path match used to be the only check, so any spelling +/// other than `"/etc/hosts"` reached the host's on-disk file. Hit each +/// known bypass and assert we see the synthetic content instead. +#[tokio::test] +async fn etc_hosts_virtualization_resists_path_bypasses() { + let out = temp_out("etc-hosts-bypass"); + // Python's builtin `open` goes through libc → `openat(AT_FDCWD, ...)`, + // so we cover the dirfd-relative case via os.open + os.openat-style + // file-descriptor reuse, and the non-canonical case directly. + let script = format!(concat!( + "import os\n", + "results = {{}}\n", + // 1. Dirfd-relative: open /etc, then read 'hosts' relative to it. + "etcfd = os.open('/etc', os.O_DIRECTORY | os.O_RDONLY)\n", + "fd = os.open('hosts', os.O_RDONLY, dir_fd=etcfd)\n", + "results['dirfd_relative'] = os.read(fd, 4096).decode()\n", + "os.close(fd); os.close(etcfd)\n", + // 2. Non-canonical absolute via redundant components. + "results['dotdot'] = open('/etc/../etc/hosts').read()\n", + "results['slash2'] = open('//etc/hosts').read()\n", + "results['curdir'] = open('/etc/./hosts').read()\n", + "open('{out}', 'w').write(repr(results))\n", + ), out = out.display()); + + let policy = base_policy().build().unwrap(); + let result = policy.clone().with_name("test").run_interactive(&["python3", "-c", &script]) + .await.unwrap(); + + let contents = std::fs::read_to_string(&out).unwrap_or_default(); + let _ = std::fs::remove_file(&out); + // Every spelling must hit the synthetic memfd, not the host file. + let expected = "127.0.0.1 localhost\\n::1 localhost\\n"; + for label in ["dirfd_relative", "dotdot", "slash2", "curdir"] { + let needle = format!("'{label}': '{expected}'"); + assert!( + contents.contains(&needle), + "{label}: host /etc/hosts leaked. got: {contents}" + ); + } + assert!(result.success()); +} From ac61af35d31b2f7d1457a952e35db693a5c83ee2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2026 22:30:38 -0700 Subject: [PATCH 3/4] seccomp: normalize open-family paths for every shim, not just /etc/hosts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Last commit closed the literal-path bypass for /etc/hosts but the same 4 shapes (legacy `open`, `openat2`, dirfd-relative, non-canonical) still slipped past every other open-keyed shim and most of them are actual security boundaries: /proc/kcore (and friends in `is_sensitive_proc`) returns physical memory, the per-PID `/proc//` filter hides other sandboxes' state from each other, /proc/cpuinfo + /proc/meminfo + /proc/net/* fudge resource accounting, /etc/hostname leaks the host identity, and /dev/urandom bypasses the deterministic seed. Lift the `(dirfd, path)` → normalized absolute path logic out of `handle_etc_hosts_open` into a shared `procfs::resolve_open_target` that dispatches on `notif.data.nr` for open / openat / openat2. Convert `handle_proc_open`, `handle_hostname_open`, and `handle_random_open` to use it, and add `open_family_syscalls()` in the dispatch builder so each handler is registered for all three syscalls in one place. The BPF notif list already carries open and openat2 unconditionally (added by 0fc09b4), so the kernel produces a notification regardless of which spelling the caller picked. Add regression tests for the new coverage: the /proc/kcore deny, the /proc/cpuinfo virtualization, and the /etc/hostname shim each get a test that walks dirfd-relative, `..`-laden, repeated-slash, and `./`-laden spellings and asserts the shim still fires. Signed-off-by: Cong Wang --- crates/sandlock-core/src/procfs.rs | 79 ++++++++------- crates/sandlock-core/src/random.rs | 17 ++-- crates/sandlock-core/src/seccomp/dispatch.rs | 95 +++++++++++-------- .../tests/integration/test_determinism.rs | 39 ++++++++ .../tests/integration/test_procfs.rs | 95 +++++++++++++++++++ 5 files changed, 244 insertions(+), 81 deletions(-) diff --git a/crates/sandlock-core/src/procfs.rs b/crates/sandlock-core/src/procfs.rs index d7dd568..09b2ded 100644 --- a/crates/sandlock-core/src/procfs.rs +++ b/crates/sandlock-core/src/procfs.rs @@ -411,16 +411,21 @@ pub(crate) async fn handle_proc_open( policy: &NotifPolicy, notif_fd: RawFd, ) -> NotifAction { - // openat(dirfd, pathname, flags, mode) - // args[0] = dirfd, args[1] = pathname pointer - let path_ptr = notif.data.args[1]; - let path = match read_path(notif, path_ptr, notif_fd) { + // Resolve open/openat/openat2 to a normalized absolute path so the + // sensitive-path deny, the per-PID filter, and the virtualization + // string-matches below all see the same canonical form regardless of + // how the caller spelled it (dirfd-relative, `..`-laden, etc.). + let resolved = match resolve_open_target(notif, notif_fd) { + Some(p) => p, + None => return NotifAction::Continue, + }; + let path = match resolved.to_str() { Some(p) => p, None => return NotifAction::Continue, }; // Block sensitive paths. - if is_sensitive_proc(&path) { + if is_sensitive_proc(path) { return NotifAction::Errno(EACCES); } @@ -428,7 +433,7 @@ pub(crate) async fn handle_proc_open( // This complements the getdents64 PID filtering — directory listings // already hide non-sandbox PIDs, but without this check a process // could still open /proc/{ppid}/cmdline (or any guessed PID) directly. - if let Some(pid) = extract_proc_pid(&path) { + if let Some(pid) = extract_proc_pid(path) { if !processes.contains(pid) { return NotifAction::Errno(EACCES); } @@ -586,19 +591,19 @@ pub(crate) fn handle_uname( } } -/// Handle openat targeting /etc/hostname — return a memfd with the virtual hostname. +/// Handle open/openat/openat2 targeting /etc/hostname — return a memfd +/// with the virtual hostname. Path is resolved and lexically normalized +/// via [`resolve_open_target`] so dirfd-relative and non-canonical +/// spellings all hit the shim. pub(crate) fn handle_hostname_open( notif: &SeccompNotif, hostname: &str, notif_fd: RawFd, ) -> Option { - let path_ptr = notif.data.args[1]; - let path = read_path(notif, path_ptr, notif_fd)?; - - if path != "/etc/hostname" { + let resolved = resolve_open_target(notif, notif_fd)?; + if resolved != std::path::Path::new("/etc/hostname") { return None; } - let content = format!("{}\n", hostname); Some(inject_memfd(content.as_bytes())) } @@ -610,21 +615,37 @@ pub(crate) fn handle_hostname_open( /// `::1 localhost`) plus any concrete hostnames pre-resolved from /// `net_allow`, so the host's on-disk `/etc/hosts` never leaks in and /// glibc's `files` NSS backend resolves allowed hostnames without DNS. -/// -/// Coverage notes (the simple literal-path match used to miss these): -/// - Legacy `open(path, ...)`: dispatched by syscall number; no `dirfd`. -/// - `openat2(dirfd, path, struct open_how *, size)`: dispatched the same -/// way as `openat`; the trailing `open_how` argument is irrelevant for -/// the file-name match. -/// - Relative paths via `dirfd` (e.g. `openat(open("/etc"), "hosts")`): -/// the `dirfd` is resolved via `/proc//fd/`. -/// - Non-canonical absolutes (`/etc/../etc/hosts`, `//etc/hosts`, -/// `/etc/./hosts`): collapsed by lexical normalization before comparison. pub(crate) fn handle_etc_hosts_open( notif: &SeccompNotif, etc_hosts_content: &str, notif_fd: RawFd, ) -> Option { + let resolved = resolve_open_target(notif, notif_fd)?; + if resolved != std::path::Path::new("/etc/hosts") { + return None; + } + Some(inject_memfd(etc_hosts_content.as_bytes())) +} + +/// Resolve the path argument of an open-family syscall (`open`, `openat`, +/// or `openat2`) to a lexically-normalized absolute host-side path. +/// +/// Used by every `openat`-shaped handler so the security and +/// virtualization checks operate on the same canonical form regardless +/// of how the caller spelled the path. The literal-string compare used +/// before this helper missed four bypass shapes: legacy `open`, +/// `openat2`, dirfd-relative spellings like `openat(open("/etc"), +/// "hosts", ...)`, and non-canonical absolutes like `/etc/../etc/hosts` +/// or `//etc/hosts`. +/// +/// Returns `None` if the notif isn't an open variant, the path can't be +/// read from child memory, the dirfd can't be resolved, or the path +/// walks above `/`. Callers treat `None` as "fall through to the kernel" +/// (`NotifAction::Continue`). +pub(crate) fn resolve_open_target( + notif: &SeccompNotif, + notif_fd: RawFd, +) -> Option { let nr = notif.data.nr as i64; let (dirfd, path_ptr): (i64, u64) = if Some(nr) == crate::arch::SYS_OPEN { // open(path, flags, mode) — no dirfd, behaves as AT_FDCWD. @@ -636,17 +657,11 @@ pub(crate) fn handle_etc_hosts_open( } else { return None; }; - let path = read_path(notif, path_ptr, notif_fd)?; - let resolved = resolve_to_normalized_absolute(notif.pid, dirfd, &path)?; - if resolved != std::path::Path::new("/etc/hosts") { - return None; - } - - Some(inject_memfd(etc_hosts_content.as_bytes())) + resolve_to_normalized_absolute(notif.pid, dirfd, &path) } -/// Resolve `(pid, dirfd, path)` to a lexically-normalized absolute path. +/// Lexical normalization of `(pid, dirfd, path)`: /// /// - Absolute `path`: used as-is. /// - Relative `path` with `dirfd == AT_FDCWD`: prefixed with the child's @@ -655,8 +670,8 @@ pub(crate) fn handle_etc_hosts_open( /// target of `/proc//fd/` (the host kernel's view of the /// directory the dirfd points to). /// -/// Returns `None` if the dirfd cannot be resolved, or if the path walks -/// above the root via `..`. +/// Then collapses `.`, `..`, and redundant `/` components. Returns +/// `None` if the dirfd cannot be resolved or the path walks above `/`. fn resolve_to_normalized_absolute( pid: u32, dirfd: i64, diff --git a/crates/sandlock-core/src/random.rs b/crates/sandlock-core/src/random.rs index 05988c1..b84f199 100644 --- a/crates/sandlock-core/src/random.rs +++ b/crates/sandlock-core/src/random.rs @@ -15,7 +15,7 @@ use std::io::{Seek, SeekFrom, Write}; use std::os::fd::RawFd; use std::os::unix::io::{AsRawFd, FromRawFd}; -use crate::seccomp::notif::{read_child_cstr, write_child_mem, NotifAction}; +use crate::seccomp::notif::{write_child_mem, NotifAction}; use crate::sys::structs::SeccompNotif; use crate::sys::syscall; @@ -57,15 +57,12 @@ pub(crate) fn handle_random_open( rng: &mut ChaCha8Rng, notif_fd: RawFd, ) -> Option { - // openat(dirfd, pathname, flags, mode): args[1] = pathname pointer - let path_ptr = notif.data.args[1]; - if path_ptr == 0 { - return None; - } - - let path = read_child_cstr(notif_fd, notif.id, notif.pid, path_ptr, 4096)?; - - if path.as_str() != "/dev/urandom" && path.as_str() != "/dev/random" { + // Resolve the open path so dirfd-relative or non-canonical spellings + // (`/dev/../dev/urandom`, `openat(open("/dev"), "urandom", ...)`) + // can't sidestep the seed and read real kernel entropy. + let resolved = crate::procfs::resolve_open_target(notif, notif_fd)?; + let path = resolved.to_str()?; + if path != "/dev/urandom" && path != "/dev/random" { return None; } diff --git a/crates/sandlock-core/src/seccomp/dispatch.rs b/crates/sandlock-core/src/seccomp/dispatch.rs index acf1249..d423ee8 100644 --- a/crates/sandlock-core/src/seccomp/dispatch.rs +++ b/crates/sandlock-core/src/seccomp/dispatch.rs @@ -152,6 +152,20 @@ pub enum HandlerError { /// The blocklist is whatever [`crate::context::blocklist_syscall_numbers`] /// resolves from Sandlock's default syscall blocklist plus policy extras. /// +/// Every open-family syscall a path-keyed handler must intercept to be +/// leak-proof: `open` (legacy), `openat`, and `openat2`. A handler that +/// only registers `openat` is bypassed by any libc that picks one of +/// the others. The corresponding BPF notif list (in `context::notif_syscalls`) +/// must register the same set so the kernel actually produces a +/// notification — otherwise `RET_ALLOW` makes the handler unreachable. +fn open_family_syscalls() -> Vec { + let mut v = vec![libc::SYS_openat, arch::SYS_OPENAT2]; + if let Some(legacy_open) = arch::SYS_OPEN { + v.push(legacy_open); + } + v +} + /// Takes only the syscall numbers because that's all it needs to check. /// Called from the `run_with_handlers` entry points before any /// handler is registered against the dispatch table. @@ -352,24 +366,29 @@ pub(crate) fn build_dispatch_table( } // ------------------------------------------------------------------ - // Deterministic random — /dev/urandom opens (openat) + // Deterministic random — /dev/urandom and /dev/random opens. + // Registered for every open-family syscall so dirfd-relative and + // legacy `open` spellings can't slip past the seed and read the + // kernel's real entropy. // ------------------------------------------------------------------ if policy.has_random_seed { - let __sup = Arc::clone(ctx); - table.register(libc::SYS_openat, move |cx: &HandlerCtx| { - let notif = cx.notif; - let sup = Arc::clone(&__sup); - let notif_fd = cx.notif_fd; - async move { - let mut tr = sup.time_random.lock().await; - if let Some(ref mut rng) = tr.random_state { - if let Some(action) = crate::random::handle_random_open(¬if, rng, notif_fd) { - return action; + for nr in open_family_syscalls() { + let __sup = Arc::clone(ctx); + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let sup = Arc::clone(&__sup); + let notif_fd = cx.notif_fd; + async move { + let mut tr = sup.time_random.lock().await; + if let Some(ref mut rng) = tr.random_state { + if let Some(action) = crate::random::handle_random_open(¬if, rng, notif_fd) { + return action; + } } + NotifAction::Continue } - NotifAction::Continue - } - }); + }); + } } // ------------------------------------------------------------------ @@ -407,13 +426,15 @@ pub(crate) fn build_dispatch_table( } // ------------------------------------------------------------------ - // /proc virtualization (always on) + // /proc virtualization (always on). The handler does both + // sensitive-path blocking and per-PID filtering, both of which are + // security boundaries, so it has to catch every open-family spelling. // ------------------------------------------------------------------ - { + for nr in open_family_syscalls() { let policy_for_proc_open = Arc::clone(policy); let resource_for_proc_open = Arc::clone(resource); let __sup = Arc::clone(ctx); - table.register(libc::SYS_openat, move |cx: &HandlerCtx| { + table.register(nr, move |cx: &HandlerCtx| { let notif = cx.notif; let sup = Arc::clone(&__sup); let notif_fd = cx.notif_fd; @@ -459,7 +480,9 @@ pub(crate) fn build_dispatch_table( } // ------------------------------------------------------------------ - // Hostname virtualization + // Hostname virtualization. The `/etc/hostname` shim is registered + // for every open-family syscall so dirfd-relative and legacy `open` + // spellings can't leak the host's real hostname. // ------------------------------------------------------------------ if let Some(ref hostname) = policy.virtual_hostname { let hostname_for_uname = hostname.clone(); @@ -472,38 +495,32 @@ pub(crate) fn build_dispatch_table( crate::procfs::handle_uname(¬if, &hostname, notif_fd) } }); - table.register(libc::SYS_openat, move |cx: &HandlerCtx| { - let notif = cx.notif; - let notif_fd = cx.notif_fd; + for nr in open_family_syscalls() { let hostname = hostname_for_open.clone(); - async move { - if let Some(action) = crate::procfs::handle_hostname_open(¬if, &hostname, notif_fd) { - action - } else { - NotifAction::Continue + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let notif_fd = cx.notif_fd; + let hostname = hostname.clone(); + async move { + if let Some(action) = crate::procfs::handle_hostname_open(¬if, &hostname, notif_fd) { + action + } else { + NotifAction::Continue + } } - } - }); + }); + } } // ------------------------------------------------------------------ // /etc/hosts virtualization: always on. The synthetic file contains // the loopback base plus any concrete hostnames resolved from // `net_allow`, so the host's on-disk `/etc/hosts` never leaks in. - // - // Registered for `open`, `openat`, and `openat2` so the simple literal - // spelling, dirfd-relative spelling, and openat2 callers (some Go and - // hand-rolled runtimes) all funnel through the same path normalization - // in `handle_etc_hosts_open`. The handler itself reads - // `notif.data.nr` to extract the right argument slots. + // Registered for every open-family syscall (see `open_family_syscalls`). // ------------------------------------------------------------------ { let etc_hosts = policy.virtual_etc_hosts.clone(); - let mut open_syscalls: Vec = vec![libc::SYS_openat, arch::SYS_OPENAT2]; - if let Some(legacy_open) = arch::SYS_OPEN { - open_syscalls.push(legacy_open); - } - for nr in open_syscalls { + for nr in open_family_syscalls() { let etc_hosts = etc_hosts.clone(); table.register(nr, move |cx: &HandlerCtx| { let notif = cx.notif; diff --git a/crates/sandlock-core/tests/integration/test_determinism.rs b/crates/sandlock-core/tests/integration/test_determinism.rs index ae20c26..fb4b54b 100644 --- a/crates/sandlock-core/tests/integration/test_determinism.rs +++ b/crates/sandlock-core/tests/integration/test_determinism.rs @@ -222,3 +222,42 @@ async fn test_hostname_virtualization() { let stdout = String::from_utf8_lossy(result.stdout.as_deref().unwrap_or_default()); assert_eq!(stdout.trim(), "mybox", "Expected /etc/hostname 'mybox', got: {:?}", stdout.trim()); } + +/// The /etc/hostname shim used to do a literal `path == "/etc/hostname"` +/// match, so dirfd-relative and non-canonical spellings leaked the host's +/// real hostname. Exercise each bypass shape and assert the virtual +/// hostname comes back. +#[tokio::test] +async fn test_hostname_virtualization_resists_path_bypasses() { + let policy = Sandbox::builder() + .fs_read("/usr") + .fs_read("/lib") + .fs_read_if_exists("/lib64") + .fs_read("/bin") + .fs_read("/etc") + .build() + .unwrap(); + + let script = concat!( + "import os\n", + "results = {}\n", + "etcfd = os.open('/etc', os.O_DIRECTORY | os.O_RDONLY)\n", + "fd = os.open('hostname', os.O_RDONLY, dir_fd=etcfd)\n", + "results['dirfd'] = os.read(fd, 4096).decode().strip()\n", + "os.close(fd); os.close(etcfd)\n", + "results['dotdot'] = open('/etc/../etc/hostname').read().strip()\n", + "results['curdir'] = open('/etc/./hostname').read().strip()\n", + "results['slash2'] = open('//etc/hostname').read().strip()\n", + "print(results)\n", + ); + + let result = policy.clone().with_name("mybox").run(&["python3", "-c", script]).await.unwrap(); + let stdout = String::from_utf8_lossy(result.stdout.as_deref().unwrap_or_default()); + for label in ["dirfd", "dotdot", "curdir", "slash2"] { + let needle = format!("'{label}': 'mybox'"); + assert!( + stdout.contains(&needle), + "{label}: host /etc/hostname leaked. stdout: {stdout}" + ); + } +} diff --git a/crates/sandlock-core/tests/integration/test_procfs.rs b/crates/sandlock-core/tests/integration/test_procfs.rs index e0e81e8..1997777 100644 --- a/crates/sandlock-core/tests/integration/test_procfs.rs +++ b/crates/sandlock-core/tests/integration/test_procfs.rs @@ -72,6 +72,101 @@ async fn test_sensitive_proc_blocked() { assert!(!result.success(), "/proc/kcore should be denied"); } +/// The sensitive-path deny used to do a literal `path == "/proc/kcore"` +/// (and `starts_with("/proc/kcore/")`) match, which any non-canonical or +/// dirfd-relative spelling sidestepped. Exercise each known bypass shape +/// and assert the deny still fires. +#[tokio::test] +async fn test_sensitive_proc_resists_bypasses() { + let policy = Sandbox::builder() + .fs_read("/usr") + .fs_read("/lib") + .fs_read_if_exists("/lib64") + .fs_read("/bin") + .fs_read("/etc") + .fs_read("/proc") + .num_cpus(1) + .build() + .unwrap(); + + // EACCES (errno 13) is what the handler returns for sensitive paths. + // Each branch prints OK if the open was denied, FAIL otherwise. + let script = concat!( + "import os, errno\n", + "results = []\n", + "def must_deny(label, fn):\n", + " try:\n", + " fd = fn()\n", + " os.close(fd)\n", + " results.append(f'{label}:LEAKED')\n", + " except OSError as e:\n", + " results.append(f'{label}:DENIED' if e.errno == errno.EACCES else f'{label}:errno={e.errno}')\n", + // 1. dirfd-relative: open(/proc), then open 'kcore' relative to it + "procfd = os.open('/proc', os.O_DIRECTORY | os.O_RDONLY)\n", + "must_deny('dirfd', lambda: os.open('kcore', os.O_RDONLY, dir_fd=procfd))\n", + "os.close(procfd)\n", + // 2. non-canonical absolutes + "must_deny('dotdot', lambda: os.open('/proc/../proc/kcore', os.O_RDONLY))\n", + "must_deny('curdir', lambda: os.open('/proc/./kcore', os.O_RDONLY))\n", + "must_deny('slash2', lambda: os.open('//proc/kcore', os.O_RDONLY))\n", + "print('|'.join(results))\n", + ); + + let result = policy.clone().with_name("test").run(&["python3", "-c", script]).await.unwrap(); + let stdout = String::from_utf8_lossy(result.stdout.as_deref().unwrap_or_default()); + for label in ["dirfd", "dotdot", "curdir", "slash2"] { + let needle = format!("{label}:DENIED"); + assert!( + stdout.contains(&needle), + "{label}: /proc/kcore leaked via this spelling. stdout: {stdout}" + ); + } +} + +/// The /proc/cpuinfo virtualization used to do a literal +/// `path == "/proc/cpuinfo"` match, so non-canonical and dirfd-relative +/// spellings fell through to the host's real cpuinfo and leaked the host's +/// real CPU count. +#[tokio::test] +async fn test_proc_virt_resists_bypasses() { + let policy = Sandbox::builder() + .fs_read("/usr") + .fs_read("/lib") + .fs_read_if_exists("/lib64") + .fs_read("/bin") + .fs_read("/etc") + .fs_read("/proc") + .num_cpus(2) + .build() + .unwrap(); + + // Every spelling must see exactly 2 `^processor` lines, matching the + // synthetic cpuinfo. A leak to the host file would show this host's + // real CPU count (almost certainly != 2). + let script = concat!( + "import os\n", + "results = {}\n", + "procfd = os.open('/proc', os.O_DIRECTORY | os.O_RDONLY)\n", + "fd = os.open('cpuinfo', os.O_RDONLY, dir_fd=procfd)\n", + "results['dirfd'] = os.read(fd, 4096).decode().count('processor\\t')\n", + "os.close(fd); os.close(procfd)\n", + "results['dotdot'] = open('/proc/../proc/cpuinfo').read().count('processor\\t')\n", + "results['curdir'] = open('/proc/./cpuinfo').read().count('processor\\t')\n", + "results['slash2'] = open('//proc/cpuinfo').read().count('processor\\t')\n", + "print(results)\n", + ); + + let result = policy.clone().with_name("test").run(&["python3", "-c", script]).await.unwrap(); + let stdout = String::from_utf8_lossy(result.stdout.as_deref().unwrap_or_default()); + for label in ["dirfd", "dotdot", "curdir", "slash2"] { + let needle = format!("'{label}': 2"); + assert!( + stdout.contains(&needle), + "{label}: host cpuinfo leaked (expected 2 processors, virtualized). stdout: {stdout}" + ); + } +} + /// Test basic sandbox still works without /proc virtualization. #[tokio::test] async fn test_no_proc_virt_still_works() { From b92cade2dcf7f6d1415735faab229e7096f1ace1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 26 May 2026 22:42:54 -0700 Subject: [PATCH 4/4] network: seed virtual /etc/hosts from the image's file in chroot mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 3278445 always served the same fixed loopback-only /etc/hosts to every sandbox, which lost any entries an image author had baked in (private registries, internal services, etc.). Trade-off acknowledged at the time. This commit picks the alternative direction we discussed: keep the always-on loopback guarantee, but seed the synthetic file from the image's `/etc/hosts` when one is present, only injecting the loopback entries the image itself doesn't already cover, and appending net_allow concrete-host entries on top. Split the synthesis: `resolve_net_allow` now returns just the `concrete_host_entries` lines (one per resolved net_allow hostname), and a new `compose_virtual_etc_hosts(chroot_root, concrete_entries)` helper assembles the final file. Without a chroot it returns the fixed loopback base + concrete entries (matches the previous behavior). With a chroot, it reads `/etc/hosts` (falling back to the loopback base if absent or unreadable), checks which localhost families the image already covers, and injects only the missing ones — comments are stripped before the localhost-presence scan so an inline `#` in the image doesn't fool the detector. Move the `/etc/hosts` shim registration *above* the chroot dispatch so the synthetic memfd wins in chroot mode: otherwise the chroot handler's `openat2(RESOLVE_IN_ROOT)` opens `/etc/hosts` directly and the merge never reaches the sandbox. Tests: 6 new unit tests for `compose_virtual_etc_hosts` (no chroot, image with both / one / no loopback families, missing file, inline comment edge case) and 2 new integration tests that build a rootfs with a custom `/etc/hosts`, run a real sandbox under `--chroot`, and assert the merged content the sandbox actually sees (image entries preserved, loopback injected when missing, no duplicates when present). Signed-off-by: Cong Wang --- crates/sandlock-core/src/network.rs | 201 ++++++++++++++++-- crates/sandlock-core/src/sandbox.rs | 11 +- crates/sandlock-core/src/seccomp/dispatch.rs | 57 ++--- .../tests/integration/test_chroot.rs | 89 ++++++++ 4 files changed, 311 insertions(+), 47 deletions(-) diff --git a/crates/sandlock-core/src/network.rs b/crates/sandlock-core/src/network.rs index 76bb669..2310895 100644 --- a/crates/sandlock-core/src/network.rs +++ b/crates/sandlock-core/src/network.rs @@ -1037,11 +1037,12 @@ pub struct ResolvedNetAllowSet { pub tcp: ResolvedNetAllow, pub udp: ResolvedNetAllow, pub icmp: ResolvedNetAllow, - /// Synthetic `/etc/hosts` content. Always includes the loopback base - /// (`127.0.0.1 localhost` / `::1 localhost`) plus an entry per concrete - /// hostname appearing in any rule; the host's on-disk `/etc/hosts` never - /// leaks into the sandbox. - pub etc_hosts: String, + /// ` \n` lines from every concrete-host rule across + /// every protocol, in resolution order. Empty when no concrete-host + /// rules are present. Combined with the loopback base (or, in chroot + /// mode, the image's `/etc/hosts`) by [`compose_virtual_etc_hosts`] + /// to build the synthetic file served to the sandbox. + pub concrete_host_entries: String, } /// Resolve `--net-allow` rules into per-protocol runtime allowlists. @@ -1055,12 +1056,6 @@ pub struct ResolvedNetAllowSet { pub async fn resolve_net_allow( rules: &[NetAllow], ) -> io::Result { - // Single shared etc_hosts for all protocols. Always present so the - // sandbox sees a fixed loopback-only view independent of the host's - // on-disk `/etc/hosts`; concrete-hostname rules append their resolved - // entries on top. - let mut etc_hosts = String::from("127.0.0.1 localhost\n::1 localhost\n"); - let per_proto = |target: Protocol| async move { let mut per_ip: HashMap> = HashMap::new(); let mut per_ip_all_ports: HashSet = HashSet::new(); @@ -1121,18 +1116,78 @@ pub async fn resolve_net_allow( let (udp, udp_eh) = per_proto(Protocol::Udp).await?; let (icmp, icmp_eh) = per_proto(Protocol::Icmp).await?; + let mut concrete_host_entries = String::new(); for chunk in [tcp_eh, udp_eh, icmp_eh] { - etc_hosts.push_str(&chunk); + concrete_host_entries.push_str(&chunk); } Ok(ResolvedNetAllowSet { tcp, udp, icmp, - etc_hosts, + concrete_host_entries, }) } +/// Compose the synthetic `/etc/hosts` served to the sandbox. +/// +/// - **No chroot**: emit the fixed loopback base +/// (`127.0.0.1 localhost\n::1 localhost\n`) followed by the +/// concrete-host entries from [`resolve_net_allow`]. The sandbox sees +/// the same baseline regardless of what the host's on-disk file says. +/// - **With chroot**: read `/etc/hosts` and use it as the base +/// (an image that bakes in private-registry entries or similar keeps +/// them). Inject loopback entries only for any localhost family the +/// image doesn't already cover — never both, so we don't duplicate +/// what the image already has. Concrete-host entries are still +/// appended on top. +/// +/// If a chroot is set but `/etc/hosts` is unreadable (absent, +/// permission denied, etc.), fall back to the bare loopback base — the +/// sandbox always sees a usable hosts file. +pub fn compose_virtual_etc_hosts( + chroot_root: Option<&std::path::Path>, + concrete_host_entries: &str, +) -> String { + let mut out = String::new(); + let mut has_v4_localhost = false; + let mut has_v6_localhost = false; + + if let Some(root) = chroot_root { + if let Ok(image) = std::fs::read_to_string(root.join("etc").join("hosts")) { + for line in image.lines() { + // Strip an inline `#` comment before tokenizing — the + // hosts(5) format treats everything after `#` as a comment. + let stripped = line.split('#').next().unwrap_or(""); + let mut parts = stripped.split_whitespace(); + let Some(ip) = parts.next() else { continue }; + for name in parts { + if name == "localhost" { + if ip == "127.0.0.1" { + has_v4_localhost = true; + } else if ip == "::1" { + has_v6_localhost = true; + } + } + } + } + out.push_str(&image); + if !out.is_empty() && !out.ends_with('\n') { + out.push('\n'); + } + } + } + + if !has_v4_localhost { + out.push_str("127.0.0.1 localhost\n"); + } + if !has_v6_localhost { + out.push_str("::1 localhost\n"); + } + out.push_str(concrete_host_entries); + out +} + // ============================================================ // Tests // ============================================================ @@ -1317,10 +1372,8 @@ mod tests { assert!(resolved.tcp.any_ip_ports.is_empty()); assert!(resolved.udp.per_ip.is_empty()); assert!(resolved.icmp.per_ip.is_empty()); - // Loopback base is always emitted, even without rules, so the - // sandbox's `/etc/hosts` never falls through to the host's copy. - assert!(resolved.etc_hosts.contains("127.0.0.1 localhost")); - assert!(resolved.etc_hosts.contains("::1 localhost")); + // No concrete-host rules → no resolved-entry lines. + assert!(resolved.concrete_host_entries.is_empty()); } #[tokio::test] @@ -1341,8 +1394,8 @@ mod tests { } assert!(resolved.udp.per_ip.is_empty()); assert!(resolved.icmp.per_ip.is_empty()); - // Concrete-host entry (` localhost`) appended on top of the base. - assert!(resolved.etc_hosts.contains("127.0.0.1 localhost")); + // The resolved entry (` localhost`) surfaces in concrete_host_entries. + assert!(resolved.concrete_host_entries.contains("127.0.0.1 localhost")); } #[tokio::test] @@ -1357,8 +1410,8 @@ mod tests { assert!(resolved.tcp.per_ip.is_empty()); assert!(resolved.tcp.any_ip_ports.contains(&8080)); assert!(!resolved.tcp.any_ip_all_ports); - // Even an any-IP rule still gets the loopback base content. - assert!(resolved.etc_hosts.contains("127.0.0.1 localhost")); + // Any-IP rule has no concrete host, so no resolved-entry line. + assert!(resolved.concrete_host_entries.is_empty()); } #[tokio::test] @@ -1398,7 +1451,7 @@ mod tests { for ip in resolved.tcp.per_ip_all_ports.iter() { assert!(resolved.tcp.per_ip.contains_key(ip)); } - assert!(resolved.etc_hosts.contains("localhost")); + assert!(resolved.concrete_host_entries.contains("localhost")); } #[tokio::test] @@ -1503,4 +1556,108 @@ mod tests { assert!(resolved.icmp.any_ip_all_ports); assert!(!resolved.tcp.any_ip_all_ports); } + + // ============================================================ + // compose_virtual_etc_hosts — synthetic /etc/hosts assembly + // ============================================================ + + use std::io::Write; + + fn temp_rootfs_with_hosts(name: &str, hosts_content: Option<&str>) -> std::path::PathBuf { + let dir = std::env::temp_dir().join(format!( + "sandlock-test-compose-hosts-{}-{}", + name, std::process::id() + )); + let _ = std::fs::create_dir_all(dir.join("etc")); + if let Some(content) = hosts_content { + let mut f = std::fs::File::create(dir.join("etc").join("hosts")).unwrap(); + f.write_all(content.as_bytes()).unwrap(); + } + dir + } + + #[test] + fn compose_no_chroot_emits_loopback_base() { + // Default path — no chroot, no concrete-host rules → the same + // fixed loopback view we promise every sandbox. + let out = compose_virtual_etc_hosts(None, ""); + assert_eq!(out, "127.0.0.1 localhost\n::1 localhost\n"); + } + + #[test] + fn compose_no_chroot_appends_concrete_entries() { + let out = compose_virtual_etc_hosts(None, "10.0.0.1 api\n"); + assert_eq!(out, "127.0.0.1 localhost\n::1 localhost\n10.0.0.1 api\n"); + } + + #[test] + fn compose_chroot_seeds_from_image_and_injects_missing_loopback() { + // Image ships an entry of its own but no localhost mapping; the + // shim must keep the image's content and inject both loopback + // entries on top so the always-on guarantee still holds. + let rootfs = temp_rootfs_with_hosts( + "no-localhost", + Some("10.0.0.5 myimage.local\n"), + ); + let out = compose_virtual_etc_hosts(Some(&rootfs), ""); + assert!(out.contains("10.0.0.5 myimage.local"), "image entry missing: {out}"); + assert!(out.contains("127.0.0.1 localhost"), "v4 loopback missing: {out}"); + assert!(out.contains("::1 localhost"), "v6 loopback missing: {out}"); + let _ = std::fs::remove_dir_all(&rootfs); + } + + #[test] + fn compose_chroot_does_not_duplicate_existing_loopback() { + // Image already has both loopback entries — don't append duplicates. + let rootfs = temp_rootfs_with_hosts( + "both-localhost", + Some("127.0.0.1 localhost\n::1 localhost\n10.0.0.5 myimage.local\n"), + ); + let out = compose_virtual_etc_hosts(Some(&rootfs), ""); + assert_eq!(out.matches("127.0.0.1 localhost").count(), 1, "v4 dup'd: {out}"); + assert_eq!(out.matches("::1 localhost").count(), 1, "v6 dup'd: {out}"); + assert!(out.contains("10.0.0.5 myimage.local")); + let _ = std::fs::remove_dir_all(&rootfs); + } + + #[test] + fn compose_chroot_injects_only_missing_family() { + // Image has v4 but no v6 localhost — inject only v6, leave v4 alone. + let rootfs = temp_rootfs_with_hosts( + "only-v4-localhost", + Some("127.0.0.1 localhost myimage\n"), + ); + let out = compose_virtual_etc_hosts(Some(&rootfs), ""); + assert_eq!(out.matches("127.0.0.1 localhost").count(), 1); + assert!(out.contains("::1 localhost"), "v6 loopback should be injected: {out}"); + let _ = std::fs::remove_dir_all(&rootfs); + } + + #[test] + fn compose_chroot_missing_file_falls_back_to_loopback() { + // Chroot exists but has no /etc/hosts — fall back to the bare + // loopback base so the sandbox always sees a usable file. + let rootfs = temp_rootfs_with_hosts("no-file", None); + let out = compose_virtual_etc_hosts(Some(&rootfs), "10.0.0.1 api\n"); + assert_eq!(out, "127.0.0.1 localhost\n::1 localhost\n10.0.0.1 api\n"); + let _ = std::fs::remove_dir_all(&rootfs); + } + + #[test] + fn compose_chroot_strips_inline_comments_when_detecting_loopback() { + // hosts(5) treats `#` as a comment-start; the loopback-presence + // check must respect it (otherwise an image line like + // `127.0.0.1 # localhost` would be falsely treated as covering v4). + let rootfs = temp_rootfs_with_hosts( + "with-comments", + Some("127.0.0.1 # localhost is a comment here\n"), + ); + let out = compose_virtual_etc_hosts(Some(&rootfs), ""); + // Real `127.0.0.1 localhost` line must still be injected. + assert!( + out.lines().any(|l| l.trim() == "127.0.0.1 localhost"), + "v4 loopback should still be injected: {out}" + ); + let _ = std::fs::remove_dir_all(&rootfs); + } } diff --git a/crates/sandlock-core/src/sandbox.rs b/crates/sandlock-core/src/sandbox.rs index 0d4ceec..0f32d4a 100644 --- a/crates/sandlock-core/src/sandbox.rs +++ b/crates/sandlock-core/src/sandbox.rs @@ -1174,7 +1174,16 @@ impl Sandbox { let resolved_net_allow = network::resolve_net_allow(&self.net_allow) .await .map_err(SandboxRuntimeError::Io)?; - let virtual_etc_hosts = resolved_net_allow.etc_hosts.clone(); + // In chroot/image mode, seed the synthetic /etc/hosts from the + // rootfs's own file so entries baked into the image (private + // registries, internal hostnames, etc.) survive virtualization. + // Without a chroot, the helper returns the fixed loopback base. + // Either way, concrete-host rules from `net_allow` are appended + // on top. + let virtual_etc_hosts = network::compose_virtual_etc_hosts( + self.chroot.as_deref(), + &resolved_net_allow.concrete_host_entries, + ); if !self.http_allow.is_empty() || !self.http_deny.is_empty() { let handle = crate::http_acl::spawn_http_acl_proxy( diff --git a/crates/sandlock-core/src/seccomp/dispatch.rs b/crates/sandlock-core/src/seccomp/dispatch.rs index d423ee8..8cf5740 100644 --- a/crates/sandlock-core/src/seccomp/dispatch.rs +++ b/crates/sandlock-core/src/seccomp/dispatch.rs @@ -411,6 +411,38 @@ pub(crate) fn build_dispatch_table( } } + // ------------------------------------------------------------------ + // /etc/hosts virtualization: always on. The synthetic file contains + // the loopback base (or, in chroot/image mode, the image's own + // `/etc/hosts` merged with a loopback fallback) plus any concrete + // hostnames resolved from `net_allow`, so the host's on-disk + // `/etc/hosts` never leaks in and image-baked entries are preserved. + // + // Registered for every open-family syscall (see `open_family_syscalls`). + // Must run *before* the chroot handler so that in chroot mode the + // synthetic memfd wins over a direct open of `/etc/hosts` — + // the chroot handler always intercepts opens within the chroot and + // would otherwise serve the raw image file, defeating the merge. + // ------------------------------------------------------------------ + { + let etc_hosts = policy.virtual_etc_hosts.clone(); + for nr in open_family_syscalls() { + let etc_hosts = etc_hosts.clone(); + table.register(nr, move |cx: &HandlerCtx| { + let notif = cx.notif; + let notif_fd = cx.notif_fd; + let etc_hosts = etc_hosts.clone(); + async move { + if let Some(action) = crate::procfs::handle_etc_hosts_open(¬if, &etc_hosts, notif_fd) { + action + } else { + NotifAction::Continue + } + } + }); + } + } + // ------------------------------------------------------------------ // Chroot path interception (before COW) // ------------------------------------------------------------------ @@ -512,30 +544,7 @@ pub(crate) fn build_dispatch_table( } } - // ------------------------------------------------------------------ - // /etc/hosts virtualization: always on. The synthetic file contains - // the loopback base plus any concrete hostnames resolved from - // `net_allow`, so the host's on-disk `/etc/hosts` never leaks in. - // Registered for every open-family syscall (see `open_family_syscalls`). - // ------------------------------------------------------------------ - { - let etc_hosts = policy.virtual_etc_hosts.clone(); - for nr in open_family_syscalls() { - let etc_hosts = etc_hosts.clone(); - table.register(nr, move |cx: &HandlerCtx| { - let notif = cx.notif; - let notif_fd = cx.notif_fd; - let etc_hosts = etc_hosts.clone(); - async move { - if let Some(action) = crate::procfs::handle_etc_hosts_open(¬if, &etc_hosts, notif_fd) { - action - } else { - NotifAction::Continue - } - } - }); - } - } + // /etc/hosts is registered above the chroot block — see the comment there. // ------------------------------------------------------------------ // Deterministic directory listing diff --git a/crates/sandlock-core/tests/integration/test_chroot.rs b/crates/sandlock-core/tests/integration/test_chroot.rs index 62f58ba..7fda877 100644 --- a/crates/sandlock-core/tests/integration/test_chroot.rs +++ b/crates/sandlock-core/tests/integration/test_chroot.rs @@ -537,3 +537,92 @@ async fn test_fs_mount_read_write() { let _ = fs::remove_dir_all(&rootfs); let _ = fs::remove_dir_all(&work_dir); } + +/// When the chroot image ships its own `/etc/hosts`, the synthetic file +/// the sandbox sees should be seeded from the image's content (so any +/// private-registry / internal-service entries the image baked in +/// survive virtualization) — with loopback entries added only if the +/// image didn't already provide them. +#[tokio::test] +async fn test_chroot_etc_hosts_seeded_from_image() { + let rootfs = build_test_rootfs("etc-hosts-image-seed"); + // Put a hosts file in the image that has its own entry but is + // missing both loopback families on purpose. + fs::write( + rootfs.join("etc/hosts"), + "10.0.0.5 internal.registry.local\n", + ) + .unwrap(); + + let policy = minimal_exec_policy(&rootfs) + .fs_read("/etc") + .build() + .unwrap(); + + let result = policy.clone().with_name("test").run(&["rootfs-helper", "cat", "/etc/hosts"]).await; + match result { + Ok(r) => { + assert!( + r.success(), + "cat /etc/hosts should succeed inside chroot, stderr: {}", + r.stderr_str().unwrap_or("") + ); + let stdout = r.stdout_str().unwrap_or("").to_string(); + assert!( + stdout.contains("10.0.0.5 internal.registry.local"), + "image's /etc/hosts entry missing: {stdout}" + ); + assert!( + stdout.contains("127.0.0.1 localhost"), + "v4 loopback should be injected for the image: {stdout}" + ); + assert!( + stdout.contains("::1 localhost"), + "v6 loopback should be injected for the image: {stdout}" + ); + } + Err(e) => eprintln!("Chroot test skipped: {}", e), + } + + cleanup_rootfs(&rootfs); +} + +/// When the image already ships loopback entries, the synthesizer must +/// not duplicate them — the sandbox should see exactly the image's view +/// of localhost, not two copies of it. +#[tokio::test] +async fn test_chroot_etc_hosts_no_duplicate_loopback() { + let rootfs = build_test_rootfs("etc-hosts-no-dup"); + fs::write( + rootfs.join("etc/hosts"), + "127.0.0.1 localhost\n::1 localhost\n10.0.0.5 svc.local\n", + ) + .unwrap(); + + let policy = minimal_exec_policy(&rootfs) + .fs_read("/etc") + .build() + .unwrap(); + + let result = policy.clone().with_name("test").run(&["rootfs-helper", "cat", "/etc/hosts"]).await; + match result { + Ok(r) => { + assert!(r.success(), "cat /etc/hosts should succeed"); + let stdout = r.stdout_str().unwrap_or("").to_string(); + assert_eq!( + stdout.matches("127.0.0.1 localhost").count(), + 1, + "v4 loopback duplicated: {stdout}" + ); + assert_eq!( + stdout.matches("::1 localhost").count(), + 1, + "v6 loopback duplicated: {stdout}" + ); + assert!(stdout.contains("10.0.0.5 svc.local"), "image entry missing: {stdout}"); + } + Err(e) => eprintln!("Chroot test skipped: {}", e), + } + + cleanup_rootfs(&rootfs); +}