Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions scripts/pre-commit.hook
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,8 @@ build_cppcheck_suppressions() {
"syntaxError:src/loader-transfer.c"
"invalidFunctionArg:src/rewrite.c"
"invalidFunctionArg:tests/unit/test-procmem.c"
"invalidFunctionArg:tests/guest/signal-safety-test.c"
"nullPointerOutOfMemory:tests/guest/signal-safety-test.c"
"nullPointerArithmeticOutOfMemory:tests/unit/test-procmem.c"
"nullPointerOutOfMemory:tests/unit/test-procmem.c"
"knownConditionTrueFalse"
Expand Down
2 changes: 1 addition & 1 deletion scripts/run-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ expect_success "umask-test" \
echo ""
echo "--- Guest test programs ---"

for test_prog in dup-test clock-test signal-test path-escape-test errno-test; do
for test_prog in dup-test clock-test signal-test signal-safety-test path-escape-test errno-test; do
if guest_has_test "$test_prog"; then
expect_success "$test_prog" \
"$KBOX" -S "$ROOTFS" -- "/opt/tests/${test_prog}"
Expand Down
1 change: 1 addition & 0 deletions src/image.c
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,7 @@ int kbox_run_image(const struct kbox_image_args *args)
command = args->command ? args->command : "/bin/sh";
probe_mode = args->syscall_mode;
rewrite_requested = args->syscall_mode == KBOX_SYSCALL_MODE_REWRITE;
setenv("KBOX_SYSCALL_MODE", kbox_syscall_mode_name(args->syscall_mode), 1);

/* AUTO enables rewrite analysis for non-shell commands so the
* auto_prefers_userspace_fast_path() selection function can see the
Expand Down
60 changes: 60 additions & 0 deletions src/procmem.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,48 @@
/* Process memory access for seccomp-unotify.
*
* Wraps process_vm_readv/writev to read/write tracee memory without ptrace.
*
* Signal safety contract
* ----------------------
* Signal-visible globals in this file:
*
* fault_armed (__thread volatile sig_atomic_t)
* Read by fault_handler (SIGSEGV/SIGBUS). Written only by
* safe_memcpy / kbox_current_read_string on the same thread,
* outside the handler. sig_atomic_t + volatile guarantees
* atomic visibility. Thread-local: no cross-thread concern.
*
* fault_jmp (__thread sigjmp_buf)
* Target of siglongjmp in fault_handler. siglongjmp is
* async-signal-safe per POSIX. Thread-local.
*
* fault_handler_gen (volatile unsigned, __atomic ops)
* Bumped by kbox_procmem_signal_changed() (dispatch thread)
* when the guest calls rt_sigaction(SIGSEGV/SIGBUS). Read
* by safe_memcpy via __atomic_load_n. Never accessed from
* a signal handler.
*
* saved_guest_segv, saved_guest_bus (static struct sigaction)
* Read by fault_handler to forward non-kbox faults to the
* guest's handler. Written by install_fault_handler() on
* the dispatch thread. Safe because the single-threaded
* guest constraint (CLONE_THREAD returns ENOSYS) serializes
* rt_sigaction dispatch and safe_memcpy through the same
* thread. If multi-threaded guest support is added, these
* must be protected by a lock or made thread-local.
*
* Functions called from signal handler context:
* fault_handler -- reads fault_armed, calls siglongjmp or
* forwards to guest handler.
* restore_default_and_reraise -- sigaction + raise, both POSIX
* async-signal-safe.
*
* The fault_armed window in safe_memcpy / kbox_current_read_string:
* sigsetjmp -> fault_armed=1 -> memory access -> fault_armed=0
* If a SIGSEGV/SIGBUS arrives while fault_armed=1, fault_handler calls
* siglongjmp, control resumes at the sigsetjmp site, and the
* interrupted function returns -EFAULT. If a signal
* arrives while fault_armed=0, it is forwarded to the guest's
* saved handler (or SIG_DFL -> core dump).
*/

#include <errno.h>
Expand Down Expand Up @@ -57,6 +99,24 @@ static __thread unsigned

/* Saved guest handlers: when kbox installs its fault handler, the guest's
* prior handlers are preserved here and forwarded to when fault_armed is 0.
*
* Race safety (Go runtime SIGSEGV for stack growth):
*
* When the guest calls rt_sigaction(SIGSEGV, new_handler), kbox's dispatch:
* 1. Bumps fault_handler_gen (kbox_procmem_signal_changed).
* 2. Returns CONTINUE -- host kernel installs guest's new_handler.
*
* At this point kbox's fault_handler is no longer installed. However,
* the next safe_memcpy detects the generation mismatch BEFORE arming
* fault_armed, and re-installs kbox's handler (saving the guest's
* new_handler here). Between step 2 and the reinstall, no safe_memcpy
* is in progress (fault_armed == 0), so any SIGSEGV in that window goes
* directly to the guest's handler -- which is the correct behavior.
*
* This ordering guarantee depends on the single-threaded guest
* constraint (CLONE_THREAD returns ENOSYS). With multi-threaded
* guests, a concurrent safe_memcpy on another thread could see a
* stale saved_guest_segv. Both saved_guest_segv and saved_guest_bus
* would then need a lock or would need to become per-thread.
*/
static struct sigaction saved_guest_segv;
static struct sigaction saved_guest_bus;
Expand Down
47 changes: 43 additions & 4 deletions src/seccomp-dispatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,22 @@
* This is the beating heart of kbox: every file open, read, write, stat, and
* directory operation the tracee makes gets routed through here.
*
* Single-threaded dispatch contract
* ----------------------------------
* All notification processing runs on a single thread (supervisor loop in
* seccomp mode, SIGSYS handler in trap/rewrite mode). The following state is
* protected only by this single-threaded invariant:
*
* dispatch_scratch[] Static I/O buffer (this file).
* kbox_fd_table entries FD table (fd-table.c).
* Path scratch buffers Translation/literal caches (path.c).
* shadow_sockets[] SLIRP socket array (net-slirp.c).
* saved_guest_segv/bus Fault handler saved actions (procmem.c).
* fault_armed Thread-local; single-threaded guest means
* only one thread ever arms it.
*
* If parallel dispatch or multi-threaded guest support is introduced,
* every item above needs locking or per-thread allocation.
*/

#include <errno.h>
Expand Down Expand Up @@ -126,6 +142,9 @@ static struct kbox_dispatch emulate_trap_rt_sigprocmask(
unsigned char set_mask[sizeof(sigset_t)];
size_t mask_len;

memset(current, 0, sizeof(current));
memset(next, 0, sizeof(next));

if (sigset_size == 0 || sigset_size > sizeof(current))
return kbox_dispatch_errno(EINVAL);
mask_len = sigset_size;
Expand Down Expand Up @@ -153,8 +172,15 @@ static struct kbox_dispatch emulate_trap_rt_sigprocmask(
}

if (old_ptr != 0) {
/* Strip SIGSYS from the reported mask -- the guest must not
* observe kbox's reserved signal in its signal state.
*/
unsigned char visible[sizeof(sigset_t)];

memcpy(visible, current, sizeof(visible));
kbox_syscall_trap_sigset_strip_reserved(visible, sizeof(visible));
int rc = guest_mem_write(ctx, kbox_syscall_request_pid(req), old_ptr,
current, mask_len);
visible, mask_len);
if (rc < 0)
return kbox_dispatch_errno(-rc);
}
Expand Down Expand Up @@ -213,15 +239,18 @@ static struct kbox_dispatch emulate_trap_rt_sigpending(
unsigned char pending[sizeof(sigset_t)];
int rc;

(void) ctx;

if (set_ptr == 0)
return kbox_dispatch_errno(EFAULT);
if (sigset_size == 0 || sigset_size > sizeof(pending))
return kbox_dispatch_errno(EINVAL);
if (kbox_syscall_trap_get_pending(pending, sizeof(pending)) < 0)
return kbox_dispatch_errno(EIO);

/* Strip SIGSYS from pending set -- the guest must not observe
* kbox's reserved signal as pending.
*/
kbox_syscall_trap_sigset_strip_reserved(pending, sizeof(pending));

rc = guest_mem_write(ctx, kbox_syscall_request_pid(req), set_ptr, pending,
sigset_size);
if (rc < 0)
Expand Down Expand Up @@ -4405,7 +4434,17 @@ struct kbox_dispatch kbox_dispatch_request(
return kbox_dispatch_value(0);
}

/* Signals. */
/* Signals.
*
* rt_sigaction: only SIGSYS is denied (reserved for trap mode).
* All other signals -- including SIGURG (Go async preemption),
* SIGUSR1/2, SIGALRM, etc. -- pass through to the host kernel
* via CONTINUE. SIGSEGV/SIGBUS changes bump the fault handler
* generation counter so procmem.c reinstalls its handler.
*
* rt_sigprocmask: in trap/rewrite mode, emulated to keep the
* supervisor's SIGSYS unblocked. In seccomp mode, CONTINUE.
*/

if (nr == h->rt_sigaction) {
if (request_uses_trap_signals(req) &&
Expand Down
1 change: 1 addition & 0 deletions src/syscall-trap-signal.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ struct kbox_syscall_trap_ip_range {
int kbox_syscall_trap_reserved_signal(void);
int kbox_syscall_trap_signal_is_reserved(int signum);
int kbox_syscall_trap_sigset_blocks_reserved(const void *mask, size_t len);
void kbox_syscall_trap_sigset_strip_reserved(void *mask, size_t len);
uintptr_t kbox_syscall_trap_host_syscall_ip(void);
int kbox_syscall_trap_host_syscall_range(
struct kbox_syscall_trap_ip_range *out);
Expand Down
49 changes: 49 additions & 0 deletions src/syscall-trap.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,42 @@
/* SPDX-License-Identifier: MIT */

/* Syscall trap runtime: SIGSYS handler installation and dispatch.
*
* Signal safety contract
* ----------------------
* Signal-visible globals:
*
* active_trap_runtime (static pointer, atomic load/store)
* Read by trap_sigsys_handler via __atomic_load_n (ACQUIRE).
* Written by install/uninstall via __atomic_store_n (RELEASE).
* Plain pointer load is async-signal-safe.
*
* have_fsgsbase (static int)
* Written once at startup by probe_fsgsbase(). Read in
* read/write_host_fs_base helpers. One-shot init; never
* modified after the first probe.
*
* The SIGSYS handler (trap_sigsys_handler) runs on the guest thread.
* It must avoid:
* - Heap allocation (guest may hold glibc malloc locks).
* All dispatch buffers are static (dispatch_scratch in
* seccomp-dispatch.c).
* - Stack protector (guest FS base != kbox FS base on x86_64).
* Handler is __attribute__((no_stack_protector)).
* - ASAN instrumentation (ASAN runtime syscalls hit the BPF
* filter from unregistered IPs). Handler is
* __attribute__((no_sanitize("address"))).
*
* The handler restores kbox's FS base before calling into C dispatch
* code, then restores the guest's FS base before returning. This
* swap is safe because the guest is single-threaded (CLONE_THREAD
* returns ENOSYS in trap/rewrite mode).
*
* If multi-threaded guest support is added, the following must be
* revisited: active_trap_runtime (must become per-thread), FS base
* swap (must be per-thread), and all static dispatch buffers.
*/

#include <errno.h>
#include <limits.h>
#include <pthread.h>
Expand Down Expand Up @@ -878,6 +915,18 @@ int kbox_syscall_trap_sigset_blocks_reserved(const void *mask, size_t len)
return (bytes[byte_index] & (1U << bit_index)) != 0;
}

/* Clear the bit for kbox's reserved trap signal in a raw signal-set bitmap.
 *
 * mask: signal set viewed as an array of bytes; signal N occupies bit
 *       (N - 1), counted from the least significant bit of byte 0.
 *       May be NULL.
 * len:  number of valid bytes at mask.
 *
 * Does nothing when mask is NULL or when len is too short to contain the
 * reserved signal's bit. All other bits are left untouched.
 */
void kbox_syscall_trap_sigset_strip_reserved(void *mask, size_t len)
{
    /* Zero-based bit position of the reserved signal within the set. */
    unsigned int reserved_bit =
        (unsigned int) kbox_syscall_trap_reserved_signal() - 1U;
    unsigned int offset = reserved_bit / 8U;
    unsigned int clear = 1U << (reserved_bit % 8U);
    unsigned char *raw = mask;

    if (mask && len > offset) {
        raw[offset] &= (unsigned char) ~clear;
    }
}

uintptr_t kbox_syscall_trap_host_syscall_ip(void)
{
#if defined(__x86_64__) || defined(__aarch64__) || \
Expand Down
Loading
Loading