Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,19 @@ jobs:
- name: boot test
run: .ci/autorun.sh
timeout-minutes: 5
# Smoke-test the --seccomp opt-in path on the same nested-KVM runner.
# The smoke test only waits for the "Linux version " banner before
# sending Ctrl-A x, but reaching the banner already exercises
# prctl(PR_SET_NO_NEW_PRIVS), seccomp(2)+TSYNC install over the
# already-running serial worker, and the early KVM_RUN dispatch
# under the filter. A regression that drops a steady-state syscall
# from src/seccomp.c's allowlist surfaces here as a SIGSYS before
# the banner appears.
- name: boot test (seccomp)
run: .ci/autorun.sh
env:
KVM_HOST_FLAGS: --seccomp
timeout-minutes: 5

# arm64 host build: configs/linux-arm64.config has no prebuilt path and
# the GitHub-hosted ubuntu-24.04-arm runners (Cobalt 100) do NOT expose
Expand Down
9 changes: 8 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ OBJS := \
virtio-blk.o \
virtio-net.o \
diskimg.o \
seccomp.o \
main.o

ifeq ($(ARCH), x86_64)
Expand Down Expand Up @@ -66,9 +67,15 @@ $(OUT)/ext4.img:
$(Q)dd if=/dev/zero of=$@ bs=4k count=600
$(Q)mkfs.ext4 -F $@

# KVM_HOST_FLAGS forwards extra flags to the binary so CI and developers
# can opt into --seccomp without duplicating the recipe. Empty by default
# to keep `make check` matching the documented invocation.
KVM_HOST_FLAGS ?=

check: $(BIN) $(LINUX_IMG) $(ROOTFS_IMG) $(OUT)/ext4.img
$(VECHO) "\nOnce the message 'Kernel panic' appears, press Ctrl-C to exit\n\n"
$(Q)sudo $(BIN) -k $(LINUX_IMG) -i $(ROOTFS_IMG) -d $(OUT)/ext4.img
$(Q)sudo $(BIN) -k $(LINUX_IMG) -i $(ROOTFS_IMG) -d $(OUT)/ext4.img \
$(KVM_HOST_FLAGS)

clean:
$(VECHO) "Cleaning...\n"
Expand Down
19 changes: 18 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,31 @@ make check
### Start Emulator

```
$ build/kvm-host -k bzImage [-i initrd] [-d disk-image]
$ build/kvm-host -k bzImage [-i initrd] [-d disk-image] [--seccomp]
```

`bzImage` is the path to the Linux kernel bzImage. The bzImage file is in a specific format,
containing concatenated `bootsect.o + setup.o + misc.o + piggy.o`. `initrd` is the path to
the initial RAM disk image; this argument is optional.
`disk-image` is the path to a disk image that can be mounted as a block device via virtio. For the reference Linux guest, the disk image uses an ext4 filesystem.

`--seccomp` is an opt-in defense-in-depth flag that installs a seccomp BPF
allowlist covering the steady-state KVM_RUN loop. Once active, only the
syscalls that the vcpu, virtio-blk, virtio-net, and serial workers need
are permitted; any other syscall (for example `execve`, `open`, or `socket`
issued after a memory-corruption RCE in device emulation) terminates
the process with `SIGSYS`. The filter is applied via `seccomp(2)` with
`SECCOMP_FILTER_FLAG_TSYNC` so already-running worker threads inherit
it. The flag is off by default so existing test and development
workflows are unaffected. CI exercises both paths
(`.github/workflows/main.yml`).

To run `make check` with the filter enabled:

```shell
$ make KVM_HOST_FLAGS=--seccomp check
```

### Exit Emulator

To exit kvm-host, press "Ctrl-A", release both keys, and then press "x".
Expand Down
30 changes: 25 additions & 5 deletions src/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,18 @@
#include <unistd.h>

#include "err.h"
#include "seccomp.h"
#include "vm.h"

/* Input file paths taken from the command line; NULL until assigned. */
static char *kernel_file = NULL;
static char *initrd_file = NULL;
static char *diskimg_file = NULL;

/* Set to 1 when --seccomp is given on the command line; 0 otherwise. */
static int enable_seccomp = 0;

/* Identifiers for options that have no short form. They begin at 256, one past
 * the largest 8-bit character value, so a getopt_long return for a long-only
 * option can never be mistaken for a short-option character.
 */
enum { OPT_SECCOMP = 256 };

/* Print one usage row: the option text in a left-justified 30-column field,
 * immediately followed by its help message.
 */
#define print_option(args, help_msg) printf(" %-30s%s", args, help_msg)

Expand All @@ -19,6 +28,8 @@ static void usage(const char *execpath)
print_option("-i, --initrd initrd", "Initial RAM disk image\n");
print_option("-d, --disk disk-image",
"Disk image for virtio-blk devices\n");
print_option("--seccomp",
"Install a seccomp BPF allowlist before vm_run.\n");
}

static struct termios saved_attributes;
Expand Down Expand Up @@ -50,9 +61,8 @@ int main(int argc, char *argv[])
{
int option_index = 0;
struct option opts[] = {
{"kernel", 1, NULL, 'k'},
{"initrd", 1, NULL, 'i'},
{"disk", 1, NULL, 'd'},
{"kernel", 1, NULL, 'k'}, {"initrd", 1, NULL, 'i'},
{"disk", 1, NULL, 'd'}, {"seccomp", 0, NULL, OPT_SECCOMP},
{"help", 0, NULL, 'h'},
};

Expand All @@ -69,6 +79,9 @@ int main(int argc, char *argv[])
case 'd':
diskimg_file = optarg;
break;
case OPT_SECCOMP:
enable_seccomp = 1;
break;
case 'h':
usage(argv[0]);
exit(0);
Expand Down Expand Up @@ -97,8 +110,15 @@ int main(int argc, char *argv[])
if (vm_late_init(&vm) < 0)
return -1;

/* Switch the terminal to raw mode only once setup has succeeded so that
* any error from the load/init paths above is rendered on a normal tty.
/* Lock down the syscall surface before raw-mode and vm_run, so a
* memory-corruption RCE in device emulation cannot escape to arbitrary host
* syscalls. Off by default — opt in via --seccomp.
*/
if (enable_seccomp && seccomp_apply() < 0)
return -1;

/* Switch the terminal to raw mode only once setup has succeeded so that any
* error from the load/init paths above is rendered on a normal tty.
*/
set_input_mode();

Expand Down
Loading