diff --git a/.ci/common.sh b/.ci/common.sh
index 0aab566e..e6f88cf3 100755
--- a/.ci/common.sh
+++ b/.ci/common.sh
@@ -7,9 +7,6 @@ set -euo pipefail
 MACHINE_TYPE="$(uname -m)"
 OS_TYPE="$(uname -s)"
 
-# Enable SDL headless mode explicitly.
-export SDL_VIDEODRIVER=offscreen
-
 # Cleanup function - kills all semu processes
 cleanup() {
     sleep 1
diff --git a/.ci/publish-prebuilt.sh b/.ci/publish-prebuilt.sh
index 2eed5784..d4eeaeaf 100755
--- a/.ci/publish-prebuilt.sh
+++ b/.ci/publish-prebuilt.sh
@@ -1,22 +1,24 @@
 #!/usr/bin/env bash
 #
-# Compress the prebuilt Image and rootfs.cpio in cwd, write a sha1
-# manifest, hash the input files that define the prebuilt's contents,
-# and print all three sums in KEY=VAL form on stdout so callers can
-# splice them into release notes, GITHUB_OUTPUT, or whatever else.
+# Compress the prebuilt Image, rootfs.cpio, and test-tools.img in cwd, write a
+# sha1 manifest, hash the input files that define the prebuilt's contents, and
+# print all four sums in KEY=VAL form on stdout so callers can splice them into
+# release notes, GITHUB_OUTPUT, or whatever else.
 #
 # Inputs (in cwd):
 #   Image
 #   rootfs.cpio
+#   test-tools.img
 #   plus the source inputs listed in INPUTS below (config + scripts +
-#   target/init that define the buildroot/kernel content)
+#   target files that define the prebuilt content)
 #
 # Outputs (in cwd):
 #   Image.bz2
 #   rootfs.cpio.bz2
-#   prebuilt.sha1   -- three-line manifest in sha1sum format. The
-#                      first two lines verify the published archives;
-#                      the third uses the virtual name 'inputs' to
+#   test-tools.img.bz2
+#   prebuilt.sha1   -- four-line manifest in sha1sum format. The
+#                      first three lines verify the published archives;
+#                      the fourth uses the virtual name 'inputs' to
 #                      publish the SHA-1 of the concatenated input
 #                      files so drift-detection consumers can read it
 #                      directly from the release.
@@ -24,6 +26,7 @@
 # Stdout (machine-readable, one assignment per line):
 #   kernel_sha1=<sha1 of Image.bz2>
 #   initrd_sha1=<sha1 of rootfs.cpio.bz2>
+#   test_tools_sha1=<sha1 of test-tools.img.bz2>
 #   inputs_sha1=<sha1 of the concatenated input files>
 
 set -euo pipefail
@@ -45,35 +48,41 @@ INPUTS=(
     configs/linux.config
     configs/busybox.config
     configs/buildroot.config
+    configs/x11.config
+    configs/riscv-cross-file
     scripts/build-image.sh
     scripts/rootfs_ext4.sh
     target/init
+    target/local-env.sh
 )
 
-for f in Image rootfs.cpio "${INPUTS[@]}"; do
+for f in Image rootfs.cpio test-tools.img "${INPUTS[@]}"; do
     if [ ! -f "$f" ]; then
-        echo "[!] Missing $f -- run scripts/build-image.sh --all first" >&2
+        echo "[!] Missing $f -- run scripts/build-image.sh --all --x11 --directfb2-test first" >&2
         exit 1
     fi
 done
 
 bzip2 -k -f Image
 bzip2 -k -f rootfs.cpio
+bzip2 -k -f test-tools.img
 
 KERNEL_SHA1=$("${SHA1[@]}" Image.bz2       | awk '{print $1}')
 INITRD_SHA1=$("${SHA1[@]}" rootfs.cpio.bz2 | awk '{print $1}')
+TEST_TOOLS_SHA1=$("${SHA1[@]}" test-tools.img.bz2 | awk '{print $1}')
 # Concatenate inputs in deterministic order and hash the stream. Matches
 # the make-time computation in mk/external.mk so they compare directly.
 INPUTS_SHA1=$(cat "${INPUTS[@]}" | "${SHA1[@]}" | awk '{print $1}')
 
-# Write the manifest. The first two lines match 'sha1sum -c' format for
-# the real archives; the third line uses the virtual filename 'inputs'
+# Write the manifest. The first three lines match 'sha1sum -c' format for
+# the real archives; the fourth line uses the virtual filename 'inputs'
 # to publish the input-fingerprint hash so consumers (mk/external.mk's
 # drift warning, .github/workflows/main.yml's PR drift detection) can
 # read it from the release without parsing the release-body markdown.
 {
     echo "$KERNEL_SHA1  Image.bz2"
     echo "$INITRD_SHA1  rootfs.cpio.bz2"
+    echo "$TEST_TOOLS_SHA1  test-tools.img.bz2"
     echo "$INPUTS_SHA1  inputs"
 } > prebuilt.sha1
 
@@ -86,4 +95,5 @@ INPUTS_SHA1=$(cat "${INPUTS[@]}" | "${SHA1[@]}" | awk '{print $1}')
 
 echo "kernel_sha1=$KERNEL_SHA1"
 echo "initrd_sha1=$INITRD_SHA1"
+echo "test_tools_sha1=$TEST_TOOLS_SHA1"
 echo "inputs_sha1=$INPUTS_SHA1"
diff --git a/.ci/test-gpu.sh b/.ci/test-gpu.sh
new file mode 100755
index 00000000..cda6762d
--- /dev/null
+++ b/.ci/test-gpu.sh
@@ -0,0 +1,185 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+. "${SCRIPT_DIR}/common.sh"
+
+# Override timeout and sleep duration for macOS - emulation is significantly slower
+case "${OS_TYPE}" in
+    Darwin)
+        TIMEOUT=10800
+        DFB_SLEEP=180
+        ;;
+    *)
+        DFB_SLEEP=5
+        ;;
+esac
+export DFB_SLEEP
+SEMU_DIRECTFB2_TEST="${SEMU_DIRECTFB2_TEST:-1}"
+export SEMU_DIRECTFB2_TEST
+MAKE_CHECK_DISKIMG_ARG=""
+
+cleanup
+trap cleanup EXIT
+
+# Feature toggles are passed through environment variables, which do not
+# participate in normal dependency tracking by 'make'. Force a rebuild here so
+# one-feature-at-a-time test runs never reuse a stale 'semu' binary or DTB.
+make -B semu minimal.dtb
+
+if [ ! -f Image ] || [ ! -f rootfs.cpio ]; then
+    make Image rootfs.cpio
+fi
+if [[ "${SEMU_DIRECTFB2_TEST}" == "1" ]]; then
+    # The default ext4.img is intentionally small. DirectFB2 lives in the
+    # optional test tools disk, which is supplied by PR-built artifacts or downloaded
+    # like the other prebuilt artifacts.
+    if [ ! -f test-tools.img ]; then
+        make test-tools.img
+    fi
+    MAKE_CHECK_DISKIMG_ARG="DISKIMG_FILE=test-tools.img"
+elif [ ! -f ext4.img ]; then
+    make ext4.img
+fi
+export MAKE_CHECK_DISKIMG_ARG
+
+# NOTE: We want to capture the 'expect' exit code and map
+# it to our 'MESSAGES' array for meaningful error output.
+# Temporarily disable 'errexit' for the 'expect' call.
+set +e
+expect <<'DONE'
+set timeout $env(TIMEOUT)
+if {$env(MAKE_CHECK_DISKIMG_ARG) eq ""} {
+  spawn make check
+} else {
+  spawn make check $env(MAKE_CHECK_DISKIMG_ARG)
+}
+
+# Boot and login
+expect "buildroot login:" { send "root\r" } timeout { exit 1 }
+expect "# "              { send "uname -a\r" } timeout { exit 2 }
+expect "riscv32 GNU/Linux" {}
+
+# ---------------- virtio-gpu basic checks ----------------
+expect "# " { send "ls -la /dev/dri/ 2>/dev/null || true\r" }
+# Emit a shell-expanded status marker so 'expect' cannot match the echoed command.
+expect "# " { send "if test -c /dev/dri/card0; then status=OK; else status=MISSING; fi; printf \"__VGPU_DRM_%s__\\n\" \"\$status\"\r" } timeout { exit 3 }
+expect {
+  -exact "__VGPU_DRM_OK__" {}
+  -exact "__VGPU_DRM_MISSING__" { exit 3 }
+  timeout { exit 3 }
+}
+
+# virtio transport may be 'virtio-mmio', binding check should look at the
+# 'virtio_gpu' driver directory.
+expect "# " {
+  send "sh -lc 'if ls /sys/bus/virtio/drivers/virtio_gpu/virtio* >/dev/null 2>&1; then status=OK; else status=BAD; fi; printf \"__VGPU_BIND_%s__\\n\" \"\u0024status\"'\r"
+} timeout { exit 3 }
+expect {
+  -exact "__VGPU_BIND_OK__" {}
+  -exact "__VGPU_BIND_BAD__" {
+    # Dump diagnostics ('\u0024' is a literal '$'), then drain to the marker so they are logged.
+    send "ls -l /sys/bus/virtio/drivers/virtio_gpu/ 2>/dev/null; sh -lc 'for d in /sys/bus/virtio/devices/virtio*; do echo \u0024d; ls -l \u0024d/driver 2>/dev/null || true; done'; printf \"__VGPU_DIAG_%s__\\n\" END\r"
+    expect -timeout 60 -exact "__VGPU_DIAG_END__"
+    exit 3
+  }
+  timeout { exit 3 }
+}
+
+# Useful logs (non-fatal); output is only logged once a later 'expect' reads it.
+expect "# " { send "dmesg | grep -Ei 'virtio.*gpu|drm.*virtio|scanout|number of scanouts' | tail -n 80 || true\r" }
+
+if {$env(SEMU_DIRECTFB2_TEST) ne "1"} {
+  expect "# "; exit 0
+}
+
+# ---------------- DirectFB2 ----------------
+# Strategy:
+# 0) Stop X11 if running (it holds the DRM device)
+# 1) Check 'local-env.sh' exists at '/root/local-env.sh'
+# 2) Source 'local-env.sh' to set 'PATH'/'LD_LIBRARY_PATH'
+# 3) Verify 'df_drivertest' is in 'PATH'
+# 4) Run 'df_drivertest' and check for DirectFB init messages
+#
+# NOTE: 'df_drivertest' may segfault when killed due to a race condition in
+# DirectFB2's fusion module ('libfusion') during signal handling. When 'SIGTERM'
+# is sent, the signal handler starts cleanup while the "Fusion Dispatch" thread
+# may still be accessing shared state, leading to a use-after-free crash. The
+# test passes if DirectFB init messages appear, even if the program crashes
+# afterward during cleanup.
+
+# Step 0: Stop X11 to release DRM device (it holds '/dev/dri/card0')
+# Use 'pidof', else fall back to 'ps'/'grep'/'awk' (awk's field ref is escaped for 'sh').
+expect "# " {
+  send "sh -lc '\
+    if command -v pidof >/dev/null 2>&1; then \
+      pidof Xorg >/dev/null 2>&1 && kill \u0024(pidof Xorg) 2>/dev/null || true; \
+    else \
+      ps | grep Xorg | grep -v grep | awk \"{print \\\u00241}\" | xargs kill 2>/dev/null || true; \
+    fi; \
+    sleep 1; printf \"__X11_%s__\\n\" STOPPED'\r"
+}
+expect {
+  -exact "__X11_STOPPED__" {}
+  timeout { exit 4 }
+}
+
+# Step 1: Check 'local-env.sh' exists.
+expect "# " { send "if test -f /root/local-env.sh; then status=OK; else status=MISSING; fi; printf \"__LOCALENV_%s__\\n\" \"\$status\"\r" }
+expect {
+  -exact "__LOCALENV_OK__" {}
+  -exact "__LOCALENV_MISSING__" { exit 4 }
+  timeout { exit 4 }
+}
+
+# Step 2: Source 'local-env.sh'.
+expect "# " { send "if . /root/local-env.sh >/dev/null 2>&1; then status=DONE; else status=FAIL; fi; printf \"__SRC_%s__\\n\" \"\$status\"\r" }
+expect {
+  -exact "__SRC_DONE__" {}
+  -exact "__SRC_FAIL__" { exit 4 }
+  timeout { exit 4 }
+}
+
+# Step 3: Verify 'df_drivertest' is available.
+expect "# " { send "if command -v df_drivertest >/dev/null 2>&1; then status=OK; else status=MISS; fi; printf \"__APP_%s__\\n\" \"\$status\"\r" }
+expect {
+  -exact "__APP_OK__" {}
+  -exact "__APP_MISS__" { exit 4 }
+  timeout { exit 4 }
+}
+
+# Step 4: Run 'df_drivertest' and check output (run in background, kill after
+# delay).
+expect "# " { send "df_drivertest >/tmp/dfb.log 2>&1 & sleep $env(DFB_SLEEP); kill \u0024! 2>/dev/null; head -30 /tmp/dfb.log\r" }
+# Check for 'DRMKMS' init message.
+expect "# " { send "if grep -qi 'DRMKMS/System' /tmp/dfb.log; then status=OK; else status=FAIL; fi; printf \"__DFB_%s__\\n\" \"\$status\"\r" }
+expect {
+  -exact "__DFB_OK__" {}
+  -exact "__DFB_FAIL__" { exit 4 }
+  timeout { exit 4 }
+}
+DONE
+
+ret="$?"
+set -e  # Re-enable 'errexit' after capturing 'expect' return code.
+
+if [[ "${ret}" -eq 0 ]]; then
+  if [[ "${SEMU_DIRECTFB2_TEST}" == "1" ]]; then
+    print_success "PASS: headless virtio-gpu + DirectFB2 checks"
+  else
+    print_success "PASS: headless virtio-gpu checks"
+  fi
+  exit 0
+fi
+
+MESSAGES=(
+  "unused"
+  "FAIL: boot/login prompt not found"
+  "FAIL: shell prompt not found"
+  "FAIL: virtio-gpu basic checks failed (/dev/dri/card0 or virtio_gpu binding)"
+  "FAIL: DirectFB2 check failed (local-env.sh/df_drivertest missing or no DRMKMS init messages)"
+)
+
+print_error "${MESSAGES[${ret}]:-FAIL: unknown error (exit code ${ret})}"
+exit "${ret}"
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 2681524e..a651fde5 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -10,8 +10,8 @@ permissions:
   contents: read
 
 jobs:
-  # PR-only: rebuild Image/rootfs.cpio from source when local kernel
-  # and rootfs inputs have drifted from the prebuilt's recorded inputs.
+  # PR-only: rebuild or restore Image/rootfs.cpio/test-tools.img when local
+  # guest-artifact inputs have drifted from the prebuilt's recorded inputs.
   # Without this, make check on a PR that touches configs, scripts, or
   # target would silently exercise the stale prebuilt instead of the
   # contributor's actual change.
@@ -60,30 +60,45 @@ jobs:
           configs/linux.config \
           configs/busybox.config \
           configs/buildroot.config \
+          configs/x11.config \
+          configs/riscv-cross-file \
           scripts/build-image.sh \
           scripts/rootfs_ext4.sh \
           target/init \
+          target/local-env.sh \
           | sha1sum | awk '{print $1}')
+        echo "live_hash=$live" >> "$GITHUB_OUTPUT"
         if [ "$live" = "$expected" ]; then
           echo "PR inputs match the prebuilt ($live); skipping rebuild"
           echo "should_build=false" >> "$GITHUB_OUTPUT"
         else
-          echo "PR inputs drifted ($live != $expected); will rebuild from source"
+          echo "PR inputs drifted ($live != $expected); will use PR artifacts"
           echo "should_build=true" >> "$GITHUB_OUTPUT"
         fi
-    - name: install build dependencies
+    - name: cache PR-built artifacts
       if: steps.detect.outputs.should_build == 'true'
+      id: pr_artifact_cache
+      uses: actions/cache@v4
+      with:
+        path: |
+          Image
+          rootfs.cpio
+          test-tools.img
+        key: pr-prebuilt-${{ runner.os }}-${{ steps.detect.outputs.live_hash }}
+    - name: install build dependencies
+      if: steps.detect.outputs.should_build == 'true' && steps.pr_artifact_cache.outputs.cache-hit != 'true'
       run: |
         sudo apt-get update
         sudo DEBIAN_FRONTEND=noninteractive apt-get install -y \
           build-essential \
           bc bison flex cpio fakeroot e2fsprogs \
-          git python3 libssl-dev libelf-dev wget
+          git python3 libssl-dev libelf-dev wget \
+          meson ninja-build pkg-config
       timeout-minutes: 5
-    - name: build kernel and rootfs from source
-      if: steps.detect.outputs.should_build == 'true'
-      run: ./scripts/build-image.sh --all
-      timeout-minutes: 90
+    - name: build kernel, rootfs, and test tools disk from source
+      if: steps.detect.outputs.should_build == 'true' && steps.pr_artifact_cache.outputs.cache-hit != 'true'
+      run: ./scripts/build-image.sh --all --x11 --directfb2-test
+      timeout-minutes: 180
     - name: upload PR-built artifacts
       if: steps.detect.outputs.should_build == 'true'
       uses: actions/upload-artifact@v4
@@ -92,6 +107,7 @@ jobs:
         path: |
           Image
           rootfs.cpio
+          test-tools.img
         retention-days: 1
         if-no-files-found: error
 
@@ -110,16 +126,16 @@ jobs:
       uses: actions/checkout@v4
       with:
         submodules: recursive
-    # Drift PR: pull the freshly-built Image/rootfs.cpio from
+    # Drift PR: pull the PR-built Image/rootfs.cpio/test-tools.img from
     # pr-prebuilt-build via workflow artifact (reliable, not cache).
-    - name: download PR-built kernel/rootfs
+    - name: download PR-built external artifacts
       if: needs.pr-prebuilt-build.outputs.should_build == 'true'
       uses: actions/download-artifact@v4
       with:
         name: prebuilt-pr
     # Non-drift PR or master push: cache the release-downloaded artifacts
     # across runs. Include mk/external.mk so checksum or input-pin bumps
-    # after a republish invalidate the old Image/rootfs.cpio pair.
+    # after a republish invalidate the old external artifacts.
     - name: cache external downloads
       if: needs.pr-prebuilt-build.outputs.should_build != 'true'
       uses: actions/cache@v4
@@ -127,7 +143,8 @@ jobs:
         path: |
           Image
           rootfs.cpio
-        key: external-${{ hashFiles('mk/external.mk', 'configs/linux.config', 'configs/busybox.config', 'configs/buildroot.config', 'scripts/build-image.sh', 'scripts/rootfs_ext4.sh', 'target/**') }}
+          test-tools.img
+        key: external-${{ hashFiles('mk/external.mk', 'configs/linux.config', 'configs/busybox.config', 'configs/buildroot.config', 'configs/x11.config', 'configs/riscv-cross-file', 'scripts/build-image.sh', 'scripts/rootfs_ext4.sh', 'target/**') }}
     - name: cache submodule builds
       uses: actions/cache@v4
       with:
@@ -171,6 +188,10 @@ jobs:
       run: .ci/test-vinput.sh
       shell: bash
       timeout-minutes: 5
+    - name: virtio-gpu test
+      run: .ci/test-gpu.sh
+      shell: bash
+      timeout-minutes: 10
 
   # Guard the legacy initramfs path so it does not bitrot now that the
   # default boot mode is /dev/vda. Single slim job: fresh build with
@@ -183,7 +204,7 @@ jobs:
       uses: actions/checkout@v4
       with:
         submodules: recursive
-    - name: download PR-built kernel/rootfs
+    - name: download PR-built external artifacts
       if: needs.pr-prebuilt-build.outputs.should_build == 'true'
       uses: actions/download-artifact@v4
       with:
@@ -195,7 +216,8 @@ jobs:
         path: |
           Image
           rootfs.cpio
-        key: external-${{ hashFiles('mk/external.mk', 'configs/linux.config', 'configs/busybox.config', 'configs/buildroot.config', 'scripts/build-image.sh', 'scripts/rootfs_ext4.sh', 'target/**') }}
+          test-tools.img
+        key: external-${{ hashFiles('mk/external.mk', 'configs/linux.config', 'configs/busybox.config', 'configs/buildroot.config', 'configs/x11.config', 'configs/riscv-cross-file', 'scripts/build-image.sh', 'scripts/rootfs_ext4.sh', 'target/**') }}
     - name: cache submodule builds
       uses: actions/cache@v4
       with:
@@ -234,7 +256,7 @@ jobs:
     # pr-prebuilt-build's artifact (built on a linux runner) -- a cache
     # miss here would silently fall back to downloading the stale
     # release, defeating the whole point of the drift-detection logic.
-    - name: download PR-built kernel/rootfs
+    - name: download PR-built external artifacts
       if: needs.pr-prebuilt-build.outputs.should_build == 'true'
       uses: actions/download-artifact@v4
       with:
@@ -246,7 +268,8 @@ jobs:
         path: |
           Image
           rootfs.cpio
-        key: external-${{ hashFiles('mk/external.mk', 'configs/linux.config', 'configs/busybox.config', 'configs/buildroot.config', 'scripts/build-image.sh', 'scripts/rootfs_ext4.sh', 'target/**') }}
+          test-tools.img
+        key: external-${{ hashFiles('mk/external.mk', 'configs/linux.config', 'configs/busybox.config', 'configs/buildroot.config', 'configs/x11.config', 'configs/riscv-cross-file', 'scripts/build-image.sh', 'scripts/rootfs_ext4.sh', 'target/**') }}
     - name: cache submodule builds
       uses: actions/cache@v4
       with:
@@ -288,6 +311,10 @@ jobs:
       run: .ci/test-vinput.sh
       shell: bash
       timeout-minutes: 20
+    - name: virtio-gpu test
+      run: .ci/test-gpu.sh
+      shell: bash
+      timeout-minutes: 20
 
   coding_style:
     runs-on: ubuntu-24.04
diff --git a/.github/workflows/prebuilt.yml b/.github/workflows/prebuilt.yml
index e22e600e..5b52ac44 100644
--- a/.github/workflows/prebuilt.yml
+++ b/.github/workflows/prebuilt.yml
@@ -1,9 +1,9 @@
 name: Publish prebuilt images
 
-# Builds the Linux kernel and Buildroot rootfs that the rest of CI and
-# make on a fresh checkout consumes, then publishes them as assets on a
-# fixed-tag GitHub prerelease so the download URL stays stable across
-# rebuilds, keeping large binary artifacts out of the source tree.
+# Builds the Linux kernel, Buildroot rootfs, and optional test tools disk that the
+# rest of CI and make on a fresh checkout consume, then publishes them as
+# assets on a fixed-tag GitHub prerelease so the download URL stays stable
+# across rebuilds, keeping large binary artifacts out of the source tree.
 #
 # Triggers automatically on master pushes that touch any input listed
 # in the paths filter below, and can be invoked manually via
@@ -23,6 +23,8 @@ on:
       - 'configs/linux.config'
       - 'configs/busybox.config'
       - 'configs/buildroot.config'
+      - 'configs/x11.config'
+      - 'configs/riscv-cross-file'
       - 'scripts/build-image.sh'
       - 'scripts/rootfs_ext4.sh'
       - 'target/**'
@@ -59,10 +61,13 @@ jobs:
             python3 \
             libssl-dev \
             libelf-dev \
-            wget
+            wget \
+            meson \
+            ninja-build \
+            pkg-config
 
-      - name: Build Buildroot and Linux
-        run: ./scripts/build-image.sh --all
+      - name: Build Buildroot, Linux, and test tools disk
+        run: ./scripts/build-image.sh --all --x11 --directfb2-test
 
       - name: Compress and checksum artifacts
         id: checksum
@@ -71,7 +76,7 @@ jobs:
         run: |
           set -euo pipefail
           .ci/publish-prebuilt.sh >> "$GITHUB_OUTPUT"
-          ls -la Image.bz2 rootfs.cpio.bz2 prebuilt.sha1
+          ls -la Image.bz2 rootfs.cpio.bz2 test-tools.img.bz2 prebuilt.sha1
 
       - name: Update prebuilt prerelease
         uses: softprops/action-gh-release@v2
@@ -85,11 +90,13 @@ jobs:
           files: |
             Image.bz2
             rootfs.cpio.bz2
+            test-tools.img.bz2
             prebuilt.sha1
           body: |
-            Rolling prerelease of the Linux kernel and Buildroot rootfs
-            consumed by `mk/external.mk`. Re-published whenever any
-            input that defines the kernel/rootfs content changes.
+            Rolling prerelease of the Linux kernel, Buildroot rootfs,
+            and optional test tools disk consumed by `mk/external.mk`.
+            Re-published whenever any input that defines the prebuilt
+            content changes.
 
             All checksums (archive hashes plus the source-input
             fingerprint used for drift detection) are published in the
@@ -101,5 +108,6 @@ jobs:
             ```
             ${{ steps.checksum.outputs.kernel_sha1 }}  Image.bz2
             ${{ steps.checksum.outputs.initrd_sha1 }}  rootfs.cpio.bz2
-            ${{ steps.checksum.outputs.inputs_sha1 }}  inputs (configs + scripts + target/init, concatenated)
+            ${{ steps.checksum.outputs.test_tools_sha1 }}  test-tools.img.bz2
+            ${{ steps.checksum.outputs.inputs_sha1 }}  inputs (configs + scripts + target files, concatenated)
             ```
diff --git a/.gitignore b/.gitignore
index f9f8f9d3..f387cb41 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,14 +7,23 @@ semu
 *.dtb
 Image
 ext4.img
+test-tools.img
 rootfs.cpio
 prebuilt.sha1
 
 # intermediate
 riscv-harts.dtsi
 .smp_stamp
+.dtb-config.stamp
+.build-config.stamp
 
 # Build directories
 buildroot/
 linux/
 rootfs/
+directfb/
+extra_packages/
+
+# DirectFB build
+DirectFB2/
+DirectFB-examples/
diff --git a/Makefile b/Makefile
index 10fad261..cd3347a8 100644
--- a/Makefile
+++ b/Makefile
@@ -191,7 +191,8 @@ ifeq ($(ENABLE_SDL),1)
     CFLAGS += $(shell sdl2-config --cflags)
     LDFLAGS += $(shell sdl2-config --libs)
 else
-    # Disable virtio-input if SDL is not set
+    # Disable window-backed virtio devices if SDL is not set.
+    override ENABLE_VIRTIOGPU := 0
     override ENABLE_VIRTIOINPUT := 0
 endif
 
@@ -203,6 +204,18 @@ $(call set-feature, VIRTIOINPUT)
 ifeq ($(call has, VIRTIOINPUT), 1)
     OBJS_EXTRA += virtio-input-event.o
     OBJS_EXTRA += virtio-input.o
+endif
+
+# virtio-gpu
+ENABLE_VIRTIOGPU ?= 1
+$(call set-feature, VIRTIOGPU)
+ifeq ($(call has, VIRTIOGPU), 1)
+    OBJS_EXTRA += virtio-gpu.o
+    OBJS_EXTRA += virtio-gpu-sw.o
+    OBJS_EXTRA += vgpu-display.o
+endif
+
+ifneq ($(filter 1,$(call has, VIRTIOGPU) $(call has, VIRTIOINPUT)),)
     OBJS_EXTRA += window-sw.o
 endif
 
@@ -311,7 +324,7 @@ minimal.dtb: minimal.dts riscv-harts.dtsi .dtb-config.stamp
 .PHONY: FORCE
 FORCE:
 
-# Rules for downloading prebuilt Linux kernel image
+# Rules for downloading prebuilt guest artifacts
 include mk/external.mk
 
 ifeq ($(call has, EXTERNAL_ROOT), 1)
@@ -347,8 +360,9 @@ check: $(BIN) minimal.dtb $(KERNEL_DATA) $(INITRD_DEP) $(DISKIMG_FILE) $(SHARED_
 	@$(call notice, Ready to launch Linux kernel. Please be patient.)
 	$(Q)./$(BIN) -k $(KERNEL_DATA) -c $(SMP) -b minimal.dtb -H $(INITRD_OPT) $(if $(NETDEV),-n $(NETDEV)) $(OPTS)
 
+BUILD_IMAGE_ARGS ?= --all
 build-image:
-	scripts/build-image.sh
+	scripts/build-image.sh $(BUILD_IMAGE_ARGS)
 
 clean:
 	$(Q)$(RM) $(BIN) $(OBJS) $(deps)
@@ -363,6 +377,6 @@ distclean: clean
 	$(Q)$(RM) .dtb-config.stamp
 	$(Q)$(RM) .build-config.stamp
 	$(Q)$(RM) Image rootfs.cpio prebuilt.sha1
-	$(Q)$(RM) ext4.img
+	$(Q)$(RM) ext4.img test-tools.img
 
 -include $(deps)
diff --git a/README.md b/README.md
index fd860103..e0ede423 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ A minimalist RISC-V system emulator capable of running Linux the kernel and corr
 - UART: 8250/16550
 - PLIC (platform-level interrupt controller): 32 interrupts, no priority
 - Standard SBI, with the timer extension
-- Four types of I/O support using VirtIO standard:
+- I/O support using VirtIO standard:
     - virtio-blk acquires disk image from the host.
     - virtio-net is mapped as TAP interface.
     - virtio-snd uses [PortAudio](https://github.com/PortAudio/portaudio) for sound playback on the host with one limitations:
@@ -19,7 +19,11 @@ A minimalist RISC-V system emulator capable of running Linux the kernel and corr
             - For instance, the following buffer/period size settings on `aplay` has been tested
               with broken and stutter effects yet complete with no any errors: `aplay --buffer-size=32768 --period-size=4096 /usr/share/sounds/alsa/Front_Center.wav`.
     - virtio-input exposes SDL-backed keyboard and mouse devices to the guest.
-      - You can exit the SDL window by pressing Ctrl+A+G
+    - virtio-gpu exposes a minimal 2D DRM/KMS device to the guest. Linux can
+      bind the `virtio_gpu` driver and create `/dev/dri/card0`.
+      - Only 2D scanout is currently supported; 3D, virgl, and blob resources
+        are not implemented yet.
+    - Press Ctrl+Alt+G to release the mouse cursor from the SDL window.
 
 ## Prerequisites
 
@@ -54,11 +58,21 @@ Build the emulator:
 $ make
 ```
 
-Download prebuilt Linux kernel image:
+Download the prebuilt guest artifacts and run the default check:
 ```shell
 $ make check
 ```
 
+With the default external-root build, `make check` uses `Image`, `minimal.dtb`,
+and `ext4.img`, and boots `semu` headlessly with an equivalent command line:
+
+```shell
+$ ./semu -k Image -c 1 -b minimal.dtb -H -d ext4.img
+```
+
+If `ENABLE_EXTERNAL_ROOT=0` is used, `make check` switches to the legacy
+initramfs path and passes `-i rootfs.cpio` instead of `-d ext4.img`.
+
 Please be patient while `semu` is running.
 
 Reference output:
@@ -76,6 +90,32 @@ Enter `root` to access shell.
 
 You can exit the emulator using: \<Ctrl-a x\>. (press Ctrl+A, leave it, afterwards press X)
 
+To test virtio-gpu with a visible SDL window, run `semu` manually without `-H`.
+Make sure `sdl2-config` is in `PATH`, then build the emulator, DTB, kernel, and
+test tools disk. Press `Ctrl+Alt+G` to release the mouse cursor from the SDL
+window:
+
+```shell
+$ sdl2-config --version
+$ make semu minimal.dtb Image test-tools.img
+$ ./semu -k Image -c 1 -b minimal.dtb -d test-tools.img
+```
+
+Log in as `root`, source the test-tools image environment, and run one of
+the DirectFB2 examples:
+
+```
+# . /root/local-env.sh
+# df_drivertest
+```
+
+The installed DirectFB2 examples come from the upstream DirectFB-examples
+project and can be listed in the guest with:
+
+```
+# ls /usr/local/bin/df_*
+```
+
 ## Usage
 
 ```shell
@@ -104,6 +144,10 @@ unpacking a large cpio, and matches how real systems deploy. The
 `ext4.img` is built from `rootfs.cpio` via `scripts/rootfs_ext4.sh`,
 which requires `fakeroot` and `mkfs.ext4`.
 
+The rolling `prebuilt` release provides an optional `test-tools.img.bz2` for
+larger test/user tools that should not inflate `rootfs.cpio` or the default
+`ext4.img`. Use `make test-tools.img` to download it.
+
 If `fakeroot` is missing, the build falls back to the legacy initramfs
 path (`-i rootfs.cpio`) automatically and prints a one-line warning. To
 force the legacy path explicitly:
@@ -164,21 +208,23 @@ To build everything, simply run:
 $ make build-image
 ```
 
-This command invokes the underlying script: `scripts/build-image.sh`, which also offers more flexible usage options.
+This command invokes the underlying script: `scripts/build-image.sh --all`, which also offers more flexible usage options.
 
 ### Script Usage
 
 ```
+./scripts/build-image.sh [--buildroot] [--linux] [--x11] [--directfb2-test] [--all] [--no-ext4] [--clean-build] [--help]
+./scripts/build-image.sh [--buildroot] [--linux] [--directfb2-test] [--all] [--no-ext4] [--clean-build] [--help]
 
 Options:
   --buildroot         Build Buildroot userland (produces rootfs.cpio and,
                       unless --no-ext4 is given, ext4.img for vda boot)
+  --directfb2-test    Build test-tools.img with the DirectFB2 test payload
   --linux             Build the Linux kernel
   --all               Build both Buildroot and Linux
   --no-ext4           Skip ext4.img generation; produce only rootfs.cpio
                       (matches the legacy ENABLE_EXTERNAL_ROOT=0 path)
-  --clean-build       Remove buildroot/ and/or linux/ before building
+  --clean-build       Remove buildroot/ and/or linux/ before building;
+                      with --directfb2-test, also remove DirectFB2 build outputs
   --help              Show this message
 ```
 
@@ -202,6 +248,28 @@ Build Buildroot for the legacy initramfs-only path (no ext4):
 $ scripts/build-image.sh --buildroot --no-ext4
 ```
 
+`test-tools.img` is the shared optional disk for test payloads that should
+not live in the default `rootfs.cpio` or `ext4.img`. This keeps the default
+guest image small while still allowing larger tools to be collected in one
+place.
+
+Build Buildroot and the test tools image with the DirectFB2 test payload. Add
+`--x11` when the test tools image should use an X11-enabled rootfs:
+
+```
+$ scripts/build-image.sh --x11 --directfb2-test
+```
+
+To add a new test tool, extend the `test-tools.img` build path in
+`scripts/build-image.sh` so the tool is staged into `extra_packages`, then
+update `target/local-env.sh` if the tool needs an additional binary or library
+search path.
+
+The build script copies `target/local-env.sh` to `/root/local-env.sh` in the
+test tools image. After booting the VM, source it once to pick up paths such
+as `/usr/local/bin` and `/usr/local/lib`, instead of running overlaid tools
+through full paths like `/usr/local/bin/df_*`.
+
 Force a clean build:
 
 ```
diff --git a/configs/buildroot.config b/configs/buildroot.config
index 315fccf5..6c9b179d 100644
--- a/configs/buildroot.config
+++ b/configs/buildroot.config
@@ -39,6 +39,8 @@ BR2_FORTIFY_SOURCE_1=y
 BR2_PACKAGE_ALSA_UTILS=y
 BR2_PACKAGE_ALSA_UTILS_APLAY=y
 BR2_PACKAGE_ALSA_UTILS_SPEAKER_TEST=y
+BR2_PACKAGE_LIBDRM=y
+# BR2_PACKAGE_LIBDRM_INSTALL_TESTS is not set
 # BR2_PACKAGE_URANDOM_SCRIPTS is not set
 BR2_TARGET_ROOTFS_CPIO=y
 BR2_TARGET_ROOTFS_CPIO_FULL=y
diff --git a/configs/linux.config b/configs/linux.config
index 3adeccda..0a32ab3c 100644
--- a/configs/linux.config
+++ b/configs/linux.config
@@ -911,13 +911,19 @@ CONFIG_MFD_SYSCON=y
 #
 # Graphics support
 #
-# CONFIG_DRM is not set
+CONFIG_DRM=y
+CONFIG_DRM_KMS_HELPER=y
 # CONFIG_DRM_DEBUG_MODESET_LOCK is not set
 
 #
 # ARM devices
 #
 # end of ARM devices
+CONFIG_DRM_VIRTIO_GPU=y
+CONFIG_DRM_VIRTIO_GPU_KMS=y
+CONFIG_DRM_PANEL=y
+CONFIG_DRM_BRIDGE=y
+CONFIG_DRM_PANEL_BRIDGE=y
 
 #
 # Frame buffer Devices
@@ -1056,6 +1062,7 @@ CONFIG_VIRTIO_MENU=y
 CONFIG_VIRTIO_INPUT=y
 CONFIG_VIRTIO_MMIO=y
 # CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES is not set
+CONFIG_VIRTIO_DMA_SHARED_BUFFER=y
 # CONFIG_VDPA is not set
 # CONFIG_VHOST_MENU is not set
 
diff --git a/configs/riscv-cross-file b/configs/riscv-cross-file
new file mode 100644
index 00000000..668f91f5
--- /dev/null
+++ b/configs/riscv-cross-file
@@ -0,0 +1,18 @@
+[binaries]
+    c = 'riscv32-buildroot-linux-gnu-gcc'
+    strip = 'riscv32-buildroot-linux-gnu-strip'
+    pkgconfig = 'pkg-config'
+    python = '/usr/bin/python3'
+
+[properties]
+    pkg_config_libdir = ['@GLOBAL_SOURCE_ROOT@' / '../buildroot/output/host/riscv32-buildroot-linux-gnu/sysroot/usr/local/lib/pkgconfig',
+                         '@GLOBAL_SOURCE_ROOT@' / '../buildroot/output/host/riscv32-buildroot-linux-gnu/sysroot/usr/share/pkgconfig/',
+                         '@GLOBAL_SOURCE_ROOT@' / '../buildroot/output/host/riscv32-buildroot-linux-gnu/sysroot/usr/lib/pkgconfig/'
+                        ]
+    sys_root = '@GLOBAL_SOURCE_ROOT@' / '../buildroot/output/host/riscv32-buildroot-linux-gnu/sysroot'
+
+[host_machine]
+    system = 'linux'
+    cpu_family = 'riscv32'
+    cpu = 'riscv32-ima'
+    endian = 'little'
diff --git a/configs/x11.config b/configs/x11.config
new file mode 100644
index 00000000..3cab4aac
--- /dev/null
+++ b/configs/x11.config
@@ -0,0 +1,35 @@
+BR2_TOOLCHAIN_BUILDROOT_CXX=y
+BR2_INSTALL_LIBSTDCPP=y
+BR2_PACKAGE_GLMARK2=y
+BR2_PACKAGE_KMSCUBE=y
+BR2_PACKAGE_MESA3D_DEMOS=y
+BR2_PACKAGE_MESA3D=y
+BR2_PACKAGE_MESA3D_GALLIUM_DRIVER=y
+BR2_PACKAGE_MESA3D_DRIVER=y
+BR2_PACKAGE_MESA3D_NEEDS_X11=y
+BR2_PACKAGE_MESA3D_GALLIUM_DRIVER_SWRAST=y
+BR2_PACKAGE_MESA3D_GALLIUM_DRIVER_VIRGL=n
+BR2_PACKAGE_MESA3D_GBM=y
+BR2_PACKAGE_MESA3D_OPENGL_GLX=y
+BR2_PACKAGE_MESA3D_OPENGL_EGL=y
+BR2_PACKAGE_MESA3D_OPENGL_ES=y
+BR2_PACKAGE_PROVIDES_LIBGBM="mesa3d"
+BR2_PACKAGE_XORG7=y
+BR2_PACKAGE_XSERVER_XORG_SERVER=y
+BR2_PACKAGE_XSERVER_XORG_SERVER_MODULAR=y
+BR2_PACKAGE_XLIB_LIBX11=y
+BR2_PACKAGE_XAPP_TWM=y
+BR2_PACKAGE_XAPP_XAUTH=y
+BR2_PACKAGE_XAPP_XCLOCK=y
+BR2_PACKAGE_XAPP_XINIT=y
+BR2_PACKAGE_XDRIVER_XF86_INPUT_LIBINPUT=y
+BR2_PACKAGE_XTERM=y
+BR2_PACKAGE_EUDEV=y
+BR2_ROOTFS_DEVICE_CREATION_DYNAMIC_EUDEV=y
+BR2_PACKAGE_PROVIDES_UDEV="eudev"
+BR2_PACKAGE_HAS_UDEV=y
+BR2_PACKAGE_LIBGLEW=y
+BR2_PACKAGE_HAS_LIBGBM=y
+BR2_PACKAGE_HAS_LIBGLES=y
+BR2_PACKAGE_LIBINPUT=y
+BR2_PACKAGE_LIBDRI2=y
diff --git a/device.h b/device.h
index 9d8d4cfc..02198be4 100644
--- a/device.h
+++ b/device.h
@@ -293,6 +293,60 @@ void virtio_input_drain_host_events(void);
 bool virtio_input_irq_pending(virtio_input_state_t *vinput);
 #endif /* SEMU_HAS(VIRTIOINPUT) */
 
+/* VirtIO-GPU */
+
+#if SEMU_HAS(VIRTIOGPU)
+
+#define IRQ_VGPU 9
+#define IRQ_VGPU_BIT (1 << IRQ_VGPU)
+
+typedef struct {
+    uint32_t QueueNum;
+    uint32_t QueueDesc;
+    uint32_t QueueAvail;
+    uint32_t QueueUsed;
+    uint16_t last_avail;
+    bool ready;
+} virtio_gpu_queue_t;
+
+typedef struct {
+    /* feature negotiation */
+    uint32_t DeviceFeaturesSel;
+    uint32_t DriverFeatures;
+    uint32_t DriverFeaturesSel;
+    /* queue config */
+    uint32_t QueueSel;
+    virtio_gpu_queue_t queues[2];
+    /* status */
+    uint32_t Status;
+    uint32_t InterruptStatus;
+    /* supplied by environment */
+    uint32_t *ram;
+    /* implementation-specific */
+    void *priv;
+} virtio_gpu_state_t;
+
+void virtio_gpu_read(hart_t *vm,
+                     virtio_gpu_state_t *vgpu,
+                     uint32_t addr,
+                     uint8_t width,
+                     uint32_t *value);
+
+void virtio_gpu_write(hart_t *vm,
+                      virtio_gpu_state_t *vgpu,
+                      uint32_t addr,
+                      uint8_t width,
+                      uint32_t value);
+
+/* Initializes the process-wide virtio-gpu singleton. semu currently supports
+ * one in-process GPU instance; a second call is fatal.
+ */
+void virtio_gpu_init(virtio_gpu_state_t *vgpu);
+uint32_t virtio_gpu_register_scanout(virtio_gpu_state_t *vgpu,
+                                     uint32_t width,
+                                     uint32_t height);
+#endif /* SEMU_HAS(VIRTIOGPU) */
+
 /* ACLINT MTIMER */
 typedef struct {
     /* A MTIMER device has two separate base addresses: one for the MTIME
@@ -514,10 +568,6 @@ typedef struct {
 #if SEMU_HAS(VIRTIORNG)
     virtio_rng_state_t vrng;
 #endif
-    /* ACLINT */
-    mtimer_state_t mtimer;
-    mswi_state_t mswi;
-    sswi_state_t sswi;
 #if SEMU_HAS(VIRTIOSND)
     virtio_snd_state_t vsnd;
 #endif
@@ -527,21 +577,29 @@ typedef struct {
 #if SEMU_HAS(VIRTIOINPUT)
     virtio_input_state_t vkeyboard;
     virtio_input_state_t vmouse;
-    /* Use self-pipe trick to unblock the emulator loop when the
-     * window backend has queued work, such as input events or
-     * window shutdown. When all harts are idle, semu_run() calls
-     * poll(-1) and blocks indefinitely waiting for timer or UART
-     * events. The window-event thread has no way to wake that
-     * blocked poll() other than writing to a file descriptor it is
+#endif
+#if SEMU_HAS(VIRTIOGPU)
+    virtio_gpu_state_t vgpu;
+#endif
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
+    /* Use self-pipe trick to unblock the emulator loop when the window backend
+     * has queued work, such as input events or window shutdown. When all harts
+     * are idle, 'semu_run()' can call 'poll(-1)' and block indefinitely
+     * waiting for timer or UART events. The window-event thread has no way to
+     * wake that blocked 'poll()' other than writing to a file descriptor it is
      * watching.
      *
-     * wake_fd[0] (read end) is added to pfds[] so poll() monitors it.
-     * wake_fd[1] (write end) is handed to the window backend, which
-     * writes one byte when backend work arrives to make wake_fd[0]
-     * readable and return poll() immediately.
+     * 'wake_fd[0]' (read end) is added to 'pfds[]' so 'poll()' monitors it.
+     * 'wake_fd[1]' (write end) is handed to the window backend, which
+     * writes one byte when backend work arrives to make 'wake_fd[0]'
+     * readable and return 'poll()' immediately.
      */
     int wake_fd[2];
 #endif
+    /* ACLINT */
+    mtimer_state_t mtimer;
+    mswi_state_t mswi;
+    sswi_state_t sswi;
 
     uint32_t peripheral_update_ctr;
 
diff --git a/feature.h b/feature.h
index 0ff4e1b0..bca97602 100644
--- a/feature.h
+++ b/feature.h
@@ -32,5 +32,10 @@
 #define SEMU_FEATURE_EXTERNAL_ROOT 0
 #endif
 
+/* virtio-gpu */
+#ifndef SEMU_FEATURE_VIRTIOGPU
+#define SEMU_FEATURE_VIRTIOGPU 1
+#endif
+
 /* Feature test macro */
 #define SEMU_HAS(x) SEMU_FEATURE_##x
diff --git a/main.c b/main.c
index 8e113a6f..a0b53334 100644
--- a/main.c
+++ b/main.c
@@ -28,10 +28,16 @@
 #include "mini-gdbstub/include/gdbstub.h"
 #if SEMU_HAS(VIRTIOINPUT)
 #include "virtio-input-event.h"
+#endif
+#if SEMU_HAS(VIRTIOGPU)
+#include "vgpu-display.h"
+#endif
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
 #include "window.h"
 #endif
 #include "riscv.h"
 #include "riscv_private.h"
+
 #define PRIV(x) ((emu_state_t *) x->priv)
 
 /* Forward declarations for coroutine support */
@@ -139,6 +145,18 @@ static void emu_update_vinput_mouse_interrupts(vm_t *vm)
 }
 #endif
 
+#if SEMU_HAS(VIRTIOGPU)
+static void emu_update_vgpu_interrupts(vm_t *vm)
+{
+    /* Mirror the device's InterruptStatus onto its PLIC source bit. */
+    emu_state_t *emu = PRIV(vm->hart[0]);
+    const bool pending = emu->vgpu.InterruptStatus != 0;
+    emu->plic.active = pending ? (emu->plic.active | IRQ_VGPU_BIT)
+                               : (emu->plic.active & ~IRQ_VGPU_BIT);
+    plic_update_interrupts(vm, &emu->plic);
+}
+#endif
+
 static void emu_update_timer_interrupt(hart_t *hart)
 {
     emu_state_t *data = PRIV(hart);
@@ -248,7 +266,8 @@ static inline void emu_tick_peripherals(emu_state_t *emu)
 
         if (virtio_input_irq_pending(&emu->vmouse))
             emu_update_vinput_mouse_interrupts(vm);
-
+#endif
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
         /* A closed window is treated like a frontend shutdown request. */
         if (g_window.window_is_closed())
             emu->stopped = true;
@@ -320,12 +339,15 @@ static void mem_load(hart_t *hart,
         case 0x49: /* virtio-input keyboard */
             virtio_input_read(hart, &data->vkeyboard, addr & 0xFFFFF, width,
                               value);
-            emu_update_vinput_keyboard_interrupts(hart->vm);
             return;
         case 0x4A: /* virtio-input mouse */
             virtio_input_read(hart, &data->vmouse, addr & 0xFFFFF, width,
                               value);
-            emu_update_vinput_mouse_interrupts(hart->vm);
+            return;
+#endif
+#if SEMU_HAS(VIRTIOGPU)
+        case 0x4B: /* virtio-gpu */
+            virtio_gpu_read(hart, &data->vgpu, addr & 0xFFFFF, width, value);
             return;
 #endif
         }
@@ -414,6 +436,12 @@ static void mem_store(hart_t *hart,
                                value);
             emu_update_vinput_mouse_interrupts(hart->vm);
             return;
+#endif
+#if SEMU_HAS(VIRTIOGPU)
+        case 0x4B: /* virtio-gpu */
+            virtio_gpu_write(hart, &data->vgpu, addr & 0xFFFFF, width, value);
+            emu_update_vgpu_interrupts(hart->vm);
+            return;
 #endif
         }
     }
@@ -848,7 +876,7 @@ static int semu_init(emu_state_t *emu, int argc, char **argv)
     handle_options(argc, argv, &kernel_file, &dtb_file, &initrd_file,
                    &disk_file, &netdev, &hart_count, &debug, &headless,
                    &shared_dir);
-#if !SEMU_HAS(VIRTIOINPUT)
+#if !SEMU_HAS(VIRTIOINPUT) && !SEMU_HAS(VIRTIOGPU)
     (void) headless;
 #endif
 
@@ -993,25 +1021,48 @@ static int semu_init(emu_state_t *emu, int argc, char **argv)
 #endif
 
 #if SEMU_HAS(VIRTIOINPUT)
-    g_window.window_init(headless);
-
     emu->vkeyboard.ram = emu->ram;
     virtio_input_init(&(emu->vkeyboard));
 
     emu->vmouse.ram = emu->ram;
     virtio_input_init(&(emu->vmouse));
+#endif
+
+#if SEMU_HAS(VIRTIOGPU)
+    emu->vgpu.ram = emu->ram;
+    virtio_gpu_init(&(emu->vgpu));
+    uint32_t scanout_id =
+        virtio_gpu_register_scanout(&(emu->vgpu), SCREEN_WIDTH, SCREEN_HEIGHT);
+    vgpu_display_set_scanout_count(scanout_id + 1U);
+#endif
+
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
+    g_window.window_init(headless, SCREEN_WIDTH, SCREEN_HEIGHT);
 
     emu->wake_fd[0] = emu->wake_fd[1] = -1;
     if (vm->n_hart > 1 && g_window.window_main_loop) {
         if (pipe(emu->wake_fd) < 0) {
-            perror("pipe");
-            return 2;
+            perror("failed to create emulator wake pipe");
+            g_window.window_cleanup();
+            return EXIT_FAILURE;
         }
-        /* Make the write end non-blocking so window_shutdown_sw() never
-         * stalls. Single-hart mode never blocks in poll(-1), so it does not
-         * need the wake pipe at all.
+
+        /* Make the write end non-blocking so 'window_shutdown_sw()' never
+         * stalls. The read end remains blocking because 'semu_run()' reads it
+         * only after 'poll()' reports 'POLLIN' on the same emulator thread.
          */
-        fcntl(emu->wake_fd[1], F_SETFL, O_NONBLOCK);
+        int flags = fcntl(emu->wake_fd[1], F_GETFL, 0);
+        if (flags < 0 ||
+            fcntl(emu->wake_fd[1], F_SETFL, flags | O_NONBLOCK) < 0) {
+            perror(
+                "failed to configure emulator wake pipe write end as "
+                "non-blocking");
+            close(emu->wake_fd[0]);
+            close(emu->wake_fd[1]);
+            emu->wake_fd[0] = emu->wake_fd[1] = -1;
+            g_window.window_cleanup();
+            return EXIT_FAILURE;
+        }
     }
 #endif
 
@@ -1028,6 +1079,14 @@ static int semu_init(emu_state_t *emu, int argc, char **argv)
         if (!coro_init(total_slots, vm->n_hart)) {
             fprintf(stderr, "Failed to initialize coroutine subsystem\n");
             fflush(stderr);
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
+            if (emu->wake_fd[0] >= 0)
+                close(emu->wake_fd[0]);
+            if (emu->wake_fd[1] >= 0)
+                close(emu->wake_fd[1]);
+            emu->wake_fd[0] = emu->wake_fd[1] = -1;
+            g_window.window_cleanup();
+#endif
             return 1;
         }
 
@@ -1036,6 +1095,14 @@ static int semu_init(emu_state_t *emu, int argc, char **argv)
             if (!coro_create_hart(i, hart_exec_loop, vm->hart[i])) {
                 fprintf(stderr, "Failed to create coroutine for hart %u\n", i);
                 coro_cleanup();
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
+                if (emu->wake_fd[0] >= 0)
+                    close(emu->wake_fd[0]);
+                if (emu->wake_fd[1] >= 0)
+                    close(emu->wake_fd[1]);
+                emu->wake_fd[0] = emu->wake_fd[1] = -1;
+                g_window.window_cleanup();
+#endif
                 return 1;
             }
         }
@@ -1233,6 +1300,24 @@ static void signal_handler(int sig UNUSED)
     }
 }
 
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
+static void semu_close_wake_pipe(emu_state_t *emu)
+{
+    /* Unpublish the write end from the signal handler and window backend. */
+    signal_wake_fd = -1;
+    if (g_window.window_set_wake_fd)
+        g_window.window_set_wake_fd(-1);
+
+    /* Close the read end, then the write end; -1 marks an end as closed. */
+    for (int end = 0; end < 2; end++) {
+        if (emu->wake_fd[end] < 0)
+            continue;
+        close(emu->wake_fd[end]);
+        emu->wake_fd[end] = -1;
+    }
+}
+#endif
+
 #ifdef MMU_CACHE_STATS
 static void print_mmu_cache_stats(vm_t *vm)
 {
@@ -1404,10 +1489,10 @@ static void semu_run(emu_state_t *emu)
             if (signal_received)
                 break;
             /* Only need fds for timer and UART (no coroutine I/O),
-             * plus an optional wake pipe when VIRTIOINPUT is enabled.
+             * plus an optional wake pipe when a window backend is enabled.
              */
             size_t needed = 2;
-#if SEMU_HAS(VIRTIOINPUT)
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
             if (emu->wake_fd[0] >= 0)
                 needed++;
 #endif
@@ -1422,15 +1507,6 @@ static void semu_run(emu_state_t *emu)
                     close(kq);
 #else
                     close(wfi_timer_fd);
-#endif
-#if SEMU_HAS(VIRTIOINPUT)
-                    /* Mirror the normal-exit cleanup so the wake pipe
-                     * does not leak across the early return.
-                     */
-                    if (emu->wake_fd[0] >= 0)
-                        close(emu->wake_fd[0]);
-                    if (emu->wake_fd[1] >= 0)
-                        close(emu->wake_fd[1]);
 #endif
                     emu->exit_code = -1;
                     return;
@@ -1517,9 +1593,9 @@ static void semu_run(emu_state_t *emu)
                 pfd_count++;
             }
 
-#if SEMU_HAS(VIRTIOINPUT)
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
             /* Always watch the wake pipe so that backend work such as input
-             * events or SDL window close unblocks poll(-1) immediately.
+             * events or SDL window close unblocks 'poll(-1)' immediately.
              */
             int wake_pfd_index = -1;
             if (emu->wake_fd[0] >= 0 && pfd_count < poll_capacity) {
@@ -1580,17 +1656,17 @@ static void semu_run(emu_state_t *emu)
 #endif
                 }
             } else if (nevents < 0 && errno != EINTR) {
-                perror("poll");
+                perror("failed to poll emulator events");
             }
 
-#if SEMU_HAS(VIRTIOINPUT)
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
             /* Drain one wake byte if the pipe fired. The virtio-input path
              * coalesces backend wakeups behind a bool gate, so it contributes
              * at most one queued notification byte before the emulator thread
              * drains pending work. Extra shutdown wake bytes do not need to be
              * fully consumed here because the first one is enough to make
-             * emu_tick_peripherals() observe g_window.window_is_closed() and
-             * stop the emulator.
+             * 'emu_tick_peripherals()' observe 'g_window.window_is_closed()'
+             * and stop the emulator.
              */
             if (wake_pfd_index >= 0 &&
                 (pfds[wake_pfd_index].revents & POLLIN)) {
@@ -1626,12 +1702,6 @@ static void semu_run(emu_state_t *emu)
         close(kq);
 #else
         close(wfi_timer_fd);
-#endif
-#if SEMU_HAS(VIRTIOINPUT)
-        if (emu->wake_fd[0] >= 0)
-            close(emu->wake_fd[0]);
-        if (emu->wake_fd[1] >= 0)
-            close(emu->wake_fd[1]);
 #endif
         /* Free coroutine stacks/contexts from coro_init() above so the
          * graceful-exit path matches what coro_create_hart()'s failure
@@ -1640,7 +1710,7 @@ static void semu_run(emu_state_t *emu)
         coro_cleanup();
 
         /* A closed window is a normal user action, not an error. */
-#if SEMU_HAS(VIRTIOINPUT)
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
         if (emu->stopped && !g_window.window_is_closed())
 #else
         if (emu->stopped)
@@ -1748,7 +1818,7 @@ static gdb_action_t semu_cont(void *args)
      * commands can run guest code again.
      */
     signal_received = 0;
-#if SEMU_HAS(VIRTIOINPUT)
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
     while (!semu_is_interrupt(emu) && !g_window.window_is_closed()) {
 #else
     while (!semu_is_interrupt(emu)) {
@@ -1764,7 +1834,7 @@ static gdb_action_t semu_cont(void *args)
     /* Clear the interrupt if it's pending */
     __atomic_store_n(&emu->is_interrupted, false, __ATOMIC_RELAXED);
 
-#if SEMU_HAS(VIRTIOINPUT)
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
     /* Tell gdbstub_run() to exit cleanly when the window is closed. */
     if (g_window.window_is_closed())
         return ACT_SHUTDOWN;
@@ -1839,8 +1909,10 @@ static void semu_run_debug(emu_state_t *emu)
     emu->exit_code = ok ? 0 : 1;
 }
 
-#if SEMU_HAS(VIRTIOINPUT)
-/* Thread wrapper for running emulator in background thread */
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
+/* Thread wrapper for backends that reserve the main thread for
+ * 'window_main_loop()'.
+ */
 static void *emu_thread_func(void *arg)
 {
     emu_state_t *emu = (emu_state_t *) arg;
@@ -1850,7 +1922,7 @@ static void *emu_thread_func(void *arg)
     else
         semu_run(emu);
 
-    /* Unblock window_main_loop() on the main thread so it can return */
+    /* Unblock 'window_main_loop()' on the main thread so it can return. */
     if (g_window.window_shutdown)
         g_window.window_shutdown();
 
@@ -1885,15 +1957,15 @@ int main(int argc, char **argv)
         sigaction(SIGTERM, &sa, NULL);
     }
 
-#if SEMU_HAS(VIRTIOINPUT)
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
     /* Publish the wake pipe to the signal handler so SIGINT/SIGTERM can
      * unblock the emulator thread's poll() in the threaded window path.
      */
     if (emu.wake_fd[1] >= 0)
         signal_wake_fd = emu.wake_fd[1];
 
-    /* If window backend has a main loop function, run emulator in background
-     * thread and use main thread for window events (required for macOS SDL2).
+    /* If the window backend provides 'window_main_loop()', run the emulator in
+     * a background thread and use the main thread for window events.
      */
     if (g_window.window_main_loop) {
         pthread_t emu_thread;
@@ -1903,18 +1975,18 @@ int main(int argc, char **argv)
 
         if (pthread_create(&emu_thread, NULL, emu_thread_func, &emu) != 0) {
             fprintf(stderr, "Failed to create emulator thread\n");
+            semu_close_wake_pipe(&emu);
+            g_window.window_cleanup();
             return 1;
         }
 
-        /* Main thread runs window event loop (required for macOS) */
-        g_window.window_main_loop();
-
-        /* window_main_loop() returns either because the user closed the window
-         * (SDL_QUIT) or because the emulator called window_shutdown().
-         * emu_tick_peripherals() picks up g_window.window_is_closed() and
-         * sets emu->stopped, so no direct write to emu.stopped is needed
-         * here.
+        /* Main thread runs window event loop. Returns either because the user
+         * closed the window ('SDL_QUIT') or because the emulator called
+         * 'window_shutdown()'. 'emu_tick_peripherals()' picks up the window
+         * backend's closed state and sets 'emu->stopped', so no direct write to
+         * 'emu.stopped' is needed here.
          */
+        g_window.window_main_loop();
 
         /* Wait for emulator thread to finish. */
         pthread_join(emu_thread, NULL);
@@ -1927,6 +1999,11 @@ int main(int argc, char **argv)
             semu_run(&emu);
     }
 
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
+    semu_close_wake_pipe(&emu);
+    g_window.window_cleanup();
+#endif
+
 #ifdef MMU_CACHE_STATS
     print_mmu_cache_stats(&emu.vm);
 #endif
diff --git a/minimal.dts b/minimal.dts
index 709d1872..2b604236 100644
--- a/minimal.dts
+++ b/minimal.dts
@@ -115,5 +115,13 @@
             interrupts = <8>;
         };
 #endif
+
+#if SEMU_FEATURE_VIRTIOGPU
+        gpu0: virtio@4b00000 {
+            compatible = "virtio,mmio";
+            reg = <0x4b00000 0x200>;
+            interrupts = <9>;
+        };
+#endif
     };
 };
diff --git a/mk/external.mk b/mk/external.mk
index ec7e9139..1c86ec7a 100644
--- a/mk/external.mk
+++ b/mk/external.mk
@@ -39,6 +39,10 @@ $(PREBUILT_MANIFEST): FORCE
 	    fi; \
 	fi
 
+# optional test tools disk
+TEST_TOOLS_DATA_URL = $(COMMON_URL)/test-tools.img.bz2
+TEST_TOOLS_DATA = test-tools.img
+
 define download
 # Download to a .part file so an interrupted curl never lands a
 # corrupt or incomplete .bz2 that a later run mistakes for valid input.
@@ -46,14 +50,16 @@ define download
 # left over from a previous run, e.g. interrupted before sha1 verify,
 # would make curl request a byte range past EOF, the server replies
 # HTTP 416, and curl exits non-zero, a permanent self-inflicted
-# deadlock. These files are 5 to 7 MiB; a fresh GET is cheap.
+# deadlock. These files are small enough that a fresh GET is cheap.
 #
 # Look up the expected SHA-1 by archive basename in the release
-# manifest, then verify the .part against it. Decompress to a .tmp
-# file and rename only on success, so an interrupted bunzip2 cannot
-# leave a half-decompressed Image or rootfs.cpio that make would treat
-# as a valid up-to-date target on the next invocation.
-$($(T)_DATA): $(PREBUILT_MANIFEST) | prebuilt-check
+# manifest, then verify the .part against it. Keep the manifest
+# order-only so an existing PR-built artifact is not considered stale
+# just because the manifest was refreshed. Decompress to a .tmp file
+# and rename only on success, so an interrupted bunzip2 cannot leave a
+# half-decompressed artifact that make would treat as a valid
+# up-to-date target on the next invocation.
+$($(T)_DATA): | $(PREBUILT_MANIFEST) prebuilt-check
 	$(VECHO) "  GET\t$$@\n"
 	$(Q)curl --fail --retry 3 --retry-delay 1 --progress-bar \
 	    -L -o "$$@.bz2.part" "$(strip $($(T)_DATA_URL))" \
@@ -69,18 +75,18 @@ $($(T)_DATA): $(PREBUILT_MANIFEST) | prebuilt-check
 	$(Q)rm -f "$$@.bz2"
 endef
 
-EXTERNAL_DATA = KERNEL INITRD
+EXTERNAL_DATA = KERNEL INITRD TEST_TOOLS
 $(foreach T,$(EXTERNAL_DATA),$(eval $(download)))
 
 # --- Stale-prebuilt detection -------------------------------------------
 #
-# The prebuilt Image and rootfs.cpio above are baked from a fixed set of
-# input files (kernel/buildroot/busybox configs, the build script, and
-# the init stub). When any of those change locally the prebuilt may no
+# The prebuilt Image, rootfs.cpio, and test-tools.img above are baked from a
+# fixed set of input files (the configs, build scripts, and target files listed
+# in PREBUILT_INPUTS). When any of those change locally the prebuilt may no
 # longer reflect the user's intent, so we compute the SHA1 of those
 # inputs and compare against the publisher's recorded inputs hash --
-# the third line of prebuilt.sha1, written by .ci/publish-prebuilt.sh
-# under the virtual name 'inputs'.
+# the line of prebuilt.sha1 written by .ci/publish-prebuilt.sh under
+# the virtual name 'inputs'.
 #
 # Mismatch -> warn but do not auto-rebuild: a buildroot run takes the
 # better part of an hour, so we let the user opt in via make build-image.
@@ -90,9 +96,12 @@ PREBUILT_INPUTS := \
     configs/linux.config \
     configs/busybox.config \
     configs/buildroot.config \
+    configs/x11.config \
+    configs/riscv-cross-file \
     scripts/build-image.sh \
     scripts/rootfs_ext4.sh \
-    target/init
+    target/init \
+    target/local-env.sh
 
 # Read the publisher's inputs hash from the downloaded manifest at
 # recipe time, after the manifest refresh above has had a chance to run.
@@ -106,10 +115,10 @@ prebuilt-check: $(PREBUILT_MANIFEST)
 	        if [ "$$found" -eq "$$expected" ]; then \
 	            live_sha1=$$(cat $(PREBUILT_INPUTS) | $(SHA1SUM) | awk '{print $$1}'); \
 	            if [ "$$live_sha1" != "$$manifest_sha1" ]; then \
-	                echo "warning: Local kernel/rootfs inputs ($$live_sha1) differ from" >&2; \
+	                echo "warning: Local prebuilt guest inputs ($$live_sha1) differ from" >&2; \
 	                echo "warning: the prebuilt's recorded inputs ($$manifest_sha1)." >&2; \
-	                echo "warning: The downloaded Image/rootfs.cpio do not reflect your local" >&2; \
-	                echo "warning: configs. Run \`make build-image\` to rebuild from source." >&2; \
+	                echo "warning: The downloaded guest artifacts do not reflect your local configs." >&2; \
+	                echo "warning: Run \`make build-image\` to rebuild from source." >&2; \
 	            fi; \
 	        fi; \
 	    fi
diff --git a/scripts/build-image.sh b/scripts/build-image.sh
index a805443d..f3543358 100755
--- a/scripts/build-image.sh
+++ b/scripts/build-image.sh
@@ -19,6 +19,12 @@ function OK
 
 PARALLEL="-j$(nproc)"
 
+DIRECTFB2_REPO="https://github.com/directfb2/DirectFB2"
+DIRECTFB2_REV="7d4682d0cc092ed2f28c903175d1a0c104e9e9a8"
+DIRECTFB_EXAMPLES_REPO="https://github.com/directfb2/DirectFB-examples"
+DIRECTFB_EXAMPLES_REV="eecf1019b29933a45578e62aea5f08a884d30fbc"
+TEST_TOOLS_SIZE_MB=192
+
 function safe_copy {
     local src="$1"
     local dst="$2"
@@ -31,16 +37,64 @@ function safe_copy {
     fi
 }
 
-function do_buildroot
+function checkout_repo_rev
 {
-    if [ ! -d buildroot ]; then
-        echo "Cloning Buildroot..."
-        ASSERT git clone https://github.com/buildroot/buildroot -b 2025.02.x --depth=1
+    local dir="$1"
+    local repo="$2"
+    local rev="$3"
+
+    if [ ! -d "$dir/.git" ]; then
+        echo "Cloning $dir..."
+        ASSERT git clone "$repo" "$dir"
     else
-        echo "buildroot/ already exists, skipping clone"
+        echo "$dir already exists, reusing clone..."
     fi
 
-    safe_copy configs/buildroot.config buildroot/.config
+    pushd "$dir"
+    if ! git cat-file -e "$rev^{commit}" 2>/dev/null; then
+        ASSERT git fetch origin
+    fi
+    ASSERT git checkout --detach "$rev"
+    popd
+}
+
+function meson_setup_or_reconfigure
+{
+    local out_dir="$1"  # Meson build directory; other args go to 'meson setup'
+    shift
+
+    if [[ ! -f "$out_dir/build.ninja" ]]; then
+        ASSERT meson setup "$@" "$out_dir"
+        return
+    fi
+    if ! meson setup --reconfigure "$@" "$out_dir"; then
+        echo "Recreating stale Meson build directory: $out_dir"
+        rm -rf "$out_dir"
+        ASSERT meson setup "$@" "$out_dir"
+    fi
+}
+
+function configure_buildroot
+{
+    local mode="${1:-default}"  # "x11" merges configs/x11.config on top
+    local buildroot_config="configs/buildroot.config"
+    local x11_config="configs/x11.config"
+    local merge_tool="buildroot/support/kconfig/merge_config.sh"
+
+    if [[ "$mode" == "x11" ]]; then
+        echo "Preparing Buildroot config with X11 fragment..."
+        ASSERT "$merge_tool" -m -r -O buildroot "$buildroot_config" "$x11_config"
+    else
+        echo "Preparing default Buildroot config..."
+        ASSERT cp -f "$buildroot_config" buildroot/.config
+    fi
+}
+
+function build_buildroot_rootfs
+{
+    local mode="${1:-default}"
+
+    configure_buildroot "$mode"
     safe_copy configs/busybox.config buildroot/busybox.config
     cp -f target/init buildroot/fs/cpio/init
 
@@ -50,8 +104,25 @@ function do_buildroot
     unset LD_LIBRARY_PATH
     pushd buildroot
     ASSERT make olddefconfig
+    if [[ "$mode" == "x11" && \
+          ! -x output/host/bin/riscv32-buildroot-linux-gnu-g++ ]]; then
+        echo "Rebuilding Buildroot final GCC with C++ support..."
+        ASSERT make host-gcc-final-dirclean
+    fi
     ASSERT make $PARALLEL
     popd
+}
+
+function do_buildroot
+{
+    if [ ! -d buildroot ]; then
+        echo "Cloning Buildroot..."
+        ASSERT git clone https://github.com/buildroot/buildroot -b 2025.02.x --depth=1
+    else
+        echo "buildroot/ already exists, skipping clone"
+    fi
+
+    build_buildroot_rootfs default
 
     # Always publish the cpio. It is the canonical buildroot output and
     # serves both as the source for the ext4 image and as the legacy
@@ -67,6 +138,27 @@ function do_buildroot
         echo "Skipping ext4.img build (--no-ext4)"
     else
         ASSERT ./scripts/rootfs_ext4.sh ./rootfs.cpio ./ext4.img
+
+        local test_tools_rootfs=./rootfs.cpio
+        if [[ $BUILD_X11 -eq 1 ]]; then
+            build_buildroot_rootfs x11
+            test_tools_rootfs=./buildroot/output/images/rootfs.cpio
+        fi
+
+        if [[ $BUILD_DIRECTFB_TEST -eq 1 ]]; then
+            do_extra_packages
+            if [[ $BUILD_X11 -eq 1 ]]; then
+                stage_cxx_runtime
+            fi
+            ASSERT ./scripts/rootfs_ext4.sh "$test_tools_rootfs" ./test-tools.img \
+                "$TEST_TOOLS_SIZE_MB" ./extra_packages
+        elif [[ $BUILD_X11 -eq 1 ]]; then
+            rm -rf extra_packages
+            mkdir -p extra_packages
+            stage_cxx_runtime
+            ASSERT ./scripts/rootfs_ext4.sh "$test_tools_rootfs" ./test-tools.img \
+                "$TEST_TOOLS_SIZE_MB" ./extra_packages
+        fi
     fi
 }
 
@@ -91,24 +183,99 @@ function do_linux
     popd
 }
 
+function do_directfb
+{
+    export PATH="$PWD/buildroot/output/host/bin:$PATH"
+    export BUILDROOT_OUT="$PWD/buildroot/output/"
+    export DIRECTFB_STAGE="$PWD/directfb"
+    mkdir -p directfb
+
+    # Build DirectFB2; install it into the cross sysroot and the staging tree.
+    checkout_repo_rev DirectFB2 "$DIRECTFB2_REPO" "$DIRECTFB2_REV"
+    pushd DirectFB2
+    ASSERT cp ../configs/riscv-cross-file .
+    meson_setup_or_reconfigure build/riscv -Ddrmkms=true --cross-file \
+        riscv-cross-file
+    ASSERT meson compile -C build/riscv
+    ASSERT env DESTDIR="$BUILDROOT_OUT/host/riscv32-buildroot-linux-gnu/sysroot" meson install -C build/riscv
+    ASSERT env DESTDIR="$DIRECTFB_STAGE" meson install -C build/riscv
+    popd
+
+    # Build the DirectFB2 examples; they go into the staging tree only.
+    checkout_repo_rev DirectFB-examples "$DIRECTFB_EXAMPLES_REPO" \
+        "$DIRECTFB_EXAMPLES_REV"
+    pushd DirectFB-examples/
+    ASSERT cp ../configs/riscv-cross-file .
+    meson_setup_or_reconfigure build/riscv --cross-file riscv-cross-file
+    ASSERT meson compile -C build/riscv
+    ASSERT env DESTDIR="$DIRECTFB_STAGE" meson install -C build/riscv
+    popd
+}
+
+function do_extra_packages
+{
+    export PATH="$PWD/buildroot/output/host/bin:$PATH"
+    export CROSS_COMPILE=riscv32-buildroot-linux-gnu-
+
+    # Recreate both staging trees from scratch on every run.
+    rm -rf directfb extra_packages
+    mkdir -p directfb extra_packages/root
+
+    do_directfb && OK
+
+    if ! find directfb -mindepth 1 -print -quit | grep -q .; then
+        echo "Error: DirectFB staging tree is empty." >&2
+        exit 1
+    fi
+
+    # Overlay layout: DirectFB files at the top level, env script under root/.
+    ASSERT cp -r directfb/. extra_packages/
+    ASSERT cp target/local-env.sh extra_packages/root/
+}
+
+function stage_cxx_runtime
+{
+    local toolchain_lib="buildroot/output/host/riscv32-buildroot-linux-gnu/lib"
+    local libstdcpp="$toolchain_lib/libstdc++.so.6"
+    local libstdcpp_real
+
+    if [ ! -e "$libstdcpp" ]; then
+        echo "Error: libstdc++.so.6 not found in $toolchain_lib" >&2
+        exit 1
+    fi
+    # Follow the soname symlink one level; a plain file falls back to its name.
+    libstdcpp_real="$(readlink "$libstdcpp" || basename "$libstdcpp")"
+    if [[ "$libstdcpp_real" != /* ]]; then
+        libstdcpp_real="$toolchain_lib/$libstdcpp_real"
+    fi
+    mkdir -p extra_packages/lib
+    ASSERT cp -a "$toolchain_lib/libstdc++.so" "$libstdcpp" \
+        "$libstdcpp_real" extra_packages/lib/
+}
+
 function show_help {
     cat << EOF
-Usage: $0 [--buildroot] [--linux] [--all] [--no-ext4] [--clean-build] [--help]
+Usage: $0 [--buildroot] [--x11] [--linux] [--directfb2-test] [--all] [--no-ext4] [--clean-build] [--help]
 
 Options:
   --buildroot         Build Buildroot userland (produces rootfs.cpio and,
                       unless --no-ext4 is given, ext4.img for vda boot)
+  --x11               Build test-tools.img from an X11-enabled rootfs
+  --directfb2-test    Overlay the DirectFB2 test payload into test-tools.img
   --linux             Build the Linux kernel
   --all               Build both Buildroot and Linux
   --no-ext4           Skip ext4.img generation; produce only rootfs.cpio
                       (matches the legacy ENABLE_EXTERNAL_ROOT=0 path)
-  --clean-build       Remove buildroot/ and/or linux/ before building
+  --clean-build       Remove buildroot/ and/or linux/ before building;
+                      with --directfb2-test, also remove DirectFB2 sources
   --help              Show this message
 EOF
     exit 1
 }
 
 BUILD_BUILDROOT=0
+BUILD_X11=0
+BUILD_DIRECTFB_TEST=0
 BUILD_LINUX=0
 NO_EXT4=0
 CLEAN_BUILD=0
@@ -118,6 +285,14 @@ while [[ $# -gt 0 ]]; do
         --buildroot)
             BUILD_BUILDROOT=1
             ;;
+        --x11)
+            BUILD_BUILDROOT=1
+            BUILD_X11=1
+            ;;
+        --directfb2-test)
+            BUILD_BUILDROOT=1
+            BUILD_DIRECTFB_TEST=1
+            ;;
         --linux)
             BUILD_LINUX=1
             ;;
@@ -147,6 +322,11 @@ if [[ $BUILD_BUILDROOT -eq 0 && $BUILD_LINUX -eq 0 ]]; then
     show_help
 fi
 
+if [[ ( $BUILD_DIRECTFB_TEST -eq 1 || $BUILD_X11 -eq 1 ) && $NO_EXT4 -eq 1 ]]; then
+    echo "Error: --x11/--directfb2-test requires an ext4 image; remove --no-ext4."
+    show_help # prints the usage text and exits with status 1
+fi
+
 if [[ $CLEAN_BUILD -eq 1 && $BUILD_BUILDROOT -eq 1 && -d buildroot ]]; then
     echo "Removing buildroot/ for clean build..."
     rm -rf buildroot
@@ -157,6 +337,11 @@ if [[ $CLEAN_BUILD -eq 1 && $BUILD_LINUX -eq 1 && -d linux ]]; then
     rm -rf linux
 fi
 
+if [[ $CLEAN_BUILD -eq 1 && $BUILD_DIRECTFB_TEST -eq 1 ]]; then
+    echo "Removing DirectFB2 sources for clean build..."
+    rm -rf DirectFB2 DirectFB-examples directfb extra_packages # sources, staging tree, overlay
+fi
+
 if [[ $BUILD_BUILDROOT -eq 1 ]]; then
     do_buildroot && OK
 fi
diff --git a/scripts/rootfs_ext4.sh b/scripts/rootfs_ext4.sh
index 9cb03725..c8d783a0 100755
--- a/scripts/rootfs_ext4.sh
+++ b/scripts/rootfs_ext4.sh
@@ -2,17 +2,20 @@
 #
 # Build an ext4 rootfs image from an existing cpio archive.
 #
-# Usage: rootfs_ext4.sh [SOURCE_CPIO] [OUT_IMG] [SIZE_MB]
+# Usage: rootfs_ext4.sh [SOURCE_CPIO] [OUT_IMG] [SIZE_MB] [EXTRA_DIR]
 #
 # Default values match the EXTROOT make path: read rootfs.cpio, produce
 # ext4.img sized at 32 MiB. The 32 MiB default fits the buildroot userland
-# with headroom; bump SIZE_MB for larger rootfs payloads.
+# with headroom; bump SIZE_MB for larger rootfs payloads. EXTRA_DIR, when
+# given, is copied into the ext4 image after SOURCE_CPIO is extracted without
+# changing SOURCE_CPIO itself.
 
 set -euo pipefail
 
 SRC_CPIO="${1:-rootfs.cpio}"
 OUT_IMG="${2:-ext4.img}"
 SIZE_MB="${3:-32}"
+EXTRA_DIR="${4:-}"
 MKFS_EXT4="${MKFS_EXT4:-mkfs.ext4}"
 
 if [ ! -f "$SRC_CPIO" ]; then
@@ -32,26 +35,54 @@ fi
 
 SRC_DIR="$(cd "$(dirname "$SRC_CPIO")" && pwd -P)"
 SRC_ABS="$SRC_DIR/$(basename "$SRC_CPIO")"
+OUT_DIR="$(cd "$(dirname "$OUT_IMG")" && pwd -P)" # physical path of the output directory
+OUT_ABS="$OUT_DIR/$(basename "$OUT_IMG")"
+EXTRA_ABS="" # stays empty when no EXTRA_DIR was given
+if [ -n "$EXTRA_DIR" ]; then
+    if [ ! -d "$EXTRA_DIR" ]; then
+        echo "[!] Extra directory not found: $EXTRA_DIR" >&2
+        exit 1
+    fi
+    EXTRA_ABS="$(cd "$EXTRA_DIR" && pwd -P)" # absolute: the copy runs after a cd into the stage dir
+fi
 # `mktemp -d -t PREFIX` differs between GNU (PREFIX is a name) and BSD (PREFIX
 # is a template) -- spell out the full template instead.
 STAGE="$(mktemp -d "${TMPDIR:-/tmp}/semu-rootfs.XXXXXX")"
-trap 'rm -rf "$STAGE"' EXIT
-
-echo "[*] Extracting $SRC_CPIO -> $STAGE"
-( cd "$STAGE" && fakeroot bash -c "cpio -idm < '$SRC_ABS'" )
+OUT_TMP="$(mktemp "$OUT_DIR/.$(basename "$OUT_IMG").XXXXXX")"
+trap 'rm -rf "$STAGE" "$OUT_TMP"' EXIT
 
 echo "[*] Creating empty image: $OUT_IMG (${SIZE_MB} MiB)"
 # bs=1024k works on both GNU and BSD dd; bs=1M is GNU-only and bs=1m is
 # BSD-only.
-dd if=/dev/zero of="$OUT_IMG" bs=1024k count="$SIZE_MB" >/dev/null 2>&1
+dd if=/dev/zero of="$OUT_TMP" bs=1024k count="$SIZE_MB" >/dev/null 2>&1
 
 echo "[*] Building ext4 filesystem"
+echo "[*] Extracting $SRC_CPIO -> $STAGE"
+if [ -n "$EXTRA_ABS" ]; then
+    echo "[*] Applying extra files: $EXTRA_DIR"
+fi
 # -E lazy_*_init=0: do all init at mkfs time so the first guest mount does
 #   not pay the lazy-init cost. Stripping the journal (-O ^has_journal)
 #   would also speed mount, but the prebuilt Linux Image is built with
 #   CONFIG_EXT4_USE_FOR_EXT2=n and refuses to mount a no-journal image.
-fakeroot "$MKFS_EXT4" -q -F \
-    -E lazy_itable_init=0,lazy_journal_init=0 \
-    -d "$STAGE" "$OUT_IMG"
+fakeroot bash -c '
+        set -e
+        stage="$1"
+        src_cpio="$2"
+        extra_dir="$3"
+        mkfs_ext4="$4"
+        out_img="$5"
+
+        cd "$stage"
+        cpio -idm < "$src_cpio"
+        if [ -n "$extra_dir" ]; then
+            cp -a "$extra_dir"/. .
+        fi
+        chown -R 0:0 .
+        "$mkfs_ext4" -q -F \
+            -E lazy_itable_init=0,lazy_journal_init=0 \
+            -d . "$out_img"
+    ' sh "$STAGE" "$SRC_ABS" "$EXTRA_ABS" "$MKFS_EXT4" "$OUT_TMP"
 
-du -h "$OUT_IMG"
+chmod =rw "$OUT_TMP" # mktemp creates 0600; '=rw' without a who restores the umask-derived mode
+mv -f "$OUT_TMP" "$OUT_ABS" && du -h "$OUT_ABS"
diff --git a/target/local-env.sh b/target/local-env.sh
new file mode 100755
index 00000000..afda50d0
--- /dev/null
+++ b/target/local-env.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+# Guest-side environment for manually overlaid user payloads. Buildroot
+# packages normally live under /usr, while Meson/autotools payloads staged
+# through DESTDIR commonly keep their default /usr/local prefix.
+
+add_path()
+{
+    # Append directory $1 to PATH unless it is missing or already listed.
+    [ -d "$1" ] || return 0
+    case ":$PATH:" in
+        *":$1:"*) return 0 ;;
+    esac
+    PATH="${PATH:+$PATH:}$1"
+}
+
+add_library_path()
+{
+    # Append directory $1 to LD_LIBRARY_PATH unless missing or already listed.
+    [ -d "$1" ] || return 0
+    case ":${LD_LIBRARY_PATH:-}:" in
+        *":$1:"*) return 0 ;;
+    esac
+    LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$1"
+}
+
+add_path /usr/local/bin         # no-op when the directory does not exist
+add_library_path /usr/local/lib # no-op when the directory does not exist
+
+export PATH
+export LD_LIBRARY_PATH
diff --git a/utils.h b/utils.h
index b6c872e2..8aa1cc58 100644
--- a/utils.h
+++ b/utils.h
@@ -115,3 +115,6 @@ static inline void list_del_init(struct list_head *node)
         safe = list_entry(entry->member.next, __typeof__(*entry), member); \
          &entry->member != (head); entry = safe,                           \
         safe = list_entry(safe->member.next, __typeof__(*entry), member))
+
+#define LIST_HEAD_INIT(name) {.prev = (&name), .next = (&name)} /* self-linked */
+#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name) /* define + init */
diff --git a/vgpu-display.c b/vgpu-display.c
new file mode 100644
index 00000000..e185f37a
--- /dev/null
+++ b/vgpu-display.c
@@ -0,0 +1,289 @@
+#include <stdlib.h>
+
+#include "vgpu-display.h"
+
+/* 'PRIMARY_SET'/'CURSOR_SET' own CPU-frame snapshots, so each queued command
+ * can retain significantly more memory than an input event. Keep this backlog
+ * deliberately small: display updates are lossy and quickly become stale, and
+ * the emulator thread must be able to drop them rather than accumulate a large
+ * queue of old frames.
+ */
+#define VGPU_DISPLAY_CMD_QUEUE_SIZE 64U
+#define VGPU_DISPLAY_CMD_QUEUE_MASK (VGPU_DISPLAY_CMD_QUEUE_SIZE - 1U)
+
+/* Reliable state for plane clear/removal events. The producer advances
+ * 'generation' when the guest detaches a plane. The SDL consumer mirrors the
+ * last applied value in 'consumed_generation'. Frame payloads remain in the
+ * lossy SPSC queue below.
+ */
+struct vgpu_display_plane_clear_state {
+    uint32_t generation;          /* accessed with atomic builtins */
+    uint32_t consumed_generation; /* plain field, written while popping */
+};
+
+static struct vgpu_display_plane_clear_state
+    vgpu_display_primary_clear[VIRTIO_GPU_MAX_SCANOUTS];
+static struct vgpu_display_plane_clear_state
+    vgpu_display_cursor_clear[VIRTIO_GPU_MAX_SCANOUTS];
+static uint32_t vgpu_display_scanout_count = 1U;
+
+/* The SPSC queue carries lossy frame/move commands. It's process-wide and
+ * currently assumes one 'virtio-gpu' producer. The GPU backend is the only
+ * producer and the window backend is the only consumer. Commands entering this
+ * bridge carry 'scanout_id' values already validated by the guest-facing
+ * backend; the SDL consumer relies on that internal contract.
+ */
+static struct vgpu_display_cmd
+    vgpu_display_cmd_queue[VGPU_DISPLAY_CMD_QUEUE_SIZE];
+static uint32_t vgpu_display_cmd_head;
+static uint32_t vgpu_display_cmd_tail;
+
+static bool vgpu_display_unavailable;
+
+static bool vgpu_display_is_cmd_stale(const struct vgpu_display_cmd *cmd)
+{
+    /* A queued frame/move command is stale when the clear generation of its
+     * plane no longer equals the value recorded at publish time.
+     */
+    struct vgpu_display_plane_clear_state *planes;
+    if (cmd->type == VGPU_DISPLAY_CMD_PRIMARY_SET)
+        planes = vgpu_display_primary_clear;
+    else if (cmd->type == VGPU_DISPLAY_CMD_CURSOR_SET ||
+             cmd->type == VGPU_DISPLAY_CMD_CURSOR_MOVE)
+        planes = vgpu_display_cursor_clear;
+    else
+        return false; /* other command types are never reported stale */
+
+    uint32_t current =
+        __atomic_load_n(&planes[cmd->scanout_id].generation, __ATOMIC_ACQUIRE);
+    return cmd->generation != current;
+}
+
+static bool vgpu_display_pop_pending_clear_cmd(
+    struct vgpu_display_plane_clear_state *states,
+    enum vgpu_display_cmd_type type,
+    struct vgpu_display_cmd *cmd)
+{
+    /* Find the first active scanout whose published clear generation differs
+     * from the one the consumer last applied, mark it consumed, and fill
+     * '*cmd' with a clear command of 'type'. Returns false if none is pending.
+     */
+    const uint32_t limit =
+        __atomic_load_n(&vgpu_display_scanout_count, __ATOMIC_ACQUIRE);
+    for (uint32_t id = 0; id < limit; id++) {
+        uint32_t latest =
+            __atomic_load_n(&states[id].generation, __ATOMIC_ACQUIRE);
+
+        if (states[id].consumed_generation != latest) {
+            states[id].consumed_generation = latest;
+            *cmd = (struct vgpu_display_cmd) {
+                .type = type,
+                .scanout_id = id,
+                .generation = latest,
+            };
+            return true;
+        }
+    }
+
+    return false;
+}
+
+void vgpu_display_set_scanout_count(uint32_t scanout_count)
+{
+    /* Clamp to the size of the per-scanout tables, then publish the count. */
+    uint32_t clamped = scanout_count > VIRTIO_GPU_MAX_SCANOUTS
+                           ? VIRTIO_GPU_MAX_SCANOUTS
+                           : scanout_count;
+    __atomic_store_n(&vgpu_display_scanout_count, clamped, __ATOMIC_RELEASE);
+}
+
+void vgpu_display_publish_primary_clear(uint32_t scanout_id)
+{
+    uint32_t *gen = &vgpu_display_primary_clear[scanout_id].generation;
+
+    /* Bump the primary clear generation unless the display is unavailable. */
+    if (!__atomic_load_n(&vgpu_display_unavailable, __ATOMIC_ACQUIRE))
+        __atomic_add_fetch(gen, 1U, __ATOMIC_ACQ_REL);
+}
+
+void vgpu_display_publish_cursor_clear(uint32_t scanout_id)
+{
+    uint32_t *gen = &vgpu_display_cursor_clear[scanout_id].generation;
+
+    /* Bump the cursor clear generation unless the display is unavailable. */
+    if (!__atomic_load_n(&vgpu_display_unavailable, __ATOMIC_ACQUIRE))
+        __atomic_add_fetch(gen, 1U, __ATOMIC_ACQ_REL);
+}
+
+static bool vgpu_display_is_cmd_queue_full(void)
+{
+    /* Full when advancing the head index would make it equal the tail. */
+    uint32_t wr = __atomic_load_n(&vgpu_display_cmd_head, __ATOMIC_RELAXED);
+    uint32_t rd = __atomic_load_n(&vgpu_display_cmd_tail, __ATOMIC_ACQUIRE);
+    return ((wr + 1U) & VGPU_DISPLAY_CMD_QUEUE_MASK) == rd;
+}
+
+static void vgpu_display_push_cmd(struct vgpu_display_cmd *cmd)
+{
+    uint32_t wr = __atomic_load_n(&vgpu_display_cmd_head, __ATOMIC_RELAXED);
+    uint32_t rd = __atomic_load_n(&vgpu_display_cmd_tail, __ATOMIC_ACQUIRE);
+    uint32_t wr_next = (wr + 1U) & VGPU_DISPLAY_CMD_QUEUE_MASK;
+
+    if (wr_next != rd) {
+        /* Fill the slot first, then publish it with a release store of head. */
+        vgpu_display_cmd_queue[wr] = *cmd;
+        __atomic_store_n(&vgpu_display_cmd_head, wr_next, __ATOMIC_RELEASE);
+        return;
+    }
+    /* Queue full: keep the producer non-blocking. Display updates are lossy,
+     * so release this command's payload and drop it rather than stall the
+     * emulator thread. Clear commands do not use this queue.
+     */
+    vgpu_display_release_cmd(cmd);
+}
+
+static bool vgpu_display_pop_queued_cmd(struct vgpu_display_cmd *cmd)
+{
+    /* Copy out the entry at the tail, if any, then advance the tail index. */
+    uint32_t rd = __atomic_load_n(&vgpu_display_cmd_tail, __ATOMIC_RELAXED);
+    uint32_t wr = __atomic_load_n(&vgpu_display_cmd_head, __ATOMIC_ACQUIRE);
+    bool has_entry = rd != wr;
+
+    if (has_entry) {
+        *cmd = vgpu_display_cmd_queue[rd];
+        rd = (rd + 1U) & VGPU_DISPLAY_CMD_QUEUE_MASK;
+        __atomic_store_n(&vgpu_display_cmd_tail, rd, __ATOMIC_RELEASE);
+    }
+    return has_entry;
+}
+
+void vgpu_display_release_cmd(struct vgpu_display_cmd *cmd)
+{
+    /* Only the two 'SET' commands carry a heap payload; for every other type
+     * the pointer stays NULL and free(NULL) does nothing.
+     */
+    struct vgpu_display_payload *owned = NULL;
+
+    if (cmd->type == VGPU_DISPLAY_CMD_PRIMARY_SET)
+        owned = cmd->u.primary_set.payload;
+    else if (cmd->type == VGPU_DISPLAY_CMD_CURSOR_SET)
+        owned = cmd->u.cursor_set.payload;
+    free(owned);
+}
+
+bool vgpu_display_pop_cmd(struct vgpu_display_cmd *cmd)
+{
+    /* Hand the next command to the consumer. Pending plane clears are checked
+     * first on every iteration; queued frame/move commands that turn out to
+     * be stale are released and skipped. Returns false only when no command
+     * remains, otherwise '*cmd' holds a clear or a valid queued command.
+     */
+    while (true) {
+        if (vgpu_display_pop_pending_clear_cmd(vgpu_display_primary_clear,
+                                               VGPU_DISPLAY_CMD_PRIMARY_CLEAR,
+                                               cmd) ||
+            vgpu_display_pop_pending_clear_cmd(
+                vgpu_display_cursor_clear, VGPU_DISPLAY_CMD_CURSOR_CLEAR, cmd))
+            return true;
+
+        /* No clear pending: take the next entry from the lossy queue. */
+        if (!vgpu_display_pop_queued_cmd(cmd))
+            return false;
+
+        if (vgpu_display_is_cmd_stale(cmd)) {
+            vgpu_display_release_cmd(cmd);
+            continue;
+        }
+        return true;
+    }
+}
+
+void vgpu_display_set_unavailable(void)
+{
+    /* Init-only fallback for a failed 'window-sw' initialization, meant to
+     * run before the emulator thread starts publishing display commands. It
+     * is not a concurrent shutdown primitive: a producer that still observed
+     * 'vgpu_display_unavailable == false' could race with the drain below and
+     * enqueue a payload after the queue had already been emptied.
+     *
+     * The latch is still stored atomically so later call sites keep the same
+     * one-way handoff rule.
+     */
+    __atomic_store_n(&vgpu_display_unavailable, true, __ATOMIC_RELEASE);
+
+    /* Drain whatever is pending, releasing any payload each command owns. */
+    struct vgpu_display_cmd pending;
+    while (vgpu_display_pop_cmd(&pending))
+        vgpu_display_release_cmd(&pending);
+}
+
+bool vgpu_display_can_publish(void)
+{
+    bool off = __atomic_load_n(&vgpu_display_unavailable, __ATOMIC_ACQUIRE);
+    return off ? false : !vgpu_display_is_cmd_queue_full(); /* needs a slot */
+}
+
+void vgpu_display_publish_primary_set(uint32_t scanout_id,
+                                      struct vgpu_display_payload *payload)
+{
+    if (__atomic_load_n(&vgpu_display_unavailable, __ATOMIC_ACQUIRE)) {
+        free(payload);
+        return;
+    }
+    /* Tag the frame with the primary plane's current clear generation. */
+    uint32_t gen = __atomic_load_n(
+        &vgpu_display_primary_clear[scanout_id].generation, __ATOMIC_ACQUIRE);
+    struct vgpu_display_cmd cmd = {
+        .type = VGPU_DISPLAY_CMD_PRIMARY_SET,
+        .scanout_id = scanout_id,
+        .generation = gen,
+        .u.primary_set = {.payload = payload},
+    };
+    vgpu_display_push_cmd(&cmd);
+}
+
+void vgpu_display_publish_cursor_set(uint32_t scanout_id,
+                                     struct vgpu_display_payload *payload,
+                                     int32_t x,
+                                     int32_t y,
+                                     uint32_t hot_x,
+                                     uint32_t hot_y)
+{
+    if (__atomic_load_n(&vgpu_display_unavailable, __ATOMIC_ACQUIRE)) {
+        free(payload);
+        return;
+    }
+    /* Tag the cursor image with the cursor plane's current clear generation. */
+    uint32_t gen = __atomic_load_n(
+        &vgpu_display_cursor_clear[scanout_id].generation, __ATOMIC_ACQUIRE);
+    struct vgpu_display_cmd cmd = {
+        .type = VGPU_DISPLAY_CMD_CURSOR_SET,
+        .scanout_id = scanout_id,
+        .generation = gen,
+        .u.cursor_set =
+            {
+                .payload = payload,
+                .x = x,
+                .y = y,
+                .hot_x = hot_x,
+                .hot_y = hot_y,
+            },
+    };
+    vgpu_display_push_cmd(&cmd);
+}
+
+void vgpu_display_publish_cursor_move(uint32_t scanout_id, int32_t x, int32_t y)
+{
+    /* Queue a payload-free move tagged with the cursor clear generation. */
+    if (__atomic_load_n(&vgpu_display_unavailable, __ATOMIC_ACQUIRE))
+        return;
+    uint32_t gen = __atomic_load_n(
+        &vgpu_display_cursor_clear[scanout_id].generation, __ATOMIC_ACQUIRE);
+    struct vgpu_display_cmd cmd = {
+        .type = VGPU_DISPLAY_CMD_CURSOR_MOVE,
+        .scanout_id = scanout_id,
+        .generation = gen,
+        .u.cursor_move = {.x = x, .y = y},
+    };
+    vgpu_display_push_cmd(&cmd);
+}
diff --git a/vgpu-display.h b/vgpu-display.h
new file mode 100644
index 00000000..6eb73a25
--- /dev/null
+++ b/vgpu-display.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#if !SEMU_HAS(VIRTIOGPU)
+#error Only valid when Virtio-GPU is enabled.
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include "virtio-gpu.h"
+
+/* Immutable CPU-frame payload published by the VirtIO GPU backend and later
+ * consumed by the window backend when it uploads pixels into its own textures.
+ */
+struct vgpu_display_cpu_payload {
+    enum virtio_gpu_formats format; /* pixel format of the frame */
+    uint32_t width, height;         /* frame size in pixels */
+    uint32_t stride;                /* bytes per row of 'pixels' */
+    uint32_t bits_per_pixel;
+    uint8_t *pixels;                /* start of the pixel bytes */
+};
+
+/* Owning payload object passed through the display queue. The bridge queues
+ * and disposes this object, while GPU and window backends only fill or
+ * consume the payload it carries.
+ */
+struct vgpu_display_payload {
+    struct vgpu_display_cpu_payload cpu; /* the only payload kind so far */
+    /* TODO: Add a GL/virgl payload when 3D scanout is implemented. The display
+     * bridge currently transports CPU-owned 2D frames only.
+     */
+};
+
+/* Runtime display commands published by the GPU backend and consumed by the
+ * window backend. 'PRIMARY_*' updates the main scanout image, while 'CURSOR_*'
+ * updates or moves the separate cursor plane.
+ *
+ * Clear commands are reliable generation changes, frame/move commands are lossy
+ * SPSC queue entries.
+ */
+enum vgpu_display_cmd_type {
+    VGPU_DISPLAY_CMD_PRIMARY_SET = 0, /* queued; uses 'u.primary_set' */
+    VGPU_DISPLAY_CMD_PRIMARY_CLEAR,   /* reliable; no union member used */
+    VGPU_DISPLAY_CMD_CURSOR_SET,      /* queued; uses 'u.cursor_set' */
+    VGPU_DISPLAY_CMD_CURSOR_CLEAR,    /* reliable; no union member used */
+    VGPU_DISPLAY_CMD_CURSOR_MOVE,     /* queued; uses 'u.cursor_move' */
+};
+
+/* One synthesized display bridge command. 'scanout_id' selects which scanout
+ * to update, and the union carries the payload or coordinates required by the
+ * specific command type above.
+ */
+struct vgpu_display_cmd {
+    enum vgpu_display_cmd_type type; /* selects the active member of 'u' */
+    uint32_t scanout_id;
+    uint32_t generation; /* plane clear generation tied to this command */
+    union {
+        struct {
+            struct vgpu_display_payload *payload; /* freed by release_cmd */
+        } primary_set;
+        struct {
+            struct vgpu_display_payload *payload; /* freed by release_cmd */
+            int32_t x;
+            int32_t y;
+            uint32_t hot_x;
+            uint32_t hot_y;
+        } cursor_set;
+        struct {
+            int32_t x;
+            int32_t y;
+        } cursor_move;
+    } u;
+};
+
+void vgpu_display_set_scanout_count(uint32_t scanout_count);
+void vgpu_display_publish_primary_clear(uint32_t scanout_id);
+void vgpu_display_publish_cursor_clear(uint32_t scanout_id);
+/* Command hand-off; the 'set' publishers take ownership of 'payload'. */
+void vgpu_display_release_cmd(struct vgpu_display_cmd *cmd);
+bool vgpu_display_pop_cmd(struct vgpu_display_cmd *cmd);
+void vgpu_display_set_unavailable(void);
+bool vgpu_display_can_publish(void);
+void vgpu_display_publish_primary_set(uint32_t scanout_id,
+                                      struct vgpu_display_payload *payload);
+void vgpu_display_publish_cursor_set(uint32_t scanout_id,
+                                     struct vgpu_display_payload *payload,
+                                     int32_t x,
+                                     int32_t y,
+                                     uint32_t hot_x,
+                                     uint32_t hot_y);
+void vgpu_display_publish_cursor_move(uint32_t scanout_id,
+                                      int32_t x,
+                                      int32_t y);
diff --git a/virtio-gpu-sw.c b/virtio-gpu-sw.c
new file mode 100644
index 00000000..8096539a
--- /dev/null
+++ b/virtio-gpu-sw.c
@@ -0,0 +1,1243 @@
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/uio.h>
+
+#include "device.h"
+#include "utils.h"
+#include "vgpu-display.h"
+#include "virtio-gpu.h"
+#include "virtio.h"
+
+#define PRIV(x) ((virtio_gpu_data_t *) x->priv)
+
+/* Host-side images are allocated per resource with 'calloc()'. Track their
+ * aggregate size and cap it at 256 MiB.
+ *
+ * Backing entries describe guest RAM ranges. Use 4 KiB as the expected minimum
+ * page granularity, so 512 MiB guest RAM needs at most 'RAM_SIZE / 4096'
+ * entries plus one extra entry for an unaligned tail.
+ */
+#define VGPU_SW_MAX_HOSTMEM (256U * 1024U * 1024U)
+#define VGPU_SW_BACKING_ENTRY_PAGE_SIZE 4096U
+#define VGPU_SW_MAX_BACKING_ENTRIES \
+    (RAM_SIZE / VGPU_SW_BACKING_ENTRY_PAGE_SIZE + 1U)
+
+/* Host-side 2D resource owned by the software backend. It keeps the copied
+ * 'image' plus any attached guest backing metadata needed by transfers.
+ */
+struct vgpu_sw_resource_2d {
+    uint32_t resource_id;
+    uint32_t format;
+    uint32_t width, height;
+    uint32_t stride;         /* bytes between consecutive rows of 'image' */
+    uint32_t bits_per_pixel;
+    uint32_t *image;         /* host-side pixel copy */
+    size_t image_size;       /* bytes accounted in 'g_vgpu_sw_hostmem' */
+    size_t page_cnt;         /* number of entries in 'iovec' */
+    struct iovec *iovec;     /* attached guest backing ranges */
+    struct list_head list;   /* list link, removed on destroy */
+};
+
+/* Process-wide singleton: semu currently assumes at most one software
+ * virtio-gpu backend instance per process.
+ */
+static LIST_HEAD(g_vgpu_sw_res_2d_list);
+static size_t g_vgpu_sw_hostmem;
+
+static size_t vgpu_sw_iov_to_buf(const struct iovec *iov,
+                                 unsigned int iov_cnt,
+                                 size_t offset,
+                                 void *buf,
+                                 size_t bytes)
+{
+    /* Gather up to 'bytes' bytes into 'buf' from the byte stream formed by
+     * concatenating the 'iov' entries, starting 'offset' bytes into that
+     * stream. Returns the number of bytes copied, which is smaller than
+     * 'bytes' when the stream ends early or a non-empty entry has no base.
+     *
+     * Non-empty entries are expected to have been validated by
+     * 'RESOURCE_ATTACH_BACKING' before they were stored.
+     */
+    uint8_t *out = buf;
+    size_t copied = 0;
+    size_t skip = offset;
+
+    for (unsigned int idx = 0; idx < iov_cnt && copied < bytes; idx++) {
+        const struct iovec *ent = &iov[idx];
+
+        if (ent->iov_len == 0)
+            continue;
+        /* Treat a malformed backing entry as an incomplete copy. */
+        if (!ent->iov_base)
+            break;
+        /* Still before the starting offset: skip this entry entirely. */
+        if (skip >= ent->iov_len) {
+            skip -= ent->iov_len;
+            continue;
+        }
+
+        size_t avail = ent->iov_len - skip;
+        size_t want = bytes - copied;
+        size_t chunk = want < avail ? want : avail;
+
+        memcpy(out + copied, (const uint8_t *) ent->iov_base + skip, chunk);
+        copied += chunk;
+        skip = 0;
+    }
+
+    return copied;
+}
+
+static bool vgpu_sw_u64_add_overflow(uint64_t a, uint64_t b, uint64_t *out)
+{
+    *out = a + b;              /* always stored, wrapped modulo 2^64 */
+    return b > UINT64_MAX - a; /* true exactly when the sum wrapped */
+}
+
+static bool vgpu_sw_u64_mul_overflow(uint64_t a, uint64_t b, uint64_t *out)
+{
+    /* On overflow '*out' is left untouched and true is returned. */
+    bool overflow = a != 0 && b > UINT64_MAX / a;
+    if (!overflow)
+        *out = a * b;
+    return overflow;
+}
+
+static bool vgpu_sw_rect_fits(uint32_t width,
+                              uint32_t height,
+                              const struct virtio_gpu_rect *rect)
+{
+    /* Reject empty rects and out-of-area origins first so nothing wraps. */
+    bool sane = rect->width != 0 && rect->height != 0 && rect->x < width &&
+                rect->y < height;
+
+    return sane && rect->width <= width - rect->x &&
+           rect->height <= height - rect->y;
+}
+
+static bool vgpu_sw_transfer_source_fits(
+    const struct virtio_gpu_trans_to_host_2d *req,
+    const struct vgpu_sw_resource_2d *res_2d)
+{
+    uint64_t bpp = res_2d->bits_per_pixel / 8; /* bytes per pixel */
+    uint64_t row_bytes, row_stride, last_row, last_row_offset, end_offset;
+    uint64_t required_bytes, backing_size = 0, backing_end;
+    /* Check that the attached backing covers every byte the transfer needs. */
+    if (req->r.height == 0 || req->offset > SIZE_MAX)
+        return false;
+    if (vgpu_sw_u64_mul_overflow(req->r.width, bpp, &row_bytes) ||
+        row_bytes == 0)
+        return false;
+    if (vgpu_sw_u64_mul_overflow((uint64_t) res_2d->stride, req->r.height - 1,
+                                 &last_row))
+        return false;
+    if (vgpu_sw_u64_add_overflow(req->offset, last_row, &last_row_offset))
+        return false;
+    if (vgpu_sw_u64_add_overflow(last_row_offset, row_bytes, &end_offset))
+        return false;
+    if (vgpu_sw_u64_mul_overflow((uint64_t) res_2d->stride, req->r.height,
+                                 &row_stride))
+        return false;
+    /* Full-stride rows need 'stride * height'; else the last row is short. */
+    required_bytes =
+        row_bytes == res_2d->stride ? row_stride : end_offset - req->offset;
+    for (size_t i = 0; i < res_2d->page_cnt; i++) {
+        if (vgpu_sw_u64_add_overflow(backing_size, res_2d->iovec[i].iov_len,
+                                     &backing_size))
+            return false;
+    }
+    /* The range starting at 'offset' must end within the summed backing. */
+    return !vgpu_sw_u64_add_overflow(req->offset, required_bytes,
+                                     &backing_end) &&
+           backing_end <= backing_size;
+}
+
+static bool vgpu_sw_copy_image_from_pages(
+    struct virtio_gpu_trans_to_host_2d *req,
+    struct vgpu_sw_resource_2d *res_2d)
+{
+    uint32_t stride = res_2d->stride;
+    uint32_t bpp = res_2d->bits_per_pixel / 8; /* Bytes per pixel */
+    uint32_t width = req->r.width;
+    uint32_t height = req->r.height;
+    /* Returns true only if every requested byte was copied from backing. */
+    /* When the transfer spans full-width rows with no padding, both source
+     * ('iovec' at 'req->offset') and destination ('image' at 'r.y') are
+     * contiguous, so the entire rectangle can be copied in a single helper
+     * call. This covers all cursor transfers, full-frame updates, and
+     * full-width dirty bands.
+     */
+    if (req->r.x == 0 && (size_t) width * bpp == stride) {
+        void *dest =
+            (void *) ((uintptr_t) res_2d->image + (size_t) req->r.y * stride);
+        size_t bytes = (size_t) stride * height;
+        return vgpu_sw_iov_to_buf(res_2d->iovec, res_2d->page_cnt,
+                                  (size_t) req->offset, dest, bytes) == bytes;
+    }
+
+    /* Partial-width sub-rect: copy row by row */
+    for (uint32_t h = 0; h < height; h++) {
+        /* Source offset is in the image coordinate. The address to copy from
+         * is the page base address plus the offset.
+         */
+        size_t src_offset = req->offset + (size_t) stride * h;
+        size_t dest_offset =
+            ((size_t) req->r.y + h) * stride + (size_t) req->r.x * bpp;
+        void *dest = (void *) ((uintptr_t) res_2d->image + dest_offset);
+        size_t total = (size_t) width * bpp; /* bytes in one row of the rect */
+
+        if (vgpu_sw_iov_to_buf(res_2d->iovec, res_2d->page_cnt, src_offset,
+                               dest, total) != total)
+            return false; /* short copy: stop at the first incomplete row */
+    }
+
+    return true;
+}
+
+static void vgpu_sw_destroy_resource_2d(struct vgpu_sw_resource_2d *res_2d)
+{
+    list_del(&res_2d->list); /* unlink before the node is freed */
+    g_vgpu_sw_hostmem -= res_2d->image_size; /* return the accounted bytes */
+    free(res_2d->image);
+    free(res_2d->iovec);
+    free(res_2d);
+}
+
+static struct vgpu_sw_resource_2d *vgpu_sw_get_resource_2d(uint32_t resource_id)
+{
+    struct vgpu_sw_resource_2d *res_2d; /* iterates the global resource list */
+    list_for_each_entry (res_2d, &g_vgpu_sw_res_2d_list, list) {
+        if (res_2d->resource_id == resource_id)
+            return res_2d;
+    }
+
+    return NULL; /* no resource with this ID */
+}
+
+static const struct virtq_desc *vgpu_sw_get_response_desc(
+    struct virtq_desc *vq_desc,
+    size_t response_size,
+    uint32_t *plen)
+{
+    int idx = virtio_gpu_get_response_desc(vq_desc, VIRTIO_GPU_MAX_DESC,
+                                           response_size);
+    if (idx < 0) {
+        *plen = 0; /* no usable response descriptor: report length 0 */
+        return NULL;
+    }
+    return &vq_desc[idx];
+}
+
+static struct virtio_gpu_scanout_info *vgpu_sw_get_scanout(
+    virtio_gpu_state_t *vgpu,
+    uint32_t scanout_id)
+{
+    /* NULL for an out-of-range ID or for a scanout that is not enabled. */
+    if (scanout_id >= PRIV(vgpu)->num_scanouts ||
+        !PRIV(vgpu)->scanouts[scanout_id].enabled)
+        return NULL;
+    return &PRIV(vgpu)->scanouts[scanout_id];
+}
+
+static struct vgpu_display_payload *vgpu_sw_create_window_payload(
+    const struct vgpu_sw_resource_2d *res_2d,
+    const struct virtio_gpu_scanout_info *scanout,
+    const char *plane_name)
+{
+    /* Snapshot the visible part of 'res_2d' into one malloc()ed payload whose
+     * packed pixels follow the header, so a single free() releases both. A
+     * NULL 'scanout' snapshots the whole resource; 'plane_name' only labels
+     * log messages. Returns NULL after logging on bad geometry or OOM.
+     */
+    if (!res_2d || !res_2d->image) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): missing %s image\n",
+                __func__, plane_name);
+        return NULL;
+    }
+    if (res_2d->bits_per_pixel == 0 || (res_2d->bits_per_pixel % 8) != 0) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): invalid %s bpp %u\n",
+                __func__, plane_name, res_2d->bits_per_pixel);
+        return NULL;
+    }
+    size_t bytes_per_pixel = res_2d->bits_per_pixel / 8;
+    uint32_t src_x = 0, src_y = 0;
+    uint32_t width = res_2d->width, height = res_2d->height;
+    if (scanout) {
+        /* Primary scanouts can expose only a sub-rectangle of the resource. */
+        src_x = scanout->src_x;
+        src_y = scanout->src_y;
+        width = scanout->src_w;
+        height = scanout->src_h;
+    }
+    if (width == 0 || height == 0) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): invalid %s size %ux%u\n",
+                __func__, plane_name, width, height);
+        return NULL;
+    }
+    /* Never read outside the resource image; x/y first so nothing wraps. */
+    if (src_x >= res_2d->width || width > res_2d->width - src_x ||
+        src_y >= res_2d->height || height > res_2d->height - src_y) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): %s view out of bounds\n",
+                __func__, plane_name);
+        return NULL;
+    }
+
+    size_t row_bytes = (size_t) width * bytes_per_pixel;
+    if (row_bytes / width != bytes_per_pixel) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): %s row size overflow\n",
+                __func__, plane_name);
+        return NULL;
+    }
+    if (row_bytes > UINT32_MAX) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): %s row size exceeds uint32_t\n",
+                __func__, plane_name);
+        return NULL;
+    }
+    if (res_2d->stride < row_bytes) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): invalid %s stride %u for row size %zu\n",
+                __func__, plane_name, res_2d->stride, row_bytes);
+        return NULL;
+    }
+
+    size_t pixels_size = row_bytes * height;
+    if (pixels_size / height != row_bytes) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): %s image size overflow\n",
+                __func__, plane_name);
+        return NULL;
+    }
+    size_t alloc_size = sizeof(struct vgpu_display_payload) + pixels_size;
+    if (alloc_size < pixels_size) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): %s allocation overflow\n",
+                __func__, plane_name);
+        return NULL;
+    }
+
+    struct vgpu_display_payload *payload = malloc(alloc_size);
+    if (!payload) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): failed to allocate %s snapshot\n",
+                __func__, plane_name);
+        return NULL;
+    }
+    payload->cpu.format = res_2d->format;
+    payload->cpu.width = width;
+    payload->cpu.height = height;
+    payload->cpu.stride = (uint32_t) row_bytes;
+    payload->cpu.bits_per_pixel = res_2d->bits_per_pixel;
+    payload->cpu.pixels = (uint8_t *) (payload + 1);
+    /* The view is contiguous only when the source stride equals the packed
+     * row size; otherwise every source row carries padding or pixels outside
+     * the view, so the snapshot must be packed row by row.
+     */
+    const uint8_t *src_pixels = (const uint8_t *) res_2d->image +
+                                (size_t) src_y * res_2d->stride +
+                                (size_t) src_x * bytes_per_pixel;
+    if (res_2d->stride == row_bytes) {
+        memcpy(payload->cpu.pixels, src_pixels, pixels_size);
+    } else {
+        for (uint32_t y = 0; y < height; y++) {
+            memcpy(payload->cpu.pixels + (size_t) y * row_bytes,
+                   src_pixels + (size_t) y * res_2d->stride, row_bytes);
+        }
+    }
+    return payload;
+}
+
+/* Backend Implementation */
+static void vgpu_sw_reset(virtio_gpu_state_t *vgpu)
+{
+    /* Older 'PRIMARY_SET' / 'CURSOR_SET' frames published before this reset
+     * may still sit in the display queue. Publishing 'CLEAR' advances the
+     * per-plane generation; 'vgpu_display_pop_cmd()' consumes those clears
+     * first and then drops the older queued frame commands as stale.
+     *
+     * Queued frame payloads are deep copies, so destroying the resources
+     * after the clears were published cannot leave a display payload in the
+     * bridge dangling. The display queue is SPSC and consumer-owned, which is
+     * why reset does not drain it from the producer side; stale payloads are
+     * released by the bounded queue's consumer (SDL) as it pops them.
+     * Each scanout's resource bindings and source rectangle are zeroed here.
+     */
+    for (uint32_t id = 0; id < PRIV(vgpu)->num_scanouts; id++) {
+        struct virtio_gpu_scanout_info *so = &PRIV(vgpu)->scanouts[id];
+        so->primary_resource_id = 0;
+        so->cursor_resource_id = 0;
+        so->src_x = 0;
+        so->src_y = 0;
+        so->src_w = 0;
+        so->src_h = 0;
+        vgpu_display_publish_primary_clear(id);
+        vgpu_display_publish_cursor_clear(id);
+    }
+
+    struct list_head *pos, *tmp;
+    list_for_each_safe (pos, tmp, &g_vgpu_sw_res_2d_list) {
+        vgpu_sw_destroy_resource_2d(
+            list_entry(pos, struct vgpu_sw_resource_2d, list));
+    }
+}
+
+static void vgpu_sw_resource_create_2d_handler(virtio_gpu_state_t *vgpu,
+                                               struct virtq_desc *vq_desc,
+                                               uint32_t *plen)
+{
+    /* RESOURCE_CREATE_2D: validate the request and allocate a host image. */
+
+    const struct virtq_desc *response_desc = vgpu_sw_get_response_desc(
+        vq_desc, sizeof(struct virtio_gpu_ctrl_hdr), plen);
+    if (!response_desc)
+        return;
+
+    struct virtio_gpu_res_create_2d *command = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_res_create_2d));
+    if (!command) {
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+
+    /* Id 0 can never name a real resource: the virtio spec reserves
+     * 'resource_id = 0' as the 'SET_SCANOUT' disable sentinel, and the Linux
+     * virtio-gpu driver allocates its ids as 'handle + 1' (always greater
+     * than 0). See 'virtgpu_object.c' for details.
+     */
+    if (command->resource_id == 0) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): resource id should not be 0\n",
+                __func__);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID);
+        return;
+    }
+
+    if (!command->width || !command->height) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): invalid resource size %ux%u\n",
+                __func__, command->width, command->height);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER);
+        return;
+    }
+
+    /* An id that is already live must not be created again. A duplicate
+     * would orphan the earlier resource (leaking its 'image' and 'iovec',
+     * since 'vgpu_sw_get_resource_2d()' returns the first match) and make
+     * later 'TRANSFER' / 'FLUSH' / 'UNREF' requests for that id ambiguous.
+     * The spec explicitly allows the device to fail this.
+     */
+    if (vgpu_sw_get_resource_2d(command->resource_id) != NULL) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): resource id %u already in use\n",
+                __func__, command->resource_id);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID);
+        return;
+    }
+
+    /* Allocate the zero-initialized resource record. */
+    struct vgpu_sw_resource_2d *newres = calloc(1, sizeof(*newres));
+    if (!newres) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): failed to allocate new resource\n",
+                __func__);
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+    newres->resource_id = command->resource_id;
+
+    /* Only 32bpp packed pixel formats are accepted by this backend today. */
+    uint32_t format_bpp;
+    switch (command->format) {
+    case VIRTIO_GPU_FORMAT_B8G8R8A8_UNORM:
+    case VIRTIO_GPU_FORMAT_B8G8R8X8_UNORM:
+    case VIRTIO_GPU_FORMAT_A8R8G8B8_UNORM:
+    case VIRTIO_GPU_FORMAT_X8R8G8B8_UNORM:
+    case VIRTIO_GPU_FORMAT_R8G8B8A8_UNORM:
+    case VIRTIO_GPU_FORMAT_X8B8G8R8_UNORM:
+    case VIRTIO_GPU_FORMAT_A8B8G8R8_UNORM:
+    case VIRTIO_GPU_FORMAT_R8G8B8X8_UNORM:
+        format_bpp = 32;
+        break;
+    default:
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): unsupported format %u\n",
+                __func__, command->format);
+        free(newres);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER);
+        return;
+    }
+
+    /* Record the geometry and format the guest asked for. */
+    newres->format = command->format;
+    newres->bits_per_pixel = format_bpp;
+    newres->width = command->width;
+    newres->height = command->height;
+
+    /* Derive the row stride in 'size_t' and narrow it to 'uint32_t' only once
+     * the byte count is known to fit. Doing the multiplication in 32 bits
+     * could wrap for a large guest width and store a truncated stride in the
+     * resource.
+     */
+    size_t stride =
+        (((size_t) newres->width * newres->bits_per_pixel + 31) / 32) *
+        sizeof(uint32_t);
+    if (stride > UINT32_MAX) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): stride overflow (%u x %u bpp)\n",
+                __func__, newres->width, newres->bits_per_pixel);
+        free(newres);
+        *plen =
+            virtio_gpu_write_ctrl_response(vgpu, &command->hdr, response_desc,
+                                           VIRTIO_GPU_RESP_ERR_OUT_OF_MEMORY);
+        return;
+    }
+    newres->stride = (uint32_t) stride;
+
+    /* Reject an image size that does not survive the multiplication.
+     * 'stride' and 'height' both come from guest-controlled 'uint32_t' values;
+     * a product that wrapped would yield an undersized buffer while later
+     * transfers still write to the full extent.
+     */
+    size_t image_size = (size_t) newres->stride * newres->height;
+    if (newres->height && image_size / newres->height != newres->stride) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): image size overflow (%u x %u)\n",
+                __func__, newres->width, newres->height);
+        free(newres);
+        *plen =
+            virtio_gpu_write_ctrl_response(vgpu, &command->hdr, response_desc,
+                                           VIRTIO_GPU_RESP_ERR_OUT_OF_MEMORY);
+        return;
+    }
+
+    if (image_size > VGPU_SW_MAX_HOSTMEM ||
+        VGPU_SW_MAX_HOSTMEM - image_size < g_vgpu_sw_hostmem) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): image memory limit exceeded (%zu bytes)\n",
+                __func__, image_size);
+        free(newres);
+        *plen =
+            virtio_gpu_write_ctrl_response(vgpu, &command->hdr, response_desc,
+                                           VIRTIO_GPU_RESP_ERR_OUT_OF_MEMORY);
+        return;
+    }
+
+    /* Zero-filled pixel storage for the whole image. */
+    newres->image = calloc(1, image_size);
+
+    if (!newres->image) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): failed to allocate image buffer (%zu bytes)\n",
+                __func__, image_size);
+        free(newres);
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+    newres->image_size = image_size;
+    g_vgpu_sw_hostmem += image_size;
+    list_push(&newres->list, &g_vgpu_sw_res_2d_list);
+
+    *plen = virtio_gpu_write_ctrl_response(vgpu, &command->hdr, response_desc,
+                                           VIRTIO_GPU_RESP_OK_NODATA);
+}
+
+static void vgpu_sw_cmd_resource_unref_handler(virtio_gpu_state_t *vgpu,
+                                               struct virtq_desc *vq_desc,
+                                               uint32_t *plen)
+{
+    const struct virtq_desc *response_desc = vgpu_sw_get_response_desc(
+        vq_desc, sizeof(struct virtio_gpu_ctrl_hdr), plen);
+    if (!response_desc)
+        return;
+
+    struct virtio_gpu_res_unref *command = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_res_unref));
+    if (!command) {
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+
+    struct vgpu_sw_resource_2d *doomed =
+        vgpu_sw_get_resource_2d(command->resource_id);
+    if (!doomed) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): resource unref references invalid resource id %u\n",
+                __func__, command->resource_id);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID);
+        return;
+    }
+
+    /* RESOURCE_UNREF: unbind the resource from every enabled scanout plane
+     * that still shows it, publishing a clear for that plane, then destroy it.
+     */
+    for (uint32_t i = 0; i < PRIV(vgpu)->num_scanouts; i++) {
+        struct virtio_gpu_scanout_info *scanout = &PRIV(vgpu)->scanouts[i];
+        if (!scanout->enabled)
+            continue;
+
+        if (scanout->primary_resource_id == command->resource_id) {
+            scanout->primary_resource_id = 0;
+            scanout->src_x = scanout->src_y = 0;
+            scanout->src_w = scanout->src_h = 0;
+            vgpu_display_publish_primary_clear(i);
+        }
+        if (scanout->cursor_resource_id == command->resource_id) {
+            scanout->cursor_resource_id = 0;
+            vgpu_display_publish_cursor_clear(i);
+        }
+    }
+
+    vgpu_sw_destroy_resource_2d(doomed);
+
+    *plen = virtio_gpu_write_ctrl_response(vgpu, &command->hdr, response_desc,
+                                           VIRTIO_GPU_RESP_OK_NODATA);
+}
+
+static void vgpu_sw_cmd_set_scanout_handler(virtio_gpu_state_t *vgpu,
+                                            struct virtq_desc *vq_desc,
+                                            uint32_t *plen)
+{
+    /* SET_SCANOUT: bind a resource view to a scanout, or unbind it. */
+
+    const struct virtq_desc *response_desc = vgpu_sw_get_response_desc(
+        vq_desc, sizeof(struct virtio_gpu_ctrl_hdr), plen);
+    if (!response_desc)
+        return;
+
+    struct virtio_gpu_set_scanout *command = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_set_scanout));
+    if (!command) {
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+
+    struct virtio_gpu_scanout_info *scanout =
+        vgpu_sw_get_scanout(vgpu, command->scanout_id);
+    if (!scanout) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): invalid scanout id %u\n",
+                __func__, command->scanout_id);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_SCANOUT_ID);
+        return;
+    }
+
+    /* 'resource_id = 0' is the spec-defined 'SET_SCANOUT' disable sentinel
+     * (real resources never use id 0): unbind the primary plane, publish a
+     * clear, and report success.
+     */
+    if (command->resource_id == 0) {
+        scanout->primary_resource_id = 0;
+        scanout->src_x = scanout->src_y = 0;
+        scanout->src_w = scanout->src_h = 0;
+        vgpu_display_publish_primary_clear(command->scanout_id);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc, VIRTIO_GPU_RESP_OK_NODATA);
+        return;
+    }
+
+    /* Look up the resource that will feed this scanout. */
+    struct vgpu_sw_resource_2d *source =
+        vgpu_sw_get_resource_2d(command->resource_id);
+    if (!source) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): invalid resource id %u\n",
+                __func__, command->resource_id);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID);
+        return;
+    }
+
+    /* The source rectangle has to lie inside the resource; validate that
+     * without relying on wrapping 32-bit additions.
+     */
+    if (!vgpu_sw_rect_fits(source->width, source->height, &command->r)) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): source rect %u,%u %ux%u exceeds resource %ux%u\n",
+                __func__, command->r.x, command->r.y, command->r.width,
+                command->r.height, source->width, source->height);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER);
+        return;
+    }
+
+    /* The view is shown on this scanout, so it may not be larger than the
+     * scanout size that was advertised.
+     */
+    if (scanout->width < command->r.width ||
+        scanout->height < command->r.height) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): source rect %ux%u exceeds scanout %ux%u\n",
+                __func__, command->r.width, command->r.height, scanout->width,
+                scanout->height);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER);
+        return;
+    }
+
+    /* Bind the scanout and remember which part of the resource it shows. */
+    scanout->primary_resource_id = source->resource_id;
+    scanout->src_x = command->r.x;
+    scanout->src_y = command->r.y;
+    scanout->src_w = command->r.width;
+    scanout->src_h = command->r.height;
+
+    *plen = virtio_gpu_write_ctrl_response(vgpu, &command->hdr, response_desc,
+                                           VIRTIO_GPU_RESP_OK_NODATA);
+}
+
+static void vgpu_sw_cmd_resource_flush_handler(virtio_gpu_state_t *vgpu,
+                                               struct virtq_desc *vq_desc,
+                                               uint32_t *plen)
+{
+    /* RESOURCE_FLUSH: publish a fresh frame to every scanout showing it. */
+
+    const struct virtq_desc *response_desc = vgpu_sw_get_response_desc(
+        vq_desc, sizeof(struct virtio_gpu_ctrl_hdr), plen);
+    if (!response_desc)
+        return;
+
+    struct virtio_gpu_res_flush *command = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_res_flush));
+    if (!command) {
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+
+    /* Look up the resource being flushed. */
+    struct vgpu_sw_resource_2d *source =
+        vgpu_sw_get_resource_2d(command->resource_id);
+    if (!source) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): invalid resource id %u\n",
+                __func__, command->resource_id);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID);
+        return;
+    }
+
+    if (!vgpu_sw_rect_fits(source->width, source->height, &command->r)) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): invalid flush rect %u,%u %ux%u for resource %u size "
+                "%ux%u\n",
+                __func__, command->r.x, command->r.y, command->r.width,
+                command->r.height, command->resource_id, source->width,
+                source->height);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER);
+        return;
+    }
+
+    /* Every scanout whose primary plane is bound to this resource gets a new
+     * frame. The frame covers the source rectangle that 'SET_SCANOUT'
+     * recorded, so only that sub-region of the resource is displayed.
+     */
+    for (uint32_t i = 0; i < PRIV(vgpu)->num_scanouts; i++) {
+        struct virtio_gpu_scanout_info *scanout = &PRIV(vgpu)->scanouts[i];
+
+        if (!(scanout->enabled &&
+              scanout->primary_resource_id == command->resource_id))
+            continue;
+
+        /* The producer must never block: when the display queue is full, or
+         * the snapshot allocation below fails, this frame for scanout 'i' is
+         * dropped and the frontend keeps showing the last published frame.
+         */
+        if (!vgpu_display_can_publish())
+            continue;
+
+        /* The snapshot always covers the whole 'SET_SCANOUT' view of this
+         * scanout; 'command->r' is not used to trim the payload further for
+         * now.
+         */
+        struct vgpu_display_payload *frame =
+            vgpu_sw_create_window_payload(source, scanout, "primary");
+        if (frame)
+            vgpu_display_publish_primary_set(i, frame);
+    }
+
+    *plen = virtio_gpu_write_ctrl_response(vgpu, &command->hdr, response_desc,
+                                           VIRTIO_GPU_RESP_OK_NODATA);
+}
+
+static void vgpu_sw_cmd_transfer_to_host_2d_handler(virtio_gpu_state_t *vgpu,
+                                                    struct virtq_desc *vq_desc,
+                                                    uint32_t *plen)
+{
+    /* TRANSFER_TO_HOST_2D: copy guest backing pixels into the host image. */
+
+    const struct virtq_desc *response_desc = vgpu_sw_get_response_desc(
+        vq_desc, sizeof(struct virtio_gpu_ctrl_hdr), plen);
+    if (!response_desc)
+        return;
+
+    struct virtio_gpu_trans_to_host_2d *cmd = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_trans_to_host_2d));
+    if (!cmd) {
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+
+    struct vgpu_sw_resource_2d *target =
+        vgpu_sw_get_resource_2d(cmd->resource_id);
+    if (!target) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): invalid resource id %u\n",
+                __func__, cmd->resource_id);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &cmd->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID);
+        return;
+    }
+
+    /* A transfer needs guest backing to read from. */
+    if (!target->iovec) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): backing not attached for resource %u\n",
+                __func__, cmd->resource_id);
+        *plen = virtio_gpu_write_ctrl_response(vgpu, &cmd->hdr, response_desc,
+                                               VIRTIO_GPU_RESP_ERR_UNSPEC);
+        return;
+    }
+
+    /* The destination rectangle has to lie inside the resource; the check
+     * mirrors the one in 'vgpu_sw_cmd_set_scanout_handler()'.
+     */
+    if (!vgpu_sw_rect_fits(target->width, target->height, &cmd->r)) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): invalid transfer rect %u,%u %ux%u for resource %u size "
+                "%ux%u\n",
+                __func__, cmd->r.x, cmd->r.y, cmd->r.width, cmd->r.height,
+                cmd->resource_id, target->width, target->height);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &cmd->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER);
+        return;
+    }
+
+    if (!vgpu_sw_transfer_source_fits(cmd, target)) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): transfer source exceeds backing\n",
+                __func__);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &cmd->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER);
+        return;
+    }
+
+    /* Copy the pixel data from the guest pages into the host image. */
+    if (!vgpu_sw_copy_image_from_pages(cmd, target)) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): incomplete transfer from backing\n",
+                __func__);
+        *plen = virtio_gpu_write_ctrl_response(vgpu, &cmd->hdr, response_desc,
+                                               VIRTIO_GPU_RESP_ERR_UNSPEC);
+        return;
+    }
+
+    *plen = virtio_gpu_write_ctrl_response(vgpu, &cmd->hdr, response_desc,
+                                           VIRTIO_GPU_RESP_OK_NODATA);
+}
+
+static void vgpu_sw_cmd_resource_attach_backing_handler(
+    virtio_gpu_state_t *vgpu,
+    struct virtq_desc *vq_desc,
+    uint32_t *plen)
+{
+    /* Attach the guest page list in 'vq_desc[1]' to a 2D resource. */
+
+    const struct virtq_desc *response_desc = vgpu_sw_get_response_desc(
+        vq_desc, sizeof(struct virtio_gpu_ctrl_hdr), plen);
+    if (!response_desc)
+        return;
+
+    struct virtio_gpu_res_attach_backing *backing_info = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_res_attach_backing));
+    if (!backing_info) {
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+
+    if (vq_desc[1].flags & VIRTIO_DESC_F_WRITE) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): backing entries descriptor is writable\n",
+                __func__);
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+
+    /* Read the entry count once so the bound check, the allocation and the
+     * fill loop share one value (the request presumably aliases guest memory).
+     */
+    const uint32_t nr_entries = backing_info->nr_entries;
+    if (nr_entries == 0 || nr_entries > VGPU_SW_MAX_BACKING_ENTRIES) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): invalid backing entry count %u\n",
+                __func__, nr_entries);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &backing_info->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER);
+        return;
+    }
+
+    /* The entry cap above keeps 'entries_size' small. semu currently targets
+     * 64-bit hosts, so this path does not guard for 32-bit host overflow yet.
+     */
+    size_t entries_size = sizeof(struct virtio_gpu_mem_entry) * nr_entries;
+
+    if (vq_desc[1].len < entries_size) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): backing entries descriptor too small\n",
+                __func__);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &backing_info->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER);
+        return;
+    }
+
+    struct virtio_gpu_mem_entry *pages = virtio_gpu_mem_guest_to_host(
+        vgpu, vq_desc[1].addr, (uint32_t) entries_size);
+    if (!pages) {
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+
+    struct vgpu_sw_resource_2d *res_2d =
+        vgpu_sw_get_resource_2d(backing_info->resource_id);
+    if (!res_2d) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): invalid resource id %u\n",
+                __func__, backing_info->resource_id);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &backing_info->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID);
+        return;
+    }
+
+    if (res_2d->iovec) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): backing already attached for resource %u\n",
+                __func__, backing_info->resource_id);
+        *plen = virtio_gpu_write_ctrl_response(vgpu, &backing_info->hdr,
+                                               response_desc,
+                                               VIRTIO_GPU_RESP_ERR_UNSPEC);
+        return;
+    }
+
+    /* Build the 'iovec' array privately and attach it only after every entry
+     * translates, so no failure leaves 'page_cnt' and 'iovec' out of sync.
+     */
+    struct iovec *iovec = malloc(sizeof(struct iovec) * nr_entries);
+    if (!iovec) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): failed to allocate io vector\n",
+                __func__);
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+
+    for (size_t i = 0; i < nr_entries; i++) {
+        /* Snapshot each entry so validation and use see the same values. */
+        const uint64_t addr = pages[i].addr;
+        const uint32_t length = pages[i].length;
+
+        if (addr > UINT32_MAX) {
+            fprintf(stderr,
+                    VIRTIO_GPU_LOG_PREFIX "%s(): page %zu addr_high non-zero\n",
+                    __func__, i);
+            free(iovec);
+            *plen = virtio_gpu_write_ctrl_response(
+                vgpu, &backing_info->hdr, response_desc,
+                VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER);
+            return;
+        }
+
+        iovec[i].iov_base =
+            virtio_gpu_mem_guest_to_host(vgpu, (uint32_t) addr, length);
+        iovec[i].iov_len = length;
+        if (!iovec[i].iov_base) {
+            fprintf(stderr,
+                    VIRTIO_GPU_LOG_PREFIX
+                    "%s(): backing entry %zu guest address 0x%llx length %u "
+                    "is out of guest RAM\n",
+                    __func__, i, (unsigned long long) addr, length);
+            free(iovec);
+            *plen = virtio_gpu_write_ctrl_response(
+                vgpu, &backing_info->hdr, response_desc,
+                VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER);
+            return;
+        }
+    }
+
+    res_2d->iovec = iovec;
+    res_2d->page_cnt = nr_entries;
+    *plen = virtio_gpu_write_ctrl_response(
+        vgpu, &backing_info->hdr, response_desc, VIRTIO_GPU_RESP_OK_NODATA);
+}
+
+static void vgpu_sw_cmd_resource_detach_backing_handler(
+    virtio_gpu_state_t *vgpu,
+    struct virtq_desc *vq_desc,
+    uint32_t *plen)
+{
+    /* RESOURCE_DETACH_BACKING: drop the 'iovec' array of a 2D resource. */
+
+    const struct virtq_desc *response_desc = vgpu_sw_get_response_desc(
+        vq_desc, sizeof(struct virtio_gpu_ctrl_hdr), plen);
+    if (!response_desc)
+        return;
+
+    struct virtio_gpu_res_detach_backing *command = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_res_detach_backing));
+    if (!command) {
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+
+    struct vgpu_sw_resource_2d *target =
+        vgpu_sw_get_resource_2d(command->resource_id);
+    if (!target) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): invalid resource id %u\n",
+                __func__, command->resource_id);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc,
+            VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID);
+        return;
+    }
+
+    /* Nothing can be detached from a resource without backing. */
+    if (!target->iovec) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): no backing for resource %u\n",
+                __func__, command->resource_id);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &command->hdr, response_desc, VIRTIO_GPU_RESP_ERR_UNSPEC);
+        return;
+    }
+
+    /* Release the 'iovec' array and mark the resource as unbacked. */
+    free(target->iovec);
+    target->iovec = NULL;
+    target->page_cnt = 0;
+
+    *plen = virtio_gpu_write_ctrl_response(vgpu, &command->hdr, response_desc,
+                                           VIRTIO_GPU_RESP_OK_NODATA);
+}
+
+static int32_t vgpu_sw_decode_cursor_coord(uint32_t coord)
+{
+    /* Cursor plane 'crtc_x'/'crtc_y' are signed in Linux but travel through
+     * unsigned 32-bit virtio-gpu wire fields. Recover the two's-complement
+     * value arithmetically rather than through an implementation-defined
+     * signed cast: anything above 'INT32_MAX' is negative and equals
+     * 'coord - 2^32', e.g. '0xffffffff' -> -1 and '0xfffffffe' -> -2.
+     */
+    const int64_t wide = (int64_t) coord;
+    return (int32_t) (wide > INT32_MAX ? wide - ((int64_t) UINT32_MAX + 1)
+                                       : wide);
+}
+
+static void vgpu_sw_cmd_update_cursor_handler(virtio_gpu_state_t *vgpu,
+                                              struct virtq_desc *vq_desc,
+                                              uint32_t *plen)
+{
+    /* UPDATE_CURSOR: bind a new cursor image (or clear it) on one scanout. */
+
+    struct virtio_gpu_update_cursor *update = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_update_cursor));
+    if (!update) {
+        *plen = 0;
+        return;
+    }
+
+    /* Ordinary cursorq commands carry no response descriptor: current Linux
+     * queues cursor buffers through 'virtio_gpu_queue_cursor()' unfenced and
+     * with a single out descriptor, so only that path is handled for now. A
+     * fenced command is logged below and then processed like an unfenced one.
+     *
+     * TODO: Support fenced cursor commands by handling a response descriptor,
+     * echoing the fence id, and auditing every cursor success/error path to
+     * emit a proper control response instead of len=0.
+     */
+    if (update->hdr.flags & VIRTIO_GPU_FLAG_FENCE)
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): fenced cursor command is unsupported\n",
+                __func__);
+
+    struct virtio_gpu_scanout_info *scanout =
+        vgpu_sw_get_scanout(vgpu, update->pos.scanout_id);
+    if (!scanout) {
+        *plen = 0;
+        return;
+    }
+
+    /* Id 0 never names a real resource (the virtio spec documents it as the
+     * 'SET_SCANOUT' disable sentinel; Linux allocates ids as 'handle + 1'),
+     * so here it unbinds the cursor and publishes a cursor clear.
+     */
+    if (update->resource_id == 0) {
+        scanout->cursor_resource_id = 0;
+        vgpu_display_publish_cursor_clear(update->pos.scanout_id);
+        *plen = 0;
+        return;
+    }
+
+    /* Look up the resource holding the new cursor image. */
+    struct vgpu_sw_resource_2d *source =
+        vgpu_sw_get_resource_2d(update->resource_id);
+    if (!source) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): invalid resource id %u\n",
+                __func__, update->resource_id);
+        *plen = 0;
+        return;
+    }
+
+    if (source->width == 0 || source->height == 0 ||
+        source->width > scanout->width || source->height > scanout->height) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): invalid cursor size %ux%u\n",
+                __func__, source->width, source->height);
+        *plen = 0;
+        return;
+    }
+
+    /* No response exists for cursor commands. When the frame would be dropped,
+     * leave 'cursor_resource_id' alone: it names the cursor that is still on
+     * screen, and RESOURCE_UNREF consults it to decide whether a clear must
+     * be published.
+     */
+    if (!vgpu_display_can_publish()) {
+        *plen = 0;
+        return;
+    }
+
+    struct vgpu_display_payload *payload =
+        vgpu_sw_create_window_payload(source, NULL, "cursor");
+    if (!payload) {
+        /* A failed snapshot looks the same to the user as a dropped
+         * publication, so the previous cursor binding stays in place.
+         */
+        *plen = 0;
+        return;
+    }
+    scanout->cursor_resource_id = update->resource_id;
+    const int32_t x = vgpu_sw_decode_cursor_coord(update->pos.x);
+    const int32_t y = vgpu_sw_decode_cursor_coord(update->pos.y);
+    vgpu_display_publish_cursor_set(update->pos.scanout_id, payload, x, y,
+                                    update->hot_x, update->hot_y);
+
+    *plen = 0;
+}
+
+static void vgpu_sw_cmd_move_cursor_handler(virtio_gpu_state_t *vgpu,
+                                            struct virtq_desc *vq_desc,
+                                            uint32_t *plen)
+{
+    /* MOVE_CURSOR: publish a new cursor position for one scanout. */
+
+    struct virtio_gpu_update_cursor *motion = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_update_cursor));
+    if (!motion) {
+        *plen = 0;
+        return;
+    }
+
+    /* Cursorq commands normally have no response descriptor (Linux queues them
+     * unfenced via 'virtio_gpu_queue_cursor()' with a single out descriptor).
+     * A fenced command is logged below but still processed the same way.
+     * TODO: Support fenced cursor commands: handle a response descriptor, echo
+     * the fence id, and emit control responses on every path instead of len=0.
+     */
+    if (motion->hdr.flags & VIRTIO_GPU_FLAG_FENCE)
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): fenced cursor command is unsupported\n",
+                __func__);
+
+    if (!vgpu_sw_get_scanout(vgpu, motion->pos.scanout_id)) {
+        *plen = 0;
+        return;
+    }
+
+    /* Decode the wire coordinates, then publish the move. */
+    const int32_t x = vgpu_sw_decode_cursor_coord(motion->pos.x);
+    const int32_t y = vgpu_sw_decode_cursor_coord(motion->pos.y);
+    vgpu_display_publish_cursor_move(motion->pos.scanout_id, x, y);
+
+    *plen = 0;
+}
+
+/* Command table of the software backend: 'reset' plus one entry per command.
+ * Only CPU-backed 2D scanout resources are supported today: the optional
+ * capset, resource-UUID, blob, virgl/3D context and blob-mapping commands stay
+ * routed to 'VIRTIO_GPU_CMD_UNDEF' so unsupported guest paths fail explicitly.
+ *
+ * TODO: Implement these handlers after the feature bits, backend resource
+ * model, and display payload path grow matching virgl/blob support.
+ */
+const struct virtio_gpu_cmd_backend g_virtio_gpu_backend = {
+    .reset = vgpu_sw_reset,
+    .get_display_info = virtio_gpu_get_display_info_handler,
+    .resource_create_2d = vgpu_sw_resource_create_2d_handler,
+    .resource_unref = vgpu_sw_cmd_resource_unref_handler,
+    .set_scanout = vgpu_sw_cmd_set_scanout_handler,
+    .resource_flush = vgpu_sw_cmd_resource_flush_handler,
+    .transfer_to_host_2d = vgpu_sw_cmd_transfer_to_host_2d_handler,
+    .resource_attach_backing = vgpu_sw_cmd_resource_attach_backing_handler,
+    .resource_detach_backing = vgpu_sw_cmd_resource_detach_backing_handler,
+    .get_capset_info = VIRTIO_GPU_CMD_UNDEF,
+    .get_capset = VIRTIO_GPU_CMD_UNDEF,
+    .get_edid = virtio_gpu_get_edid_handler,
+    .resource_assign_uuid = VIRTIO_GPU_CMD_UNDEF,
+    .resource_create_blob = VIRTIO_GPU_CMD_UNDEF,
+    .set_scanout_blob = VIRTIO_GPU_CMD_UNDEF,
+    .ctx_create = VIRTIO_GPU_CMD_UNDEF,
+    .ctx_destroy = VIRTIO_GPU_CMD_UNDEF,
+    .ctx_attach_resource = VIRTIO_GPU_CMD_UNDEF,
+    .ctx_detach_resource = VIRTIO_GPU_CMD_UNDEF,
+    .resource_create_3d = VIRTIO_GPU_CMD_UNDEF,
+    .transfer_to_host_3d = VIRTIO_GPU_CMD_UNDEF,
+    .transfer_from_host_3d = VIRTIO_GPU_CMD_UNDEF,
+    .submit_3d = VIRTIO_GPU_CMD_UNDEF,
+    .resource_map_blob = VIRTIO_GPU_CMD_UNDEF,
+    .resource_unmap_blob = VIRTIO_GPU_CMD_UNDEF,
+    .update_cursor = vgpu_sw_cmd_update_cursor_handler,
+    .move_cursor = vgpu_sw_cmd_move_cursor_handler,
+};
diff --git a/virtio-gpu.c b/virtio-gpu.c
new file mode 100644
index 00000000..35c23d2f
--- /dev/null
+++ b/virtio-gpu.c
@@ -0,0 +1,1184 @@
+#include <fcntl.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "device.h"
+#include "riscv.h"
+#include "riscv_private.h"
+#include "utils.h"
+#include "virtio-gpu.h"
+#include "virtio.h"
+
+#define VIRTIO_GPU_CMD_TRACE_ENABLED 0 /* 1: print every dispatched command */
+
+#define VIRTIO_F_VERSION_1 1 /* bit 0 of feature word 1 (DeviceFeaturesSel) */
+
+#define VIRTIO_GPU_EVENT_DISPLAY (1 << 0)
+#define VIRTIO_GPU_F_EDID (1 << 1)
+#define VIRTIO_GPU_F_CONTEXT_INIT (1 << 4)
+
+#define VIRTIO_GPU_QUEUE_NUM_MAX 1024
+#define VIRTIO_GPU_QUEUE (vgpu->queues[vgpu->QueueSel])
+#define VIRTIO_GPU_CONTROLQ 0
+#define VIRTIO_GPU_CURSORQ 1
+
+/* EDID block size and the VESA DMT 1024x768@60Hz reference timing. */
+#define EDID_BLOCK_SIZE 128U
+#define DMT_BASE_WIDTH 1024U
+#define DMT_BASE_HEIGHT 768U
+#define DMT_BASE_PIXEL_CLOCK_10KHZ 6500U
+#define DMT_BASE_H_BLANK 320U
+#define DMT_BASE_H_FRONT 24U
+#define DMT_BASE_H_SYNC 136U
+#define DMT_BASE_V_BLANK 38U
+#define DMT_BASE_V_FRONT 3U
+#define DMT_BASE_V_SYNC 6U
+#define DMT_BOUND_FIELD(field, max) \
+    do {                            \
+        if ((field) > (max))        \
+            (field) = (max);        \
+    } while (0)
+
+#define PRIV(x) ((virtio_gpu_data_t *) (x)->priv)
+
+#if VIRTIO_GPU_CMD_TRACE_ENABLED
+#define VIRTIO_GPU_CMD_CASE(cmd, fn)                                 \
+    case VIRTIO_GPU_CMD_##cmd:                                       \
+        printf("(*) semu/virtio-gpu: %s\n", "VIRTIO_GPU_CMD_" #cmd); \
+        g_virtio_gpu_backend.fn(vgpu, vq_desc, plen);                \
+        break;
+#else
+#define VIRTIO_GPU_CMD_CASE(cmd, fn)                  \
+    case VIRTIO_GPU_CMD_##cmd:                        \
+        g_virtio_gpu_backend.fn(vgpu, vq_desc, plen); \
+        break;
+#endif
+
+extern const struct virtio_gpu_cmd_backend g_virtio_gpu_backend;
+static virtio_gpu_data_t virtio_gpu_data;
+
+/* Map guest [addr, addr + size) to a host pointer; NULL if outside RAM. */
+void *virtio_gpu_mem_guest_to_host(virtio_gpu_state_t *vgpu,
+                                   uint32_t addr,
+                                   uint32_t size)
+{
+    if (addr >= RAM_SIZE || size > RAM_SIZE - addr) { /* cannot wrap */
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX
+                "%s(): guest address 0x%x size 0x%x out of bounds\n",
+                __func__, addr, size);
+        return NULL;
+    }
+    return (void *) ((uintptr_t) vgpu->ram + addr);
+}
+
+void virtio_gpu_set_fail(virtio_gpu_state_t *vgpu)
+{
+    vgpu->Status |= VIRTIO_STATUS__DEVICE_NEEDS_RESET; /* latch the failure */
+    if (vgpu->Status & VIRTIO_STATUS__DRIVER_OK) /* notify only if DRIVER_OK */
+        vgpu->InterruptStatus |= VIRTIO_INT__CONF_CHANGE;
+}
+
+void *virtio_gpu_get_request(virtio_gpu_state_t *vgpu,
+                             struct virtq_desc *vq_desc,
+                             size_t request_size)
+{
+    /* The request must be device-readable and fit inside 'vq_desc[0]'. */
+    if (request_size > UINT32_MAX || vq_desc->len < request_size ||
+        (vq_desc->flags & VIRTIO_DESC_F_WRITE))
+        return NULL;
+    return virtio_gpu_mem_guest_to_host(vgpu, vq_desc->addr,
+                                        (uint32_t) request_size);
+}
+
+int virtio_gpu_get_response_desc(struct virtq_desc *vq_desc,
+                                 int max_desc,
+                                 size_t response_size)
+{
+    /* Locate the response buffer under the current fixed-shape descriptor
+     * layout: 'vq_desc[0]' is the request, optional command data follows, and
+     * the first writable descriptor is the response buffer. A writable
+     * descriptor that is too small means the expected response buffer is
+     * malformed, so it is not skipped in favour of a later writable one.
+     *
+     * Returns the index of the response descriptor, or -1 if 'response_size'
+     * exceeds UINT32_MAX or no usable writable descriptor exists.
+     *
+     * TODO: Support generic descriptor-chain parsing.
+     */
+    if (response_size > UINT32_MAX)
+        return -1;
+
+    int idx = 1;
+    while (idx < max_desc && !(vq_desc[idx].flags & VIRTIO_DESC_F_WRITE))
+        idx++;
+
+    /* Either the scan ran off the end or the candidate is too small. */
+    if (idx >= max_desc || vq_desc[idx].len < response_size)
+        return -1;
+
+    return idx;
+}
+
+uint32_t virtio_gpu_write_ctrl_response(
+    virtio_gpu_state_t *vgpu,
+    const struct virtio_gpu_ctrl_hdr *request,
+    const struct virtq_desc *response_desc,
+    uint32_t type)
+{
+    /* Write a header-only 'type' reply; returns bytes written, 0 on error. */
+    const uint32_t hdr_size = sizeof(struct virtio_gpu_ctrl_hdr);
+    if (response_desc->len < hdr_size)
+        return 0;
+
+    struct virtio_gpu_ctrl_hdr *reply =
+        virtio_gpu_mem_guest_to_host(vgpu, response_desc->addr, hdr_size);
+    if (!reply)
+        return 0;
+
+    memset(reply, 0, hdr_size);
+    reply->type = type;
+    if (request->flags & VIRTIO_GPU_FLAG_FENCE) { /* echo the request fence */
+        reply->flags = VIRTIO_GPU_FLAG_FENCE;
+        reply->fence_id = request->fence_id;
+    }
+    return hdr_size;
+}
+
+/* 'virtio_gpu' protocol handlers */
+void virtio_gpu_get_display_info_handler(virtio_gpu_state_t *vgpu,
+                                         struct virtq_desc *vq_desc,
+                                         uint32_t *plen)
+{
+    struct virtio_gpu_ctrl_hdr *request = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_ctrl_hdr));
+    if (!request) {
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+
+    /* A missing, undersized, or out-of-RAM response buffer leaves nothing to
+     * write: report zero bytes written and return.
+     */
+    struct virtio_gpu_resp_disp_info *response = NULL;
+    int resp_idx = virtio_gpu_get_response_desc(vq_desc, VIRTIO_GPU_MAX_DESC,
+                                                sizeof(*response));
+    if (resp_idx >= 0)
+        response = virtio_gpu_mem_guest_to_host(vgpu, vq_desc[resp_idx].addr,
+                                                sizeof(*response));
+    if (!response) {
+        *plen = 0;
+        return;
+    }
+
+    memset(response, 0, sizeof(*response));
+    response->hdr.type = VIRTIO_GPU_RESP_OK_DISPLAY_INFO;
+
+    /* 'GET_DISPLAY_INFO' exposes scanouts as the 'pmodes[]' array, so the array
+     * index is the guest-visible 'scanout_id' used by later requests such as
+     * 'SET_SCANOUT' and 'GET_EDID'.
+     *
+     * The spec describes 'pmodes[]' as per-scanout information without stating
+     * this mapping as a separate rule. semu follows the implicit model where
+     * 'pmodes[i]' describes scanout ID 'i': later requests only carry a
+     * 'scanout_id', and Linux likewise copies 'resp->pmodes[i]' into
+     * 'outputs[i]' and later sends 'output->index' in 'SET_SCANOUT' (see
+     * 'virtgpu_vq.c' and 'virtgpu_display.c').
+     */
+    virtio_gpu_data_t *priv = PRIV(vgpu);
+    const int count = priv->num_scanouts;
+    for (int id = 0; id < count; id++) {
+        response->pmodes[id].r.width = priv->scanouts[id].width;
+        response->pmodes[id].r.height = priv->scanouts[id].height;
+        response->pmodes[id].enabled = priv->scanouts[id].enabled;
+    }
+
+    if (request->flags & VIRTIO_GPU_FLAG_FENCE) {
+        response->hdr.flags = VIRTIO_GPU_FLAG_FENCE;
+        response->hdr.fence_id = request->fence_id;
+    }
+    *plen = sizeof(*response);
+}
+
+static uint8_t virtio_gpu_generate_edid_checksum(uint8_t *edid, size_t size)
+{
+    /* EDID 1.4 Section 3.11, Table 3.40 notes 2 and 3: byte 7Fh must bring the
+     * modulo-256 sum of all 128 base EDID bytes to 00h, so negate the sum.
+     */
+    uint8_t total = 0;
+
+    while (size--)
+        total += *edid++;
+
+    return (uint8_t) (0U - total);
+}
+
+static uint16_t virtio_gpu_edid_pixels_to_mm(uint32_t pixels)
+{
+    /* EDID 1.4 Sections 3.6.2 and 3.10.2: the base block stores screen size in
+     * centimeters, the detailed timing descriptor stores image size in
+     * millimeters. Estimate the virtual display at 100 DPI (0.254 mm per
+     * pixel), round to nearest, and clamp to the 12-bit range 1..4095.
+     */
+    const uint64_t rounded = ((uint64_t) pixels * 254U + 500U) / 1000U;
+
+    if (rounded < 1)
+        return 1;
+    if (rounded > 4095)
+        return 4095;
+    return (uint16_t) rounded;
+}
+
+static uint8_t virtio_gpu_edid_mm_to_cm(uint16_t mm)
+{
+    /* EDID 1.4 Section 3.6.2: base screen size fields are whole centimeters,
+     * so round to nearest and clamp to the one-byte range 1..255.
+     */
+    const uint32_t rounded = (mm + 5U) / 10U;
+
+    if (rounded < 1)
+        return 1;
+
+    return rounded > 255 ? 255 : (uint8_t) rounded;
+}
+
+static void virtio_gpu_edid_set_srgb_chromaticity(uint8_t *edid)
+{
+    /* EDID 1.4 Section 3.7: color characteristics, bytes 25-34. Each sRGB
+     * chromaticity coordinate is stored in 10-bit fixed-point form, value =
+     * round(coordinate * 1024). Entries follow the storage order of the
+     * block: red, green, blue, then the D65 white point (0.3127, 0.3290).
+     */
+    const uint16_t coords[8] = {
+        655, /* red x:   round(0.640 * 1024) */
+        338, /* red y:   round(0.330 * 1024) */
+        307, /* green x: round(0.300 * 1024) */
+        614, /* green y: round(0.600 * 1024) */
+        154, /* blue x:  round(0.150 * 1024) */
+        61,  /* blue y:  round(0.060 * 1024) */
+        320, /* white x: round(0.3127 * 1024) */
+        337, /* white y: round(0.3290 * 1024) */
+    };
+
+    /* Bytes 25-26 pack the low 2 bits of each coordinate, four per byte with
+     * the first one in the top bit pair; bytes 27-34 hold the upper 8 bits of
+     * each coordinate in the same order.
+     */
+    edid[25] = edid[26] = 0;
+    for (size_t i = 0; i < 8; i++) {
+        edid[25 + i / 4] |= (coords[i] & 0x3) << (6 - 2 * (i % 4));
+        edid[27 + i] = coords[i] >> 2;
+    }
+}
+
+static void virtio_gpu_edid_set_detailed_timing(uint8_t *desc,
+                                                uint32_t width,
+                                                uint32_t height,
+                                                uint16_t width_mm,
+                                                uint16_t height_mm)
+{
+    /* Fill 18-byte 'desc' per EDID 1.4 Section 3.10.2 (detailed timing). */
+    uint32_t h_blank;           /* Horizontal blanking pixels. */
+    uint32_t h_front;           /* Horizontal front porch pixels. */
+    uint32_t h_sync;            /* Horizontal sync pulse width. */
+    uint32_t v_blank;           /* Vertical blanking lines. */
+    uint32_t v_front;           /* Vertical front porch lines. */
+    uint32_t v_sync;            /* Vertical sync pulse width. */
+    uint32_t pixel_clock_10khz; /* Pixel clock in 10 kHz units. */
+
+    if (width == DMT_BASE_WIDTH && height == DMT_BASE_HEIGHT) {
+        /* VESA DMT 1024x768@60Hz, also advertised in the base EDID established
+         * timings field. EDID stores pixel clock in 10 kHz units, so 6500
+         * means 65.00 MHz.
+         */
+        pixel_clock_10khz = DMT_BASE_PIXEL_CLOCK_10KHZ;
+        h_blank = DMT_BASE_H_BLANK;
+        h_front = DMT_BASE_H_FRONT;
+        h_sync = DMT_BASE_H_SYNC;
+        v_blank = DMT_BASE_V_BLANK;
+        v_front = DMT_BASE_V_FRONT;
+        v_sync = DMT_BASE_V_SYNC;
+    } else {
+        /* Fallback only for future multi-mode or non-default scanouts. The
+         * current machine registers one 1024x768 scanout, so this path is not
+         * reachable in the default build. Scale porch/sync proportions from
+         * the VESA DMT 1024x768@60Hz timing instead of inventing ad hoc
+         * ratios.
+         */
+        h_blank = ((uint64_t) width * DMT_BASE_H_BLANK + DMT_BASE_WIDTH / 2U) /
+                  DMT_BASE_WIDTH;
+        h_front = ((uint64_t) width * DMT_BASE_H_FRONT + DMT_BASE_WIDTH / 2U) /
+                  DMT_BASE_WIDTH;
+        h_sync = ((uint64_t) width * DMT_BASE_H_SYNC + DMT_BASE_WIDTH / 2U) /
+                 DMT_BASE_WIDTH;
+        if (h_front == 0)
+            h_front = 1;
+        if (h_sync == 0)
+            h_sync = 1;
+        if (h_blank <= h_front + h_sync) {
+            /* Keep front porch and sync pulse inside the blanking interval so
+             * the remaining pixels form the back porch.
+             */
+            h_blank = h_front + h_sync + 1U;
+        }
+
+        v_blank =
+            ((uint64_t) height * DMT_BASE_V_BLANK + DMT_BASE_HEIGHT / 2U) /
+            DMT_BASE_HEIGHT;
+        v_front =
+            ((uint64_t) height * DMT_BASE_V_FRONT + DMT_BASE_HEIGHT / 2U) /
+            DMT_BASE_HEIGHT;
+        v_sync = ((uint64_t) height * DMT_BASE_V_SYNC + DMT_BASE_HEIGHT / 2U) /
+                 DMT_BASE_HEIGHT;
+        if (v_front == 0)
+            v_front = 1;
+        if (v_sync == 0)
+            v_sync = 1;
+        if (v_blank <= v_front + v_sync)
+            v_blank = v_front + v_sync + 1U;
+
+        /* Pixel clock = refresh rate * horizontal total * vertical total.
+         * Divide by 10000 because the descriptor stores the clock in 10 kHz
+         * units. The +5000 rounds to the nearest 10 kHz.
+         */
+        uint64_t clock_10khz = (60U * ((uint64_t) width + h_blank) *
+                                    ((uint64_t) height + v_blank) +
+                                5000U) /
+                               10000U;
+        /* Saturate in 64 bits so narrowing cannot wrap past the clamp. */
+        pixel_clock_10khz = clock_10khz > 0xffffU ? 0xffffU : clock_10khz;
+    }
+
+    /* Clamp fields to the bit widths defined by Table 3.21:
+     * active/blanking/image-size fields are 12-bit, horizontal sync fields are
+     * 10-bit, and vertical sync fields are 6-bit.
+     */
+    DMT_BOUND_FIELD(width, 4095U);
+    DMT_BOUND_FIELD(height, 4095U);
+    DMT_BOUND_FIELD(h_blank, 4095U);
+    DMT_BOUND_FIELD(h_front, 1023U);
+    DMT_BOUND_FIELD(h_sync, 1023U);
+    DMT_BOUND_FIELD(v_blank, 4095U);
+    DMT_BOUND_FIELD(v_front, 63U);
+    DMT_BOUND_FIELD(v_sync, 63U);
+
+    /* Bytes 0-1: pixel clock, little-endian, in 10 kHz units. */
+    desc[0] = pixel_clock_10khz & 0xff;
+    desc[1] = (pixel_clock_10khz >> 8) & 0xff;
+
+    /* Bytes 2-4: horizontal active and blanking, each split as low 8 bits plus
+     * high 4 bits packed into byte 4.
+     */
+    desc[2] = width & 0xff;
+    desc[3] = h_blank & 0xff;
+    desc[4] = ((width >> 8) << 4) | (h_blank >> 8);
+
+    /* Bytes 5-7: vertical active and blanking, using the same 12-bit packing
+     * pattern as the horizontal fields.
+     */
+    desc[5] = height & 0xff;
+    desc[6] = v_blank & 0xff;
+    desc[7] = ((height >> 8) << 4) | (v_blank >> 8);
+
+    /* Bytes 8-11: sync offsets and pulse widths. Horizontal fields are 10-bit;
+     * vertical fields are 6-bit and share byte 10 for their low nibbles.
+     */
+    desc[8] = h_front & 0xff;
+    desc[9] = h_sync & 0xff;
+    desc[10] = ((v_front & 0xf) << 4) | (v_sync & 0xf);
+    desc[11] = ((h_front >> 8) << 6) | ((h_sync >> 8) << 4) |
+               ((v_front >> 4) << 2) | (v_sync >> 4);
+
+    /* Bytes 12-14: displayed image size in millimeters, again as two 12-bit
+     * fields packed as low 8 bits plus high 4 bits.
+     */
+    desc[12] = width_mm & 0xff;
+    desc[13] = height_mm & 0xff;
+    desc[14] = ((width_mm >> 8) << 4) | (height_mm >> 8);
+
+    /* Bytes 15-16: horizontal and vertical border, unused for this display. */
+    desc[15] = 0;
+    desc[16] = 0;
+
+    /* Byte 17: non-interlaced, no stereo, digital separate sync, negative H/V
+     * sync polarity.
+     */
+    desc[17] = 0x18;
+}
+
+/* Build one 128-byte base EDID block (EDID Structure Version 1, Revision 4,
+ * per the VESA Enhanced EDID standard) for a 'width' x 'height' scanout.
+ */
+static void virtio_gpu_generate_edid(uint8_t *edid,
+                                     uint32_t width,
+                                     uint32_t height)
+{
+    /* EDID 1.4 Section 3.1 base block; zero sizes use the SCREEN_* default. */
+    if (width == 0)
+        width = SCREEN_WIDTH;
+    if (height == 0)
+        height = SCREEN_HEIGHT;
+
+    uint16_t width_mm = virtio_gpu_edid_pixels_to_mm(width);
+    uint16_t height_mm = virtio_gpu_edid_pixels_to_mm(height);
+
+    memset(edid, 0, EDID_BLOCK_SIZE);
+
+    /* Check EDID 1.4 Section 3.3: EDID header. */
+    edid[0] = 0x00;
+    edid[1] = 0xff;
+    edid[2] = 0xff;
+    edid[3] = 0xff;
+    edid[4] = 0xff;
+    edid[5] = 0xff;
+    edid[6] = 0xff;
+    edid[7] = 0x00;
+
+    /* Check EDID 1.4 Section 3.4.1: ID Manufacturer Name, stored as a
+     * 3-character PNPID in 5-bit compressed ASCII.
+     */
+    char manufacture[3] = {'T', 'W', 'N'};
+
+    /* Vendor ID uses 2 bytes to store 3 characters, where 'A' starts as 1 */
+    uint16_t vendor_id = ((((manufacture[0] - '@') & 0b11111) << 10) |
+                          (((manufacture[1] - '@') & 0b11111) << 5) |
+                          (((manufacture[2] - '@') & 0b11111) << 0));
+    /* Convert vendor ID to big-endian order */
+    edid[8] = vendor_id >> 8;
+    edid[9] = vendor_id & 0xff;
+
+    /* Check EDID 1.4 Sections 3.4.2 and 3.4.3: product code and serial
+     * number, all zeros if unused.
+     */
+    memset(&edid[10], 0, sizeof(uint16_t) + sizeof(uint32_t));
+
+    /* Check EDID 1.4 Section 3.4.4: week of manufacture, 0 if unused. */
+    edid[16] = 0;
+    /* Check EDID 1.4 Section 3.4.4: year of manufacture starts from 1990. */
+    edid[17] = 2023 - 1990;
+
+    /* Check EDID 1.4 Section 3.5: version 1, revision 4. */
+    edid[18] = 1; /* Version number */
+    edid[19] = 4; /* Revision number */
+
+    /* Check EDID 1.4 Section 3.6.1: video input definition. */
+    uint8_t signal_interface = 0b1 << 7;  /* digital */
+    uint8_t color_bit_depth = 0b010 << 4; /* 8 bits per primary color */
+    uint8_t interface_type = 0b101;       /* DisplayPort is supported */
+    edid[20] = signal_interface | color_bit_depth | interface_type;
+
+    /* Check EDID 1.4 Section 3.6.2: screen size or aspect ratio. */
+    edid[21] = virtio_gpu_edid_mm_to_cm(width_mm);
+    edid[22] = virtio_gpu_edid_mm_to_cm(height_mm);
+
+    /* Check EDID 1.4 Section 3.6.3: gamma value. */
+    edid[23] = 120; /* 2.20 */
+
+    /* Check EDID 1.4 Section 3.6.4: feature support. */
+    uint8_t power_management = 0 << 5; /* [7:5]: standby, suspend and
+                                        * active-off modes are not supported
+                                        */
+    uint8_t color_type = 0 << 3;       /* [4:3]: RGB 4:4:4 */
+    uint8_t other_flags = 0b110;       /* [2]: sRGB as default color space
+                                        * [1]: preferred timing is native format
+                                        * [0]: non-continuous frequency
+                                        */
+    edid[24] = power_management | color_type | other_flags;
+
+    virtio_gpu_edid_set_srgb_chromaticity(edid);
+
+    /* Check EDID 1.4 Section 3.8: established timings. These are the default
+     * timings defined by the VESA. Each bit represents 1 configuration. Only
+     * the 1024x768@60Hz bit is set, and only when the scanout uses that mode.
+     */
+    edid[35] = 0b00000000;
+    edid[36] = (width == DMT_BASE_WIDTH && height == DMT_BASE_HEIGHT)
+                   ? 0b00001000
+                   : 0b00000000;
+    edid[37] = 0b00000000;
+
+    /* Check EDID 1.4 Section 3.9: standard timings. The 16 bytes from
+     * edid[38] to edid[53] hold eight 2-byte timing identifiers. Mark every
+     * standard timing slot unused.
+     */
+    memset(&edid[38], 0x01, 16);
+
+    /* Check EDID 1.4 Sections 3.10.1 and 3.10.2: first detailed timing
+     * descriptor is the preferred timing mode, here the native scanout mode at
+     * 60Hz.
+     */
+    virtio_gpu_edid_set_detailed_timing(&edid[54], width, height, width_mm,
+                                        height_mm);
+
+    /* Check EDID 1.4 Sections 3.10 and 3.10.3.11: mark remaining 18-byte
+     * descriptor slots unused with Dummy Descriptor tag 10h.
+     */
+    for (size_t desc = 72; desc < 126; desc += 18)
+        edid[desc + 3] = 0x10;
+
+    /* Check EDID 1.4 Section 3.11: extension block count. */
+    edid[126] = 0; /* No other extension blocks are defined */
+
+    /* Check EDID 1.4 Section 3.11: checksum of the base EDID block. */
+    edid[EDID_BLOCK_SIZE - 1U] =
+        virtio_gpu_generate_edid_checksum(edid, EDID_BLOCK_SIZE - 1U);
+}
+
+void virtio_gpu_get_edid_handler(virtio_gpu_state_t *vgpu,
+                                 struct virtq_desc *vq_desc,
+                                 uint32_t *plen)
+{
+    struct virtio_gpu_cmd_get_edid *request = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_cmd_get_edid));
+    if (!request) {
+        virtio_gpu_set_fail(vgpu);
+        *plen = 0;
+        return;
+    }
+
+    int resp_idx = virtio_gpu_get_response_desc(
+        vq_desc, VIRTIO_GPU_MAX_DESC, sizeof(struct virtio_gpu_resp_edid));
+    if (resp_idx < 0) {
+        *plen = 0;
+        return;
+    }
+
+    /* Only an enabled scanout below 'num_scanouts' has an EDID to report. */
+    const uint32_t id = request->scanout;
+    const virtio_gpu_data_t *priv = PRIV(vgpu);
+    if (id >= priv->num_scanouts || !priv->scanouts[id].enabled) {
+        fprintf(stderr, VIRTIO_GPU_LOG_PREFIX "%s(): invalid scanout id %u\n",
+                __func__, id);
+        *plen = virtio_gpu_write_ctrl_response(
+            vgpu, &request->hdr, &vq_desc[resp_idx],
+            VIRTIO_GPU_RESP_ERR_INVALID_SCANOUT_ID);
+        return;
+    }
+
+    struct virtio_gpu_resp_edid *response = virtio_gpu_mem_guest_to_host(
+        vgpu, vq_desc[resp_idx].addr, sizeof(struct virtio_gpu_resp_edid));
+    if (!response) {
+        *plen = 0;
+        return;
+    }
+
+    /* Reply with one freshly generated base EDID block for that scanout. */
+    memset(response, 0, sizeof(*response));
+    response->hdr.type = VIRTIO_GPU_RESP_OK_EDID;
+    response->size = EDID_BLOCK_SIZE;
+    virtio_gpu_generate_edid((uint8_t *) response->edid,
+                             priv->scanouts[id].width,
+                             priv->scanouts[id].height);
+
+    if (request->hdr.flags & VIRTIO_GPU_FLAG_FENCE) {
+        response->hdr.flags = VIRTIO_GPU_FLAG_FENCE;
+        response->hdr.fence_id = request->hdr.fence_id;
+    }
+    *plen = sizeof(*response); /* return write length */
+}
+
+void virtio_gpu_cmd_undefined_handler(virtio_gpu_state_t *vgpu,
+                                      struct virtq_desc *vq_desc,
+                                      uint32_t *plen)
+{
+    /* Fallback for commands the backend does not implement. Whether or not
+     * the request header is readable, the device is marked as failed and no
+     * response is written; the command type is logged only when the header
+     * could be read.
+     */
+    struct virtio_gpu_ctrl_hdr *header = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_ctrl_hdr));
+    if (header)
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): unsupported VirtIO-GPU command type %u\n",
+                __func__, header->type);
+
+    /* Zero bytes were written to the response buffer. */
+    virtio_gpu_set_fail(vgpu);
+    *plen = 0;
+}
+
+static int virtio_gpu_desc_handler(virtio_gpu_state_t *vgpu,
+                                   const virtio_gpu_queue_t *queue,
+                                   int queue_index,
+                                   uint32_t desc_idx,
+                                   uint32_t *plen)
+{
+    struct virtq_desc vq_desc[VIRTIO_GPU_MAX_DESC] = {0};
+
+    /* Collect up to 'VIRTIO_GPU_MAX_DESC' descriptors of the chain. */
+    for (int i = 0; i < VIRTIO_GPU_MAX_DESC; i++) {
+        /* The size of 'struct virtq_desc' is 4 words. Reject an index beyond
+         * the queue or a descriptor that would lie past the end of guest RAM.
+         */
+        uint32_t desc_offset = queue->QueueDesc + desc_idx * 4;
+        if (desc_idx >= queue->QueueNum || desc_offset > RAM_SIZE / 4 - 4) {
+            virtio_gpu_set_fail(vgpu);
+            return -1;
+        }
+        uint32_t *desc = &vgpu->ram[desc_offset];
+
+        /* The guest is riscv32, so 'addr_high' of every descriptor must be
+         * zero. Reject it here, before later code truncates the address via
+         * 'virtio_gpu_mem_guest_to_host()' and silently masks a guest bug.
+         */
+        if (desc[1] != 0) {
+            virtio_gpu_set_fail(vgpu);
+            return -1;
+        }
+
+        /* Retrieve the fields of the current descriptor. */
+        vq_desc[i].addr = desc[0];
+        vq_desc[i].len = desc[2];
+        vq_desc[i].flags = desc[3];
+        desc_idx = desc[3] >> 16; /* 'vq_desc[i].next' */
+
+        /* Leave the loop if 'VIRTIO_DESC_F_NEXT' is not set. */
+        if (!(vq_desc[i].flags & VIRTIO_DESC_F_NEXT))
+            break;
+    }
+
+    struct virtio_gpu_ctrl_hdr *header = virtio_gpu_get_request(
+        vgpu, vq_desc, sizeof(struct virtio_gpu_ctrl_hdr));
+    if (!header) {
+        virtio_gpu_set_fail(vgpu);
+        return -1;
+    }
+
+    bool is_cursor_cmd = header->type == VIRTIO_GPU_CMD_UPDATE_CURSOR ||
+                         header->type == VIRTIO_GPU_CMD_MOVE_CURSOR;
+    if ((queue_index == VIRTIO_GPU_CONTROLQ && is_cursor_cmd) ||
+        (queue_index == VIRTIO_GPU_CURSORQ && !is_cursor_cmd)) {
+        virtio_gpu_set_fail(vgpu);
+        return -1;
+    }
+
+    /* Keep the fixed 3-descriptor contract explicit. Longer chains need
+     * multi-SG parsing, so reject them before command dispatch.
+     *
+     * TODO: Support generic descriptor-chain parsing.
+     */
+    if (vq_desc[VIRTIO_GPU_MAX_DESC - 1].flags & VIRTIO_DESC_F_NEXT) {
+        int resp_idx = virtio_gpu_get_response_desc(
+            vq_desc, VIRTIO_GPU_MAX_DESC, sizeof(struct virtio_gpu_ctrl_hdr));
+        if (resp_idx < 0) {
+            fprintf(stderr,
+                    VIRTIO_GPU_LOG_PREFIX
+                    "%s(): descriptor chain exceeds supported length and has "
+                    "no usable response descriptor\n",
+                    __func__);
+            virtio_gpu_set_fail(vgpu);
+            return -1;
+        }
+
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): descriptor chain exceeds supported length\n",
+                __func__);
+        *plen = virtio_gpu_write_ctrl_response(vgpu, header, &vq_desc[resp_idx],
+                                               VIRTIO_GPU_RESP_ERR_UNSPEC);
+        if (!*plen) {
+            virtio_gpu_set_fail(vgpu);
+            return -1;
+        }
+
+        return 0;
+    }
+
+    /* Process the command */
+    switch (header->type) {
+        /* 2D commands */
+        VIRTIO_GPU_CMD_CASE(GET_DISPLAY_INFO, get_display_info)
+        VIRTIO_GPU_CMD_CASE(RESOURCE_CREATE_2D, resource_create_2d)
+        VIRTIO_GPU_CMD_CASE(RESOURCE_UNREF, resource_unref)
+        VIRTIO_GPU_CMD_CASE(SET_SCANOUT, set_scanout)
+        VIRTIO_GPU_CMD_CASE(RESOURCE_FLUSH, resource_flush)
+        VIRTIO_GPU_CMD_CASE(TRANSFER_TO_HOST_2D, transfer_to_host_2d)
+        VIRTIO_GPU_CMD_CASE(RESOURCE_ATTACH_BACKING, resource_attach_backing)
+        VIRTIO_GPU_CMD_CASE(RESOURCE_DETACH_BACKING, resource_detach_backing)
+        VIRTIO_GPU_CMD_CASE(GET_CAPSET_INFO, get_capset_info)
+        VIRTIO_GPU_CMD_CASE(GET_CAPSET, get_capset)
+        VIRTIO_GPU_CMD_CASE(GET_EDID, get_edid)
+        VIRTIO_GPU_CMD_CASE(RESOURCE_ASSIGN_UUID, resource_assign_uuid)
+        VIRTIO_GPU_CMD_CASE(RESOURCE_CREATE_BLOB, resource_create_blob)
+        VIRTIO_GPU_CMD_CASE(SET_SCANOUT_BLOB, set_scanout_blob)
+        /* 3D commands */
+        VIRTIO_GPU_CMD_CASE(CTX_CREATE, ctx_create)
+        VIRTIO_GPU_CMD_CASE(CTX_DESTROY, ctx_destroy)
+        VIRTIO_GPU_CMD_CASE(CTX_ATTACH_RESOURCE, ctx_attach_resource)
+        VIRTIO_GPU_CMD_CASE(CTX_DETACH_RESOURCE, ctx_detach_resource)
+        VIRTIO_GPU_CMD_CASE(RESOURCE_CREATE_3D, resource_create_3d)
+        VIRTIO_GPU_CMD_CASE(TRANSFER_TO_HOST_3D, transfer_to_host_3d)
+        VIRTIO_GPU_CMD_CASE(TRANSFER_FROM_HOST_3D, transfer_from_host_3d)
+        VIRTIO_GPU_CMD_CASE(SUBMIT_3D, submit_3d)
+        VIRTIO_GPU_CMD_CASE(RESOURCE_MAP_BLOB, resource_map_blob)
+        VIRTIO_GPU_CMD_CASE(RESOURCE_UNMAP_BLOB, resource_unmap_blob)
+        VIRTIO_GPU_CMD_CASE(UPDATE_CURSOR, update_cursor)
+        VIRTIO_GPU_CMD_CASE(MOVE_CURSOR, move_cursor)
+    default:
+        virtio_gpu_cmd_undefined_handler(vgpu, vq_desc, plen);
+        return -1;
+    }
+
+    return 0;
+}
+
+static void virtio_gpu_queue_notify_handler(virtio_gpu_state_t *vgpu, int index)
+{
+    uint32_t *ram = vgpu->ram;
+    virtio_gpu_queue_t *queue = &vgpu->queues[index];
+    if (vgpu->Status & VIRTIO_STATUS__DEVICE_NEEDS_RESET)
+        return;
+
+    if (!((vgpu->Status & VIRTIO_STATUS__DRIVER_OK) && queue->ready))
+        return virtio_gpu_set_fail(vgpu);
+
+    /* Check for new buffers */
+    uint16_t new_avail = ram[queue->QueueAvail] >> 16;
+    uint16_t avail_delta = (uint16_t) (new_avail - queue->last_avail);
+    if (avail_delta > (uint16_t) queue->QueueNum) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): queue %d avail index advanced by %u entries, exceeds "
+                "queue size %u\n",
+                __func__, index, (unsigned) avail_delta,
+                (unsigned) queue->QueueNum);
+        virtio_gpu_set_fail(vgpu);
+        return;
+    }
+
+    if (queue->last_avail == new_avail)
+        return;
+
+    /* Reject rings that would extend past the end of guest RAM. */
+    uint64_t num = queue->QueueNum, ram_words = RAM_SIZE / 4;
+    if (queue->QueueAvail + 1 + (num + 1) / 2 > ram_words ||
+        queue->QueueUsed + 1 + 2 * num > ram_words)
+        return virtio_gpu_set_fail(vgpu);
+
+    /* Process them */
+    uint16_t new_used = ram[queue->QueueUsed] >> 16; /* 'virtq_used.idx' */
+    while (queue->last_avail != new_avail) {
+        /* Obtain the index in the ring buffer */
+        uint16_t queue_idx = queue->last_avail % queue->QueueNum;
+
+        /* Ring entries are 16-bit but 'ram' is indexed in 32-bit words, with
+         * entry 0 at 'ram[queue->QueueAvail + 1]': select the word, then the
+         * half given by the index parity (see 'struct virtq_avail').
+         */
+        uint16_t buffer_idx = ram[queue->QueueAvail + 1 + queue_idx / 2] >>
+                              (16 * (queue_idx % 2));
+
+        /* Consume the request and process its descriptor list. */
+        uint32_t len = 0;
+        int result =
+            virtio_gpu_desc_handler(vgpu, queue, index, buffer_idx, &len);
+        if (result != 0)
+            return;
+
+        /* Write used element information ('struct virtq_used_elem') to the
+         * used queue: two words per element, after the flags/idx word. */
+        uint32_t vq_used_addr =
+            queue->QueueUsed + 1 + (new_used % queue->QueueNum) * 2;
+        ram[vq_used_addr] = buffer_idx; /* 'virtq_used_elem.id'  (le32) */
+        ram[vq_used_addr + 1] = len;    /* 'virtq_used_elem.len' (le32) */
+        queue->last_avail++;
+        new_used++;
+    }
+
+    /* Update 'virtq_used.idx' (keep 'virtq_used.flags' in low 16 bits). */
+    ram[queue->QueueUsed] &= MASK(16); /* clear high 16 bits (idx) */
+    ram[queue->QueueUsed] |= ((uint32_t) new_used) << 16; /* set idx */
+
+    /* Send interrupt, unless 'VIRTQ_AVAIL_F_NO_INTERRUPT' is set. */
+    if (!(ram[queue->QueueAvail] & 1))
+        vgpu->InterruptStatus |= VIRTIO_INT__USED_RING;
+}
+
+static inline uint32_t virtio_gpu_preprocess(virtio_gpu_state_t *vgpu,
+                                             uint32_t addr)
+{
+    /* Byte address -> word index; unaligned or out-of-RAM fails with 0. */
+    if (addr < RAM_SIZE && (addr & 0b11) == 0)
+        return addr >> 2;
+    return virtio_gpu_set_fail(vgpu), 0;
+}
+
+static void virtio_gpu_update_status(virtio_gpu_state_t *vgpu, uint32_t status)
+{
+    /* A non-zero write only accumulates status bits; zero requests a reset. */
+    vgpu->Status |= status;
+    if (status != 0)
+        return;
+
+    /* Renderer-specific bindings and resources live behind the backend hook
+     * and are reset first, before the generic device state is cleared.
+     */
+    if (g_virtio_gpu_backend.reset)
+        g_virtio_gpu_backend.reset(vgpu);
+
+    /* Wipe the VirtIO device state (feature negotiation, queue descriptors,
+     * avail/used rings, status and interrupt registers). 'ram' and 'priv' are
+     * host-provided infrastructure pointers, not device state, so they are
+     * carried across the 'memset()'. The 'virtio_gpu_data_t' behind 'priv' is
+     * intentionally NOT reset: it holds host-configured scanout info (display
+     * dimensions / enabled flags) set up before the guest driver probes the
+     * device; the guest re-reads it via 'CMD_GET_DISPLAY_INFO' after a reset.
+     */
+    uint32_t *const saved_ram = vgpu->ram;
+    void *const saved_priv = vgpu->priv;
+    memset(vgpu, 0, sizeof(*vgpu));
+    vgpu->ram = saved_ram;
+    vgpu->priv = saved_priv;
+}
+
+static bool virtio_gpu_reg_read(virtio_gpu_state_t *vgpu,
+                                uint32_t addr,
+                                uint32_t *value)
+{
+#define _(reg) VIRTIO_##reg
+    switch (addr) { /* 'addr' is a word index (byte offset >> 2) */
+    case _(MagicValue):
+        *value = 0x74726976; /* "virt" as little-endian ASCII */
+        return true;
+    case _(Version):
+        *value = 2; /* virtio-mmio version 2 (non-legacy interface) */
+        return true;
+    case _(DeviceID):
+        *value = 16; /* virtio device ID of the GPU device */
+        return true;
+    case _(VendorID):
+        *value = VIRTIO_VENDOR_ID;
+        return true;
+    case _(DeviceFeatures):
+        /* TODO: Advertise virgl/3D and blob-resource feature bits after the
+         * backend supports their command and display paths.
+         */
+        *value = vgpu->DeviceFeaturesSel == 0
+                     ? VIRTIO_GPU_F_EDID
+                     : (vgpu->DeviceFeaturesSel == 1 ? VIRTIO_F_VERSION_1 : 0);
+        return true;
+    case _(QueueNumMax):
+        *value = VIRTIO_GPU_QUEUE_NUM_MAX;
+        return true;
+    case _(QueueReady):
+        *value = VIRTIO_GPU_QUEUE.ready ? 1 : 0;
+        return true;
+    case _(InterruptStatus):
+        *value = vgpu->InterruptStatus;
+        return true;
+    case _(Status):
+        *value = vgpu->Status;
+        return true;
+    case _(SHMLenLow):
+    case _(SHMLenHigh):
+        /* TODO: Implement shared-memory regions before advertising
+         * VIRTIO_GPU_F_RESOURCE_BLOB.
+         */
+        *value = -1; /* reads as 0xFFFFFFFF: no shared-memory region */
+        return true;
+    case _(SHMBaseLow):
+    case _(SHMBaseHigh):
+        *value = 0;
+        return true;
+    case _(ConfigGeneration):
+        *value = 0;
+        return true;
+    default:
+        /* Unimplemented common registers, including write-only 'SHMSel',
+         * intentionally fault instead of returning placeholder values.
+         * TODO: Implement 'QueueReset' when advertising VIRTIO_F_RING_RESET.
+         */
+        if (!RANGE_CHECK(addr, _(Config), sizeof(struct virtio_gpu_config)))
+            return false;
+
+        /* Read configuration from the corresponding register */
+        uint32_t offset = (addr - _(Config)) << 2; /* word index -> bytes */
+        switch (offset) {
+        case offsetof(struct virtio_gpu_config, events_read): {
+            *value = 0; /* No event is implemented currently */
+            return true;
+        }
+        case offsetof(struct virtio_gpu_config, num_scanouts): {
+            *value = PRIV(vgpu)->num_scanouts;
+            return true;
+        }
+        case offsetof(struct virtio_gpu_config, num_capsets): {
+            /* TODO: Return virgl capsets after implementing the corresponding
+             * 3D command backend. Zero capsets keeps guests on the 2D path.
+             */
+            *value = 0;
+            return true;
+        }
+        default:
+            return false; /* e.g. 'events_clear' is not readable here */
+        }
+    }
+#undef _
+}
+
+void virtio_gpu_read(hart_t *vm,
+                     virtio_gpu_state_t *vgpu,
+                     uint32_t addr,
+                     uint8_t width,
+                     uint32_t *value)
+{
+    /* Registers are exposed strictly as 32-bit words: byte and halfword
+     * loads are refused rather than emulated as partial register reads.
+     */
+    switch (width) {
+    case RV_MEM_LB:
+    case RV_MEM_LBU:
+    case RV_MEM_LH:
+    case RV_MEM_LHU:
+        vm_set_exception(vm, RV_EXC_LOAD_MISALIGN, vm->exc_val);
+        return;
+    case RV_MEM_LW:
+        if (virtio_gpu_reg_read(vgpu, addr >> 2, value))
+            return;
+        vm_set_exception(vm, RV_EXC_LOAD_FAULT, vm->exc_val);
+        return;
+    default:
+        vm_set_exception(vm, RV_EXC_ILLEGAL_INSN, 0);
+        return;
+    }
+}
+
+/* After 'QueueReady' is set, 'QueueNum' and the ring address registers have
+ * already been validated and may be consumed by the device. Reject later
+ * writes to that virtqueue configuration instead of letting the guest change
+ * it under the running queue.
+ */
+static bool virtio_gpu_vq_config_after_ready(virtio_gpu_state_t *vgpu,
+                                             uint32_t addr)
+{
+    /* Only the ring-geometry registers are frozen, and only for as long as
+     * the queue addressed by 'VIRTIO_GPU_QUEUE' is marked ready.
+     */
+#define _(reg) VIRTIO_##reg
+    switch (addr) {
+    case _(QueueNum):
+    case _(QueueDescLow):
+    case _(QueueDescHigh):
+    case _(QueueDriverLow):
+    case _(QueueDriverHigh):
+    case _(QueueDeviceLow):
+    case _(QueueDeviceHigh):
+        return VIRTIO_GPU_QUEUE.ready;
+    default:
+        return false;
+    }
+#undef _
+}
+
+static bool virtio_gpu_reg_write(virtio_gpu_state_t *vgpu,
+                                 uint32_t addr,
+                                 uint32_t value)
+{
+#define _(reg) VIRTIO_##reg
+    if (virtio_gpu_vq_config_after_ready(vgpu, addr)) {
+        virtio_gpu_set_fail(vgpu);
+        return true; /* write is consumed: the caller raises no store fault */
+    }
+
+    switch (addr) { /* 'addr' is a word index (byte offset >> 2) */
+    case _(DeviceFeaturesSel):
+        vgpu->DeviceFeaturesSel = value;
+        return true;
+    case _(DriverFeatures):
+        if (vgpu->DriverFeaturesSel == 0) /* only feature word 0 is kept */
+            vgpu->DriverFeatures = value;
+        return true;
+    case _(DriverFeaturesSel):
+        vgpu->DriverFeaturesSel = value;
+        return true;
+    case _(QueueSel):
+        if (value < ARRAY_SIZE(vgpu->queues))
+            vgpu->QueueSel = value;
+        else
+            virtio_gpu_set_fail(vgpu);
+        return true;
+    case _(QueueNum):
+        if (value > 0 && value <= VIRTIO_GPU_QUEUE_NUM_MAX)
+            VIRTIO_GPU_QUEUE.QueueNum = value;
+        else
+            virtio_gpu_set_fail(vgpu);
+        return true;
+    case _(QueueReady):
+        VIRTIO_GPU_QUEUE.ready = value & 1; /* reverted below on failure */
+        if (value & 1) {
+            /* Validate that the full rings fit in guest RAM before allowing
+             * the queue to go live. 'virtio_gpu_preprocess()' only checked the
+             * base addresses. Here we verify the end of each ring region.
+             * All addresses are word indices (byte address >> 2).
+             *
+             * These sizes assume 'VIRTIO_F_EVENT_IDX' is not negotiated. We
+             * never advertise it (see 'DeviceFeatures'), so neither
+             * 'avail.used_event' nor 'used.avail_event' exist. If that flag is
+             * ever added, both end calculations need an extra word for the
+             * trailing '*_event' field.
+             */
+            uint32_t qnum = VIRTIO_GPU_QUEUE.QueueNum;
+            uint32_t ram_words = RAM_SIZE / sizeof(uint32_t);
+
+            /* Desc table: 'QueueNum' entries * 4 words each. */
+            uint32_t desc_end = VIRTIO_GPU_QUEUE.QueueDesc + qnum * 4;
+            /* Avail ring: one word for 'flags' + 'idx', then
+             * ceil('QueueNum' / 2) words for 16-bit descriptor indexes.
+             */
+            uint32_t avail_end =
+                VIRTIO_GPU_QUEUE.QueueAvail + 1 + (qnum + 1) / 2;
+            /* Used ring: one word for 'flags' + 'idx', then 'QueueNum'
+             * entries of 'struct virtq_used_elem' (2 words each).
+             */
+            uint32_t used_end = VIRTIO_GPU_QUEUE.QueueUsed + 1 + qnum * 2;
+
+            if (!qnum || desc_end > ram_words || avail_end > ram_words ||
+                used_end > ram_words) {
+                VIRTIO_GPU_QUEUE.ready = false;
+                virtio_gpu_set_fail(vgpu);
+                return true;
+            }
+            VIRTIO_GPU_QUEUE.last_avail =
+                vgpu->ram[VIRTIO_GPU_QUEUE.QueueAvail] >> 16; /* avail.idx */
+        }
+        return true;
+    case _(QueueDescLow):
+        VIRTIO_GPU_QUEUE.QueueDesc = virtio_gpu_preprocess(vgpu, value);
+        return true;
+    case _(QueueDescHigh):
+        if (value) /* non-zero upper address word is rejected */
+            virtio_gpu_set_fail(vgpu);
+        return true;
+    case _(QueueDriverLow):
+        VIRTIO_GPU_QUEUE.QueueAvail = virtio_gpu_preprocess(vgpu, value);
+        return true;
+    case _(QueueDriverHigh):
+        if (value) /* non-zero upper address word is rejected */
+            virtio_gpu_set_fail(vgpu);
+        return true;
+    case _(QueueDeviceLow):
+        VIRTIO_GPU_QUEUE.QueueUsed = virtio_gpu_preprocess(vgpu, value);
+        return true;
+    case _(QueueDeviceHigh):
+        if (value) /* non-zero upper address word is rejected */
+            virtio_gpu_set_fail(vgpu);
+        return true;
+    case _(QueueNotify):
+        if (value < ARRAY_SIZE(vgpu->queues))
+            virtio_gpu_queue_notify_handler(vgpu, value);
+        else
+            virtio_gpu_set_fail(vgpu);
+        return true;
+    case _(InterruptACK):
+        vgpu->InterruptStatus &= ~value; /* clear the acknowledged bits */
+        return true;
+    case _(Status):
+        virtio_gpu_update_status(vgpu, value);
+        return true;
+    case _(SHMSel):
+        /* No shared-memory regions are advertised, so the selector is accepted
+         * and ignored.
+         */
+        return true;
+    default:
+        /* Unsupported writes fault instead of updating unknown state.
+         * TODO: Implement 'QueueReset' when advertising VIRTIO_F_RING_RESET.
+         */
+        if (!RANGE_CHECK(addr, _(Config), sizeof(struct virtio_gpu_config)))
+            return false;
+
+        /* Write configuration to the corresponding register */
+        uint32_t offset = (addr - _(Config)) << 2; /* word index -> bytes */
+        switch (offset) {
+        case offsetof(struct virtio_gpu_config, events_clear): {
+            /* Ignored, no event is implemented currently */
+            return true;
+        }
+        default:
+            return false; /* every other config field rejects writes */
+        }
+    }
+#undef _
+}
+
+void virtio_gpu_write(hart_t *vm,
+                      virtio_gpu_state_t *vgpu,
+                      uint32_t addr,
+                      uint8_t width,
+                      uint32_t value)
+{
+    /* Stores follow the same rule as loads: only 32-bit word stores reach
+     * the register block, and byte or halfword stores are refused.
+     */
+    switch (width) {
+    case RV_MEM_SB:
+    case RV_MEM_SH:
+        vm_set_exception(vm, RV_EXC_STORE_MISALIGN, vm->exc_val);
+        return;
+    case RV_MEM_SW:
+        if (virtio_gpu_reg_write(vgpu, addr >> 2, value))
+            return;
+        vm_set_exception(vm, RV_EXC_STORE_FAULT, vm->exc_val);
+        return;
+    default:
+        vm_set_exception(vm, RV_EXC_ILLEGAL_INSN, 0);
+        return;
+    }
+}
+
+void virtio_gpu_init(virtio_gpu_state_t *vgpu)
+{
+    /* Set by the first call; any later call is a fatal setup error. */
+    static bool instance_claimed = false;
+    if (instance_claimed) {
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX
+                "%s(): only one virtio-gpu instance is supported\n",
+                __func__);
+        exit(EXIT_FAILURE);
+    }
+
+    instance_claimed = true;
+    vgpu->priv = &virtio_gpu_data;
+}
+
+uint32_t virtio_gpu_register_scanout(virtio_gpu_state_t *vgpu,
+                                     uint32_t width,
+                                     uint32_t height)
+{
+    int scanout_num = PRIV(vgpu)->num_scanouts;
+    if (scanout_num >= VIRTIO_GPU_MAX_SCANOUTS) {
+        /* Registration only happens at init time today, so exiting is fine.
+         * Return an error instead if it becomes dynamic or guest-triggered.
+         */
+        fprintf(stderr,
+                VIRTIO_GPU_LOG_PREFIX "%s(): exceeded scanout maximum number\n",
+                __func__);
+        exit(EXIT_FAILURE);
+    }
+
+    struct virtio_gpu_scanout_info *slot = &PRIV(vgpu)->scanouts[scanout_num];
+    slot->width = width;
+    slot->height = height;
+    slot->enabled = 1;
+    slot->primary_resource_id = 0;
+    slot->cursor_resource_id = 0;
+    slot->src_x = 0;
+    slot->src_y = 0;
+    slot->src_w = 0;
+    slot->src_h = 0;
+
+    /* The slot index doubles as the guest-visible 'scanout_id'. See
+     * 'virtio_gpu_get_display_info_handler()' for how that index is exposed
+     * to the guest and later reused in 'SET_SCANOUT'/'GET_EDID'.
+     */
+    PRIV(vgpu)->num_scanouts++;
+    return (uint32_t) scanout_num;
+}
diff --git a/virtio-gpu.h b/virtio-gpu.h
new file mode 100644
index 00000000..12cde627
--- /dev/null
+++ b/virtio-gpu.h
@@ -0,0 +1,388 @@
+#pragma once
+
+#if !SEMU_HAS(VIRTIOGPU)
+#error Only valid when Virtio-GPU is enabled.
+#endif
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "device.h"
+#include "virtio.h"
+
+#define VIRTIO_GPU_MAX_SCANOUTS 16
+#define VIRTIO_GPU_LOG_PREFIX "[SEMU VGPU] "
+#define VIRTIO_GPU_CMD_UNDEF virtio_gpu_cmd_undefined_handler
+#define VIRTIO_GPU_FLAG_FENCE (1 << 0)
+
+/* Maximum descriptor chain length accepted by 'virtio_gpu_desc_handler()'.
+ *
+ * semu follows the common Linux virtio-gpu control queue shape in
+ * 'virtio_gpu_queue_fenced_ctrl_buffer()': 'sgs[3]' holds 'vcmd' (request),
+ * optional 'vout' (command data, e.g. 'RESOURCE_ATTACH_BACKING' entries), and
+ * optional 'vresp' (response). The supported commands therefore fit in "request
+ * + one data segment + response".
+ *
+ * This is not a general virtio-gpu descriptor-chain limit. Linux allocates the
+ * backing-entry array in 'virtio_gpu_object_shmem_init()' with
+ * 'kvmalloc_objs()'. If that buffer falls back to 'vmalloc',
+ * 'virtio_gpu_queue_fenced_ctrl_buffer()' detects it with 'is_vmalloc_addr()'
+ * and 'vmalloc_to_sgt()' expands 'vout' into multiple scatter-gather entries.
+ *
+ * Supporting that path would require accepting a longer descriptor chain and
+ * auditing every handler that indexes 'vq_desc[]'. Longer chains are rejected.
+ * The current response-descriptor lookup is also part of this fixed-shape
+ * parser: it scans the zero-initialized 3-entry array, not an arbitrary
+ * guest-provided scatter-gather chain.
+ *
+ * TODO: Support generic descriptor-chain parsing.
+ */
+#define VIRTIO_GPU_MAX_DESC 3
+
+/* Core per-scanout metadata keyed by the guest-visible 'scanout_id'. This
+ * combines guest-visible display info ('width'/'height'/'enabled') with the
+ * current primary/cursor resource bindings.
+ */
+struct virtio_gpu_scanout_info {
+    uint32_t width, height; /* size given to virtio_gpu_register_scanout() */
+    uint32_t enabled;       /* set to 1 at registration */
+    uint32_t primary_resource_id; /* 0 at registration */
+    uint32_t cursor_resource_id;  /* 0 at registration */
+    uint32_t src_x, src_y, src_w, src_h; /* all 0 at registration */
+};
+
+typedef struct {
+    struct virtio_gpu_scanout_info scanouts[VIRTIO_GPU_MAX_SCANOUTS];
+    uint32_t num_scanouts; /* count of registered entries in 'scanouts' */
+} virtio_gpu_data_t;
+
+PACKED(struct virtio_gpu_config {
+    uint32_t events_read;  /* reads as 0: no event is implemented */
+    uint32_t events_clear; /* writes are accepted and ignored */
+    uint32_t num_scanouts; /* reads back the registered scanout count */
+    uint32_t num_capsets;  /* reads as 0: no capsets are exposed */
+});
+
+PACKED(struct virtio_gpu_ctrl_hdr {
+    uint32_t type;      /* an 'enum virtio_gpu_ctrl_type' command/response */
+    uint32_t flags;     /* bit flags, e.g. 'VIRTIO_GPU_FLAG_FENCE' */
+    uint64_t fence_id;  /* per virtio spec: used with the FENCE flag */
+    uint32_t ctx_id;
+    uint8_t ring_idx;
+    uint8_t padding[3]; /* rounds the packed header up to 24 bytes */
+});
+
+PACKED(struct virtio_gpu_rect {
+    uint32_t x;
+    uint32_t y;
+    uint32_t width;
+    uint32_t height;
+});
+
+PACKED(struct virtio_gpu_resp_disp_info {
+    struct virtio_gpu_ctrl_hdr hdr;
+    struct virtio_gpu_display_one {
+        struct virtio_gpu_rect r;
+        uint32_t enabled;
+        uint32_t flags;
+    } pmodes[VIRTIO_GPU_MAX_SCANOUTS];
+});
+
+PACKED(struct virtio_gpu_res_create_2d {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t resource_id;
+    uint32_t format;
+    uint32_t width;
+    uint32_t height;
+});
+
+PACKED(struct virtio_gpu_res_unref {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t resource_id;
+    uint32_t padding;
+});
+
+PACKED(struct virtio_gpu_set_scanout {
+    struct virtio_gpu_ctrl_hdr hdr;
+    struct virtio_gpu_rect r;
+    uint32_t scanout_id;
+    uint32_t resource_id;
+});
+
+PACKED(struct virtio_gpu_res_flush {
+    struct virtio_gpu_ctrl_hdr hdr;
+    struct virtio_gpu_rect r;
+    uint32_t resource_id;
+    uint32_t padding;
+});
+
+PACKED(struct virtio_gpu_trans_to_host_2d {
+    struct virtio_gpu_ctrl_hdr hdr;
+    struct virtio_gpu_rect r;
+    uint64_t offset;
+    uint32_t resource_id;
+    uint32_t padding;
+});
+
+PACKED(struct virtio_gpu_res_attach_backing {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t resource_id;
+    uint32_t nr_entries;
+});
+
+PACKED(struct virtio_gpu_res_detach_backing {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t resource_id;
+    uint32_t padding;
+});
+
+PACKED(struct virtio_gpu_mem_entry {
+    uint64_t addr;
+    uint32_t length;
+    uint32_t padding;
+});
+
+PACKED(struct virtio_gpu_cmd_get_edid {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t scanout;
+    uint32_t padding;
+});
+
+PACKED(struct virtio_gpu_resp_edid {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t size;
+    uint32_t padding;
+    char edid[1024];
+});
+
+PACKED(struct virtio_gpu_get_capset_info {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t capset_index;
+    uint32_t padding;
+});
+
+PACKED(struct virtio_gpu_resp_capset_info {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t capset_id;
+    uint32_t capset_max_version;
+    uint32_t capset_max_size;
+    uint32_t padding;
+});
+
+PACKED(struct virtio_gpu_get_capset {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t capset_id;
+    uint32_t capset_version;
+});
+
+PACKED(struct virtio_gpu_resp_capset {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint8_t capset_data[];
+});
+
+PACKED(struct virtio_gpu_ctx_create {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t nlen;
+    uint32_t context_init;
+    char debug_name[64];
+});
+
+PACKED(struct virtio_gpu_cursor_pos {
+    uint32_t scanout_id;
+    uint32_t x;
+    uint32_t y;
+    uint32_t padding;
+});
+
+PACKED(struct virtio_gpu_update_cursor {
+    struct virtio_gpu_ctrl_hdr hdr;
+    struct virtio_gpu_cursor_pos pos;
+    uint32_t resource_id;
+    uint32_t hot_x;
+    uint32_t hot_y;
+    uint32_t padding;
+});
+
+/* clang-format off */
+PACKED(struct virtio_gpu_ctx_destroy {
+    struct virtio_gpu_ctrl_hdr hdr;
+});
+/* clang-format on */
+
+PACKED(struct virtio_gpu_resource_create_3d {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t resource_id;
+    uint32_t target;
+    uint32_t format;
+    uint32_t bind;
+    uint32_t width;
+    uint32_t height;
+    uint32_t depth;
+    uint32_t array_size;
+    uint32_t last_level;
+    uint32_t nr_samples;
+    uint32_t flags;
+    uint32_t padding;
+});
+
+PACKED(struct virtio_gpu_ctx_resource {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t resource_id;
+    uint32_t padding;
+});
+
+PACKED(struct virtio_gpu_box {
+    uint32_t x;
+    uint32_t y;
+    uint32_t z;
+    uint32_t w;
+    uint32_t h;
+    uint32_t d;
+});
+
+PACKED(struct virtio_gpu_transfer_host_3d {
+    struct virtio_gpu_ctrl_hdr hdr;
+    struct virtio_gpu_box box;
+    uint64_t offset;
+    uint32_t resource_id;
+    uint32_t level;
+    uint32_t stride;
+    uint32_t layer_stride;
+});
+
+PACKED(struct virtio_gpu_cmd_submit {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t size;
+    uint32_t num_in_fences;
+});
+
+PACKED(struct virtio_gpu_resp_map_info {
+    struct virtio_gpu_ctrl_hdr hdr;
+    uint32_t map_info;
+    uint32_t padding;
+});
+
+enum virtio_gpu_ctrl_type {
+    /* 2D commands */
+    VIRTIO_GPU_CMD_GET_DISPLAY_INFO = 0x0100,
+    VIRTIO_GPU_CMD_RESOURCE_CREATE_2D,
+    VIRTIO_GPU_CMD_RESOURCE_UNREF,
+    VIRTIO_GPU_CMD_SET_SCANOUT,
+    VIRTIO_GPU_CMD_RESOURCE_FLUSH,
+    VIRTIO_GPU_CMD_TRANSFER_TO_HOST_2D,
+    VIRTIO_GPU_CMD_RESOURCE_ATTACH_BACKING,
+    VIRTIO_GPU_CMD_RESOURCE_DETACH_BACKING,
+    VIRTIO_GPU_CMD_GET_CAPSET_INFO,
+    VIRTIO_GPU_CMD_GET_CAPSET,
+    VIRTIO_GPU_CMD_GET_EDID,
+    VIRTIO_GPU_CMD_RESOURCE_ASSIGN_UUID,
+    VIRTIO_GPU_CMD_RESOURCE_CREATE_BLOB,
+    VIRTIO_GPU_CMD_SET_SCANOUT_BLOB,
+
+    /* 3D commands */
+    VIRTIO_GPU_CMD_CTX_CREATE = 0x0200,
+    VIRTIO_GPU_CMD_CTX_DESTROY,
+    VIRTIO_GPU_CMD_CTX_ATTACH_RESOURCE,
+    VIRTIO_GPU_CMD_CTX_DETACH_RESOURCE,
+    VIRTIO_GPU_CMD_RESOURCE_CREATE_3D,
+    VIRTIO_GPU_CMD_TRANSFER_TO_HOST_3D,
+    VIRTIO_GPU_CMD_TRANSFER_FROM_HOST_3D,
+    VIRTIO_GPU_CMD_SUBMIT_3D,
+    VIRTIO_GPU_CMD_RESOURCE_MAP_BLOB,
+    VIRTIO_GPU_CMD_RESOURCE_UNMAP_BLOB,
+
+    /* Cursor commands */
+    VIRTIO_GPU_CMD_UPDATE_CURSOR = 0x0300,
+    VIRTIO_GPU_CMD_MOVE_CURSOR,
+
+    /* Success responses */
+    VIRTIO_GPU_RESP_OK_NODATA = 0x1100,
+    VIRTIO_GPU_RESP_OK_DISPLAY_INFO,
+    VIRTIO_GPU_RESP_OK_CAPSET_INFO,
+    VIRTIO_GPU_RESP_OK_CAPSET,
+    VIRTIO_GPU_RESP_OK_EDID,
+
+    /* Error responses */
+    VIRTIO_GPU_RESP_ERR_UNSPEC = 0x1200,
+    VIRTIO_GPU_RESP_ERR_OUT_OF_MEMORY,
+    VIRTIO_GPU_RESP_ERR_INVALID_SCANOUT_ID,
+    VIRTIO_GPU_RESP_ERR_INVALID_RESOURCE_ID,
+    VIRTIO_GPU_RESP_ERR_INVALID_CONTEXT_ID,
+    VIRTIO_GPU_RESP_ERR_INVALID_PARAMETER,
+};
+
+enum virtio_gpu_formats {
+    VIRTIO_GPU_FORMAT_B8G8R8A8_UNORM = 1,
+    VIRTIO_GPU_FORMAT_B8G8R8X8_UNORM = 2,
+    VIRTIO_GPU_FORMAT_A8R8G8B8_UNORM = 3,
+    VIRTIO_GPU_FORMAT_X8R8G8B8_UNORM = 4,
+    VIRTIO_GPU_FORMAT_R8G8B8A8_UNORM = 67,
+    VIRTIO_GPU_FORMAT_X8B8G8R8_UNORM = 68,
+    VIRTIO_GPU_FORMAT_A8B8G8R8_UNORM = 121,
+    VIRTIO_GPU_FORMAT_R8G8B8X8_UNORM = 134
+};
+
+typedef void (*virtio_gpu_cmd_func)(virtio_gpu_state_t *vgpu,
+                                    struct virtq_desc *vq_desc,
+                                    uint32_t *plen);
+typedef void (*virtio_gpu_backend_lifecycle_func)(virtio_gpu_state_t *vgpu);
+
+struct virtio_gpu_cmd_backend {
+    virtio_gpu_backend_lifecycle_func reset; /* optional; run on device reset */
+    /* 2D commands */
+    virtio_gpu_cmd_func get_display_info;
+    virtio_gpu_cmd_func resource_create_2d;
+    virtio_gpu_cmd_func resource_unref;
+    virtio_gpu_cmd_func set_scanout;
+    virtio_gpu_cmd_func resource_flush;
+    virtio_gpu_cmd_func transfer_to_host_2d;
+    virtio_gpu_cmd_func resource_attach_backing;
+    virtio_gpu_cmd_func resource_detach_backing;
+    virtio_gpu_cmd_func get_capset_info;
+    virtio_gpu_cmd_func get_capset;
+    virtio_gpu_cmd_func get_edid;
+    virtio_gpu_cmd_func resource_assign_uuid;
+    virtio_gpu_cmd_func resource_create_blob;
+    virtio_gpu_cmd_func set_scanout_blob;
+    /* 3D commands */
+    virtio_gpu_cmd_func ctx_create;
+    virtio_gpu_cmd_func ctx_destroy;
+    virtio_gpu_cmd_func ctx_attach_resource;
+    virtio_gpu_cmd_func ctx_detach_resource;
+    virtio_gpu_cmd_func resource_create_3d;
+    virtio_gpu_cmd_func transfer_to_host_3d;
+    virtio_gpu_cmd_func transfer_from_host_3d;
+    virtio_gpu_cmd_func submit_3d;
+    virtio_gpu_cmd_func resource_map_blob;
+    virtio_gpu_cmd_func resource_unmap_blob;
+    /* Cursor commands */
+    virtio_gpu_cmd_func update_cursor;
+    virtio_gpu_cmd_func move_cursor;
+};
+
+void *virtio_gpu_mem_guest_to_host(virtio_gpu_state_t *vgpu,
+                                   uint32_t addr,
+                                   uint32_t size);
+void *virtio_gpu_get_request(virtio_gpu_state_t *vgpu,
+                             struct virtq_desc *vq_desc,
+                             size_t request_size);
+int virtio_gpu_get_response_desc(struct virtq_desc *vq_desc,
+                                 int max_desc,
+                                 size_t response_size);
+uint32_t virtio_gpu_write_ctrl_response(
+    virtio_gpu_state_t *vgpu,
+    const struct virtio_gpu_ctrl_hdr *request,
+    const struct virtq_desc *response_desc,
+    uint32_t type);
+
+void virtio_gpu_set_fail(virtio_gpu_state_t *vgpu);
+
+void virtio_gpu_get_display_info_handler(virtio_gpu_state_t *vgpu,
+                                         struct virtq_desc *vq_desc,
+                                         uint32_t *plen);
+void virtio_gpu_get_edid_handler(virtio_gpu_state_t *vgpu,
+                                 struct virtq_desc *vq_desc,
+                                 uint32_t *plen);
+void virtio_gpu_cmd_undefined_handler(virtio_gpu_state_t *vgpu,
+                                      struct virtq_desc *vq_desc,
+                                      uint32_t *plen);
diff --git a/window-sw.c b/window-sw.c
index 3d464034..7e9fdfdc 100644
--- a/window-sw.c
+++ b/window-sw.c
@@ -1,23 +1,68 @@
 #include <SDL.h>
+#include <inttypes.h>
+#include <limits.h>
 #include <stdbool.h>
+#include <stdint.h>
 #include <stdio.h>
-#include <stdlib.h>
+#include <string.h>
 #include <unistd.h>
 
-#include "device.h"
-#include "feature.h"
+#if SEMU_HAS(VIRTIOGPU)
+#include "vgpu-display.h"
+#include "virtio-gpu.h"
+#endif
+#if SEMU_HAS(VIRTIOINPUT)
 #include "virtio-input-event.h"
+#endif
 #include "window.h"
 
-static SDL_Window *sdl_window;
+#define WINDOW_LOG_PREFIX "[SEMU WINDOW] "
+
 static int wake_write_fd = -1;
+static bool sdl_initialized = false;
 static bool headless_mode = false;
-static bool mouse_grabbed = false;
 static bool should_exit = false;
 
-/* The backend only needs the pipe's write end. The emulator owns the read end
- * and drains it after poll() returns.
+#if SEMU_HAS(VIRTIOINPUT)
+static bool mouse_grabbed = false;
+static SDL_Window *sdl_input_window;
+#else
+#define SDL_EVENT_WAIT_TIMEOUT_MS 1 /* ms */
+#define SDL_EVENT_BURST_LIMIT 64U
+#endif
+
+#if SEMU_HAS(VIRTIOGPU)
+/* SDL-owned retained state for a single plane. Textures live only on the SDL
+ * thread and are updated from immutable CPU-frame display resources.
  */
+struct sdl_plane_info {
+    uint32_t width;
+    uint32_t height;
+    uint32_t sdl_format;  /* presumably an SDL_PIXELFORMAT_* value; confirm */
+    bool alpha_blend;     /* kept across sdl_plane_info_reset() */
+    SDL_Texture *texture; /* destroyed by reset/cleanup when non-NULL */
+};
+
+/* SDL-owned retained state for one scanout. 'window_init_sw()' creates the
+ * window/renderer, then 'window_drain_display_queue()' updates the primary and
+ * cursor planes from queued display payloads before rendering them.
+ */
+struct sdl_scanout_info {
+    struct sdl_plane_info primary_plane; /* freed by scanout cleanup */
+    struct sdl_plane_info cursor_plane;  /* freed by scanout cleanup */
+    SDL_Rect cursor_rect;
+    uint32_t cursor_hot_x;
+    uint32_t cursor_hot_y;
+    uint32_t window_width;
+    uint32_t window_height;
+
+    SDL_Window *window;
+    SDL_Renderer *renderer; /* destroyed by scanout cleanup when non-NULL */
+};
+
+static struct sdl_scanout_info sdl_scanouts[VIRTIO_GPU_MAX_SCANOUTS];
+#endif
+
 static void window_set_wake_fd_sw(int fd)
 {
     wake_write_fd = fd;
@@ -35,13 +80,13 @@ static void window_wake_backend_sw(void)
     }
 }
 
-static inline void window_shutdown_sw(void)
+static void window_shutdown_sw(void)
 {
     /* Both user-driven close and emulator-driven shutdown funnel through the
      * same flag so the main thread and emulator thread observe one exit state.
      */
     __atomic_store_n(&should_exit, true, __ATOMIC_RELAXED);
-    /* Unblock any poll(-1) in the SMP emulator loop immediately. */
+    /* Unblock any 'poll(-1)' in the SMP emulator loop immediately. */
     window_wake_backend_sw();
 }
 
@@ -50,13 +95,14 @@ static bool window_is_closed_sw(void)
     return __atomic_load_n(&should_exit, __ATOMIC_RELAXED);
 }
 
+#if SEMU_HAS(VIRTIOINPUT)
 /* Main-thread-only helper for relative-pointer devices. SDL's grab and
  * relative mouse APIs are part of the windowing backend, so callers use this
  * to switch between normal host-pointer mode and guest-directed mouse mode.
  */
 static void window_set_mouse_grab_sw(bool grabbed)
 {
-    if (headless_mode || !sdl_window) {
+    if (headless_mode || !sdl_input_window) {
         mouse_grabbed = false;
         return;
     }
@@ -72,10 +118,10 @@ static void window_set_mouse_grab_sw(bool grabbed)
                     SDL_GetError());
             return;
         }
-        SDL_SetWindowGrab(sdl_window, SDL_TRUE);
+        SDL_SetWindowGrab(sdl_input_window, SDL_TRUE);
         SDL_ShowCursor(SDL_DISABLE);
     } else {
-        SDL_SetWindowGrab(sdl_window, SDL_FALSE);
+        SDL_SetWindowGrab(sdl_input_window, SDL_FALSE);
         SDL_SetRelativeMouseMode(SDL_FALSE);
         SDL_ShowCursor(SDL_ENABLE);
     }
@@ -87,13 +133,335 @@ static bool window_is_mouse_grabbed_sw(void)
 {
     return mouse_grabbed;
 }
+#endif
+
+#if SEMU_HAS(VIRTIOGPU)
+/* Translate a VirtIO-GPU resource format into the SDL pixel format used for
+ * its texture. On a known format the result is stored in '*sdl_format' and
+ * true is returned; otherwise '*sdl_format' is left untouched and false is
+ * returned.
+ */
+static bool vgpu_format_to_sdl_format(enum virtio_gpu_formats virtio_gpu_format,
+                                      uint32_t *sdl_format)
+{
+    static const struct {
+        enum virtio_gpu_formats vgpu;
+        uint32_t sdl;
+    } format_map[] = {
+        {VIRTIO_GPU_FORMAT_B8G8R8A8_UNORM, SDL_PIXELFORMAT_ARGB8888},
+        {VIRTIO_GPU_FORMAT_B8G8R8X8_UNORM, SDL_PIXELFORMAT_XRGB8888},
+        {VIRTIO_GPU_FORMAT_A8R8G8B8_UNORM, SDL_PIXELFORMAT_BGRA8888},
+        {VIRTIO_GPU_FORMAT_X8R8G8B8_UNORM, SDL_PIXELFORMAT_BGRX8888},
+        {VIRTIO_GPU_FORMAT_R8G8B8A8_UNORM, SDL_PIXELFORMAT_ABGR8888},
+        {VIRTIO_GPU_FORMAT_X8B8G8R8_UNORM, SDL_PIXELFORMAT_RGBX8888},
+        {VIRTIO_GPU_FORMAT_A8B8G8R8_UNORM, SDL_PIXELFORMAT_RGBA8888},
+        {VIRTIO_GPU_FORMAT_R8G8B8X8_UNORM, SDL_PIXELFORMAT_XBGR8888},
+    };
+
+    /* Linear scan: the table is tiny and every entry is unique. */
+    for (size_t i = 0; i < sizeof(format_map) / sizeof(format_map[0]); i++) {
+        if (format_map[i].vgpu == virtio_gpu_format) {
+            *sdl_format = format_map[i].sdl;
+            return true;
+        }
+    }
+    /* Unknown or unsupported format. */
+    return false;
+}
+
+/* Destroy the plane's texture and zero its state, keeping 'alpha_blend'. */
+static void sdl_plane_info_reset(struct sdl_plane_info *plane)
+{
+    const bool keep_alpha = plane->alpha_blend;
+    if (plane->texture != NULL)
+        SDL_DestroyTexture(plane->texture);
+    *plane = (struct sdl_plane_info){.alpha_blend = keep_alpha};
+}
+
+static void sdl_plane_info_cleanup(struct sdl_plane_info *plane)
+{
+    if (plane->texture != NULL) /* the plane may hold no texture */
+        SDL_DestroyTexture(plane->texture);
+    memset(plane, 0, sizeof *plane); /* unlike reset, drops 'alpha_blend' */
+}
+
+/* Free one scanout's textures, renderer and window, then zero the record. */
+static void sdl_scanout_info_cleanup(struct sdl_scanout_info *scanout)
+{
+    sdl_plane_info_cleanup(&scanout->primary_plane);
+    sdl_plane_info_cleanup(&scanout->cursor_plane);
+    if (scanout->renderer != NULL)
+        SDL_DestroyRenderer(scanout->renderer);
+    if (scanout->window != NULL)
+        SDL_DestroyWindow(scanout->window);
+
+    memset(scanout, 0, sizeof *scanout);
+}
+
+/* Choose the SDL pixel format for an incoming plane update.
+ *
+ * The payload's VirtIO-GPU format is translated first. Planes flagged
+ * 'alpha_blend' then have any X-padded result promoted to the matching
+ * alpha-carrying format. The result is stored in '*sdl_format'.
+ *
+ * Returns false, after logging, when the payload format has no SDL
+ * equivalent; returns true otherwise.
+ */
+static bool sdl_plane_info_get_sdl_format(
+    const struct sdl_plane_info *plane,
+    const struct vgpu_display_payload *payload,
+    uint32_t *sdl_format)
+{
+    /* SDL objects on the plane persist across frames, but the format travels
+     * with each payload, so it is resolved again on every update.
+     */
+    if (!vgpu_format_to_sdl_format(payload->cpu.format, sdl_format)) {
+        fprintf(stderr, "%s(): invalid resource format %u\n", __func__,
+                (uint32_t) payload->cpu.format);
+        return false;
+    }
+
+    /* Opaque planes use the translated format unchanged. */
+    if (!plane->alpha_blend)
+        return true;
+
+    /* Promote X-padded formats so the padding byte is read as alpha instead
+     * of being ignored; formats that already carry alpha fall through.
+     */
+    if (*sdl_format == SDL_PIXELFORMAT_XRGB8888)
+        *sdl_format = SDL_PIXELFORMAT_ARGB8888;
+    else if (*sdl_format == SDL_PIXELFORMAT_BGRX8888)
+        *sdl_format = SDL_PIXELFORMAT_BGRA8888;
+    else if (*sdl_format == SDL_PIXELFORMAT_RGBX8888)
+        *sdl_format = SDL_PIXELFORMAT_RGBA8888;
+    else if (*sdl_format == SDL_PIXELFORMAT_XBGR8888)
+        *sdl_format = SDL_PIXELFORMAT_ABGR8888;
+
+    return true;
+}
+
+/* Create a streaming texture matching 'frame' and 'sdl_format'; returns NULL
+ * after logging on failure. Blend-mode setup for alpha planes is best-effort.
+ */
+static SDL_Texture *sdl_plane_info_create_texture(
+    SDL_Renderer *renderer,
+    const struct sdl_plane_info *plane,
+    const struct vgpu_display_cpu_payload *frame,
+    uint32_t sdl_format)
+{
+    SDL_Texture *tex =
+        SDL_CreateTexture(renderer, sdl_format, SDL_TEXTUREACCESS_STREAMING,
+                          frame->width, frame->height);
+    if (tex == NULL) {
+        fprintf(stderr, "%s(): failed to create texture: %s\n", __func__,
+                SDL_GetError());
+        return NULL;
+    }
+
+    if (plane->alpha_blend &&
+        SDL_SetTextureBlendMode(tex, SDL_BLENDMODE_BLEND) < 0)
+        fprintf(stderr, "%s(): failed to enable texture blending: %s\n",
+                __func__, SDL_GetError());
+    return tex;
+}
+
+/* Upload the payload's pixels into the plane's texture, reusing the current
+ * texture when size and format match. A replacement texture is swapped in
+ * only after the upload succeeds, so a failure leaves the plane untouched.
+ */
+static bool sdl_plane_info_update_texture(
+    SDL_Renderer *renderer,
+    struct sdl_plane_info *plane,
+    const struct vgpu_display_payload *payload,
+    const char *plane_name)
+{
+    const struct vgpu_display_cpu_payload *frame = &payload->cpu;
+    uint32_t fmt;
+    if (!sdl_plane_info_get_sdl_format(plane, payload, &fmt))
+        return false;
+
+    const bool fresh = !plane->texture || plane->width != frame->width ||
+                       plane->height != frame->height ||
+                       plane->sdl_format != fmt;
+    SDL_Texture *target = plane->texture;
+    if (fresh) {
+        target = sdl_plane_info_create_texture(renderer, plane, frame, fmt);
+        if (!target)
+            return false;
+    }
+
+    if (SDL_UpdateTexture(target, NULL, frame->pixels, frame->stride) != 0) {
+        fprintf(stderr, "%s(): failed to update %s texture: %s\n", __func__,
+                plane_name, SDL_GetError());
+        if (fresh)
+            SDL_DestroyTexture(target);
+        return false;
+    }
+
+    /* Commit: retire the old texture only now that the new one is filled. */
+    if (fresh) {
+        if (plane->texture)
+            SDL_DestroyTexture(plane->texture);
+        plane->texture = target;
+    }
+    plane->width = frame->width;
+    plane->height = frame->height;
+    plane->sdl_format = fmt;
+    return true;
+}
+
+/* Position 'rect' so the hotspot lands on (x, y); false if out of range. */
+static bool sdl_cursor_rect_update_position(SDL_Rect *rect,
+                                            int32_t x,
+                                            int32_t y,
+                                            uint32_t hot_x,
+                                            uint32_t hot_y)
+{
+    const int64_t left = (int64_t) x - (int64_t) hot_x;
+    const int64_t top = (int64_t) y - (int64_t) hot_y;
+    const bool fits = left >= INT_MIN && left <= INT_MAX && top >= INT_MIN &&
+                      top <= INT_MAX;
+    if (fits) {
+        rect->x = (int) left;
+        rect->y = (int) top;
+        return true;
+    }
+    fprintf(stderr,
+            WINDOW_LOG_PREFIX
+            "%s(): cursor position out of SDL range "
+            "(x=%" PRId32 " y=%" PRId32 " hot_x=%u hot_y=%u)\n",
+            __func__, x, y, (unsigned) hot_x, (unsigned) hot_y);
+    return false;
+}
+
+/* Install a new cursor image, position and hotspot; all-or-nothing. */
+static bool sdl_scanout_apply_cursor_frame(
+    struct sdl_scanout_info *scanout,
+    const struct vgpu_display_payload *payload,
+    int32_t x,
+    int32_t y,
+    uint32_t hot_x,
+    uint32_t hot_y)
+{
+    const struct vgpu_display_cpu_payload *frame = &payload->cpu;
+    struct sdl_plane_info *plane = &scanout->cursor_plane;
+    SDL_Rect new_cursor_rect = scanout->cursor_rect;
+
+    if (frame->width > INT_MAX || frame->height > INT_MAX) {
+        fprintf(stderr,
+                WINDOW_LOG_PREFIX
+                "%s(): cursor size out of SDL range (%ux%u)\n",
+                __func__, (unsigned) frame->width, (unsigned) frame->height);
+        return false;
+    }
+    if (!sdl_cursor_rect_update_position(&new_cursor_rect, x, y, hot_x, hot_y))
+        return false;
+    if (!sdl_plane_info_update_texture(scanout->renderer, plane, payload,
+                                       "cursor"))
+        return false;
+
+    /* Every fallible step succeeded: publish the staged cursor state. */
+    new_cursor_rect.w = (int) frame->width;
+    new_cursor_rect.h = (int) frame->height;
+    scanout->cursor_rect = new_cursor_rect;
+    scanout->cursor_hot_x = hot_x;
+    scanout->cursor_hot_y = hot_y;
+    return true;
+}
+
+/* Clear, draw the primary plane, overlay the cursor plane, then present. */
+static void sdl_scanout_render(const struct sdl_scanout_info *scanout)
+{
+    SDL_Renderer *r = scanout->renderer;
+    SDL_Texture *primary = scanout->primary_plane.texture;
+    SDL_Texture *cursor = scanout->cursor_plane.texture;
+
+    SDL_RenderClear(r);
+    if (primary != NULL)
+        SDL_RenderCopy(r, primary, NULL, NULL);
+    if (cursor != NULL)
+        SDL_RenderCopy(r, cursor, NULL, &scanout->cursor_rect);
+    SDL_RenderPresent(r);
+}
+
+/* Apply every queued display-bridge command to SDL-owned state, then redraw
+ * each scanout that changed exactly once. Commands aimed at a scanout with
+ * no window or renderer are released without being applied.
+ */
+static void window_drain_display_queue(void)
+{
+    bool needs_redraw[VIRTIO_GPU_MAX_SCANOUTS] = {0};
+    struct vgpu_display_cmd cmd;
+
+    /* The loop's step expression releases the command, so every path through
+     * the body, 'continue' included, hands it back exactly once.
+     */
+    for (; vgpu_display_pop_cmd(&cmd); vgpu_display_release_cmd(&cmd)) {
+        /* The guest-facing backend validated 'scanout_id' before the command
+         * entered the display bridge, so it indexes the tables directly.
+         */
+        struct sdl_scanout_info *scanout = &sdl_scanouts[cmd.scanout_id];
+        bool *dirty = &needs_redraw[cmd.scanout_id];
+        if (!(scanout->window && scanout->renderer))
+            continue;
+
+        switch (cmd.type) {
+        case VGPU_DISPLAY_CMD_PRIMARY_CLEAR:
+            sdl_plane_info_reset(&scanout->primary_plane);
+            *dirty = true;
+            break;
+        case VGPU_DISPLAY_CMD_CURSOR_CLEAR:
+            scanout->cursor_rect = (SDL_Rect){0};
+            scanout->cursor_hot_x = 0;
+            scanout->cursor_hot_y = 0;
+            sdl_plane_info_reset(&scanout->cursor_plane);
+            *dirty = true;
+            break;
+        case VGPU_DISPLAY_CMD_PRIMARY_SET:
+            /* Only ever raise the flag: a failed upload keeps the old texture
+             * on screen and must not cancel an earlier redraw request.
+             */
+            if (sdl_plane_info_update_texture(
+                    scanout->renderer, &scanout->primary_plane,
+                    cmd.u.primary_set.payload, "primary"))
+                *dirty = true;
+            break;
+        case VGPU_DISPLAY_CMD_CURSOR_SET:
+            /* Only ever raise the flag: a failed upload keeps the old cursor
+             * on screen and must not cancel an earlier redraw request.
+             */
+            if (sdl_scanout_apply_cursor_frame(
+                    scanout, cmd.u.cursor_set.payload, cmd.u.cursor_set.x,
+                    cmd.u.cursor_set.y, cmd.u.cursor_set.hot_x,
+                    cmd.u.cursor_set.hot_y))
+                *dirty = true;
+            break;
+        case VGPU_DISPLAY_CMD_CURSOR_MOVE:
+            /* Reposition using the hotspot stored on the scanout. */
+            if (sdl_cursor_rect_update_position(
+                    &scanout->cursor_rect, cmd.u.cursor_move.x,
+                    cmd.u.cursor_move.y, scanout->cursor_hot_x,
+                    scanout->cursor_hot_y))
+                *dirty = true;
+            break;
+        }
+    }
+
+    /* One render per dirty scanout, however many commands touched it. */
+    for (uint32_t i = 0; i < VIRTIO_GPU_MAX_SCANOUTS; i++) {
+        const struct sdl_scanout_info *scanout = &sdl_scanouts[i];
+        if (needs_redraw[i] && scanout->window && scanout->renderer)
+            sdl_scanout_render(scanout);
+    }
+}
+#endif
 
 /* Main loop runs on the main thread */
 static void window_main_loop_sw(void)
 {
     if (headless_mode) {
-        /* Block until the emulator calls window_shutdown_sw(), so main() can
-         * proceed to pthread_join() rather than stopping the emulator
+        /* Block until the emulator calls 'window_shutdown_sw()', so 'main()'
+         * can proceed to 'pthread_join()' rather than stopping the emulator
          * immediately. There is no SDL event loop in this mode, so the main
          * thread just polls the shared close flag.
          */
@@ -103,27 +471,52 @@ static void window_main_loop_sw(void)
     }
 
     /* relaxed ordering is sufficient: the only consequence of reading a stale
-     * false is a few extra loop iterations (each blocked up to 1 ms inside
-     * SDL_WaitEventTimeout). Ordering with the emulator thread is provided by
-     * pthread_join(), not by this flag.
+     * false is a few extra loop iterations. Ordering with the emulator thread
+     * is provided by 'pthread_join()', not by this flag.
      */
     while (!window_is_closed_sw()) {
+#if SEMU_HAS(VIRTIOINPUT)
         if (vinput_handle_events()) {
-            /* User closed the window. Set the flag so window_shutdown_sw()
+            /* User closed the window. Set the flag so 'window_shutdown_sw()'
              * (called from the emulator thread) does not race with us, then
-             * return normally so main() can pthread_join the emulator thread
-             * and collect its exit code.
+             * return normally so 'main()' can 'pthread_join()' the emulator
+             * thread and collect its exit code.
              */
             window_shutdown_sw();
             return;
         }
+#else
+        SDL_Event e;
+        /* Without 'virtio-input', there is no SDL event pump to wake on display
+         * commands. Use a short timeout so 'VIRTIOGPU'-only builds periodically
+         * drain the display bridge; a future SDL user-event bridge could make
+         * this fully event-driven.
+         */
+        if (SDL_WaitEventTimeout(&e, SDL_EVENT_WAIT_TIMEOUT_MS)) {
+            uint32_t processed = 0;
+            do {
+                if (e.type == SDL_QUIT) {
+                    window_shutdown_sw();
+                    return;
+                }
+                processed++;
+            } while (processed < SDL_EVENT_BURST_LIMIT && SDL_PollEvent(&e));
+        }
+#endif
+
+#if SEMU_HAS(VIRTIOGPU)
+        window_drain_display_queue();
+#endif
     }
 }
 
-static void window_init_sw(bool headless)
+static void window_init_sw(bool headless, uint32_t width, uint32_t height)
 {
     if (headless) {
         headless_mode = true;
+#if SEMU_HAS(VIRTIOGPU)
+        vgpu_display_set_unavailable();
+#endif
         return;
     }
 
@@ -133,33 +526,141 @@ static void window_init_sw(bool headless)
                 "Running in headless mode.\n",
                 SDL_GetError());
         headless_mode = true;
+#if SEMU_HAS(VIRTIOGPU)
+        vgpu_display_set_unavailable();
+#endif
         return;
     }
+    sdl_initialized = true;
 
-    sdl_window = SDL_CreateWindow("semu", SDL_WINDOWPOS_UNDEFINED,
-                                  SDL_WINDOWPOS_UNDEFINED, SCREEN_WIDTH,
-                                  SCREEN_HEIGHT, SDL_WINDOW_SHOWN);
-    if (!sdl_window) {
+#if SEMU_HAS(VIRTIOGPU)
+    /* The current machine setup registers exactly one scanout before calling
+     * 'window_init_sw()', so materialize scanout 0 directly here. If semu grows
+     * multiple scanouts later, this can be extended to iterate all registered
+     * scanouts or restored to an explicit per-scanout setup path.
+     */
+    struct sdl_scanout_info *scanout = &sdl_scanouts[0];
+    scanout->window = SDL_CreateWindow("semu", SDL_WINDOWPOS_UNDEFINED,
+                                       SDL_WINDOWPOS_UNDEFINED, width, height,
+                                       SDL_WINDOW_SHOWN);
+    if (!scanout->window) {
+        fprintf(stderr,
+                "window_init_sw(): failed to create SDL window for display "
+                "0: %s\n"
+                "Running in headless mode.\n",
+                SDL_GetError());
+        headless_mode = true;
+        SDL_Quit();
+        sdl_initialized = false;
+        vgpu_display_set_unavailable();
+        return;
+    }
+
+    scanout->renderer =
+        SDL_CreateRenderer(scanout->window, -1, SDL_RENDERER_ACCELERATED);
+    if (!scanout->renderer) {
+        fprintf(stderr,
+                "window_init_sw(): accelerated renderer not available, "
+                "trying software renderer: %s\n",
+                SDL_GetError());
+        scanout->renderer =
+            SDL_CreateRenderer(scanout->window, -1, SDL_RENDERER_SOFTWARE);
+    }
+    if (!scanout->renderer) {
+        fprintf(stderr,
+                "window_init_sw(): failed to create renderer for display "
+                "0: %s\n"
+                "Running in headless mode.\n",
+                SDL_GetError());
+        SDL_DestroyWindow(scanout->window);
+        scanout->window = NULL;
+        headless_mode = true;
+        SDL_Quit();
+        sdl_initialized = false;
+        vgpu_display_set_unavailable();
+        return;
+    }
+
+    scanout->window_width = width;
+    scanout->window_height = height;
+    scanout->cursor_plane.alpha_blend = true;
+
+#if SEMU_HAS(VIRTIOINPUT)
+    if (!sdl_input_window)
+        sdl_input_window = scanout->window;
+#endif
+
+    SDL_SetRenderDrawColor(scanout->renderer, 0, 0, 0, 255);
+    SDL_RenderClear(scanout->renderer);
+    SDL_RenderPresent(scanout->renderer);
+#else /* !SEMU_HAS(VIRTIOGPU) */
+    sdl_input_window = SDL_CreateWindow("semu", SDL_WINDOWPOS_UNDEFINED,
+                                        SDL_WINDOWPOS_UNDEFINED, width, height,
+                                        SDL_WINDOW_SHOWN);
+    if (!sdl_input_window) {
         fprintf(stderr,
                 "window_init_sw(): failed to create SDL window: %s\n"
                 "Running in headless mode.\n",
                 SDL_GetError());
         headless_mode = true;
+        SDL_Quit();
+        sdl_initialized = false;
         return;
     }
+#endif
+}
+
+/* Release all SDL resources and reset the frontend's bookkeeping flags. */
+static void window_cleanup_sw(void)
+{
+#if SEMU_HAS(VIRTIOINPUT)
+    if (sdl_initialized)
+        window_set_mouse_grab_sw(false);
+    /* Keep cleanup idempotent when SDL was never initialized or grab release
+     * returned early.
+     */
+    mouse_grabbed = false;
+#endif
 
-    fprintf(stderr,
-            "semu: click window to capture mouse, Ctrl+Alt+G to "
-            "release\n");
+    wake_write_fd = -1;
+
+#if SEMU_HAS(VIRTIOGPU)
+    for (uint32_t i = 0; i < VIRTIO_GPU_MAX_SCANOUTS; i++)
+        sdl_scanout_info_cleanup(&sdl_scanouts[i]);
+
+    struct vgpu_display_cmd cmd;
+    while (vgpu_display_pop_cmd(&cmd))
+        vgpu_display_release_cmd(&cmd);
+#elif SEMU_HAS(VIRTIOINPUT)
+    if (sdl_input_window)
+        SDL_DestroyWindow(sdl_input_window);
+#endif
+#if SEMU_HAS(VIRTIOINPUT)
+    sdl_input_window = NULL;
+#endif
+
+    if (sdl_initialized) {
+        SDL_Quit();
+        sdl_initialized = false;
+    }
+
+    /* Reset frontend flags so a future re-init cannot inherit stale state.
+     * 'should_exit' is read and written atomically elsewhere; match that.
+     */
+    headless_mode = false;
+    __atomic_store_n(&should_exit, false, __ATOMIC_RELAXED);
 }
 
 const struct window_backend g_window = {
     .window_init = window_init_sw,
     .window_main_loop = window_main_loop_sw,
     .window_shutdown = window_shutdown_sw,
+    .window_cleanup = window_cleanup_sw,
     .window_is_closed = window_is_closed_sw,
     .window_set_wake_fd = window_set_wake_fd_sw,
     .window_wake_backend = window_wake_backend_sw,
+#if SEMU_HAS(VIRTIOINPUT)
     .window_set_mouse_grab = window_set_mouse_grab_sw,
     .window_is_mouse_grabbed = window_is_mouse_grabbed_sw,
+#endif /* SEMU_HAS(VIRTIOINPUT) */
 };
diff --git a/window.h b/window.h
index 5f2e51e0..16064a39 100644
--- a/window.h
+++ b/window.h
@@ -1,47 +1,55 @@
 #pragma once
 
 #include <stdbool.h>
+#include <stdint.h>
 
-#include "feature.h"
+#if SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU)
 
-#if SEMU_HAS(VIRTIOINPUT)
 struct window_backend {
     /* When headless is true, the backend skips SDL_Init / window creation and
      * behaves as if SDL had failed -- useful for batch runs (CI, 'make check')
      * that have no display attached.
+     * The caller also passes the default SDL window size. VirtIO-GPU builds
+     * use it as the initial scanout size; input-only builds use it for the
+     * grab target window because they do not have a display mode of their own.
      */
-    void (*window_init)(bool headless);
-    /* Main loop function that runs on the main thread (for macOS SDL2).
-     * If non-NULL, the emulator runs in a background thread while this
-     * function handles window events on the main thread.
+    void (*window_init)(bool headless, uint32_t width, uint32_t height);
+    /* Main loop function that runs on the main thread. If non-NULL, the
+     * emulator runs in a background thread while this function handles window
+     * events on the main thread.
      * Returns when the emulator should exit.
      */
     void (*window_main_loop)(void);
-    /* Called from the emulator thread when semu_run() returns, to unblock
-     * window_main_loop() so the main thread can proceed to pthread_join.
+    /* Called from the emulator thread when 'semu_run()' returns, to unblock
+     * 'window_main_loop()' so the main thread can proceed to 'pthread_join()'.
      */
     void (*window_shutdown)(void);
-    /* Returns true once the window has been closed (or SDL failed to
-     * initialize). Safe to call from any thread.
+    /* Release frontend resources after the emulator producer has stopped, or
+     * after initialization fails before the producer starts.
+     */
+    void (*window_cleanup)(void);
+    /* Returns true once the window has been closed (or initialization fell
+     * back to headless mode). Safe to call from any thread.
      */
     bool (*window_is_closed)(void);
-    /* Register the write end of a pipe to be written when the window shuts
-     * down. Must be called before window_main_loop().
+    /* Register the write end of the wake pipe used to break the emulator
+     * thread out of 'poll(-1)' when the backend queues work for it.
      */
     void (*window_set_wake_fd)(int fd);
-    /* Best-effort wakeup hook for the backend self-pipe. */
+    /* Best-effort wakeup hook for the emulator thread. The backend uses this
+     * after queuing work such as input events or shutdown requests.
+     */
     void (*window_wake_backend)(void);
-    /* Enable or disable SDL's relative mouse mode for the frontend window.
-     * When this returns with grab enabled, pointer motion is reported as
-     * relative deltas, the host cursor is hidden, and SDL confines the
-     * pointer to the semu window until the grab is released again.
+#if SEMU_HAS(VIRTIOINPUT)
+    /* Switch the backend between normal host-pointer mode and grabbed
+     * relative-pointer mode. Must be called from the main thread because it
+     * touches window-system state directly.
      */
     void (*window_set_mouse_grab)(bool grabbed);
-    /* Returns true once the frontend window currently owns the host mouse
-     * grab. Safe to call from the main thread while translating SDL events.
-     */
+    /* Returns true while the backend owns the host mouse grab. */
     bool (*window_is_mouse_grabbed)(void);
+#endif /* SEMU_HAS(VIRTIOINPUT) */
 };
 
 extern const struct window_backend g_window;
-#endif
+#endif /* SEMU_HAS(VIRTIOINPUT) || SEMU_HAS(VIRTIOGPU) */