From 175d41561c8a9543fcd148a179a77d4828148690 Mon Sep 17 00:00:00 2001
From: AdrianEddy
Date: Fri, 1 May 2026 23:57:21 +0200
Subject: [PATCH 1/2] [Metal/hal] Add `metal::Queue::add_wait_event` /
 `add_signal_event` for interop with external APIs

---
 CHANGELOG.md                  |   1 +
 wgpu-hal/src/metal/adapter.rs |   5 +
 wgpu-hal/src/metal/command.rs |   8 ++
 wgpu-hal/src/metal/mod.rs     | 193 ++++++++++++++++++++++++++++++++++
 4 files changed, 207 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c94b7aa225d..e5e223c300d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -99,6 +99,7 @@ By @andyleiserson in [#9321](https://github.com/gfx-rs/wgpu/pull/9321).
 
 #### Metal
 
+- Add `metal::Queue::add_wait_event` / `add_signal_event` (with `remove_*` companions) to stage `MTLSharedEvent` waits/signals on the next `Queue::submit`, for GPU-side interop with foreign APIs. Waits are encoded on an internal command buffer committed before the user command buffers. By @AdrianEddy in [#9483](https://github.com/gfx-rs/wgpu/pull/9483).
 - Unconditionally enable `Features::CLIP_DISTANCES`. By @ErichDonGubler in [#9270](https://github.com/gfx-rs/wgpu/pull/9270).
 - Added full support for mesh shaders, including in WGSL shaders. By @inner-daemons in [#8739](https://github.com/gfx-rs/wgpu/pull/8739).
 - Fixed structure field names incorrectly ignoring reserved keywords in the Metal (MSL) backend. By @39ali [#9379](https://github.com/gfx-rs/wgpu/pull/9379).
diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs
index 2a6c2bd2d87..34a159d899d 100644
--- a/wgpu-hal/src/metal/adapter.rs
+++ b/wgpu-hal/src/metal/adapter.rs
@@ -11,6 +11,8 @@ use wgt::{AstcBlock, AstcChannel};
 
 use alloc::{string::ToString as _, sync::Arc, vec::Vec};
 use core::sync::atomic;
+use std::sync::OnceLock;
+use parking_lot::Mutex;
 
 use crate::metal::QueueShared;
 
@@ -124,6 +126,9 @@ impl crate::Adapter for super::Adapter {
                 shared: Arc::new(QueueShared {
                     raw: queue,
                     command_buffer_created_not_submitted: atomic::AtomicUsize::new(0),
+                    pending_waits: Mutex::new(Vec::new()),
+                    pending_signals: Mutex::new(Vec::new()),
+                    relay: OnceLock::new(),
                 }),
                 timestamp_period,
             },
diff --git a/wgpu-hal/src/metal/command.rs b/wgpu-hal/src/metal/command.rs
index 9f2560df72f..447f07df1bd 100644
--- a/wgpu-hal/src/metal/command.rs
+++ b/wgpu-hal/src/metal/command.rs
@@ -457,6 +457,7 @@ impl crate::CommandEncoder for super::CommandEncoder {
     unsafe fn begin_encoding(&mut self, label: crate::Label) -> Result<(), crate::DeviceError> {
         let queue = &self.queue_shared.raw;
         let retain_references = self.shared.settings.retain_command_buffer_references;
+        let relay = self.queue_shared.relay.get();
 
         // Guard against exhausting Metal's command buffer budget. Use the hard
         // limit (`MAX_COMMAND_BUFFERS`) so we fail before Metal can hang inside
@@ -487,6 +488,13 @@ impl crate::CommandEncoder for super::CommandEncoder {
             if let Some(label) = label {
                 cmd_buf_ref.setLabel(Some(&NSString::from_str(label)));
             }
+            // If strict event sync is enabled on this queue, gate the
+            // CB on the relay event at the value the next submit will
+            // signal. The CB pauses at the start until the relay fires.
+            if let Some(relay) = relay {
+                let expected = relay.next_release_value.load(atomic::Ordering::Acquire);
+                cmd_buf_ref.encodeWaitForEvent_value(relay.event.as_ref(), expected);
+            }
             cmd_buf_ref.to_owned()
         });
diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs
index d7edd7ce6e3..11f8aea5d4c 100644
--- a/wgpu-hal/src/metal/mod.rs
+++ b/wgpu-hal/src/metal/mod.rs
@@ -34,6 +34,7 @@ use alloc::{
     vec::Vec,
 };
 use core::{fmt, iter, ops, ptr::NonNull, sync::atomic};
+use std::sync::OnceLock;
 
 use bitflags::bitflags;
 use hashbrown::HashMap;
@@ -463,10 +464,140 @@ impl Queue {
             shared: Arc::new(QueueShared {
                 raw,
                 command_buffer_created_not_submitted: atomic::AtomicUsize::new(0),
+                pending_waits: Mutex::new(Vec::new()),
+                pending_signals: Mutex::new(Vec::new()),
+                relay: OnceLock::new(),
             }),
             timestamp_period,
         }
     }
+
+    /// Returns the underlying `MTLCommandQueue` for direct interop with
+    /// other Metal-aware libraries.
+    pub fn as_raw(&self) -> &ProtocolObject<dyn MTLCommandQueue> {
+        &self.shared.raw
+    }
+
+    /// Enable strict GPU-side ordering for [`Self::add_wait_event`].
+    ///
+    /// By default, `add_wait_event` encodes the wait on a separate
+    /// internal command buffer. Metal allows independent command
+    /// buffers in a queue to overlap on the GPU, so a wait CB does
+    /// not strictly gate subsequent user command buffers when those
+    /// CBs share no Metal-tracked resources with it. Single-stream
+    /// pipelines often serialize anyway because the GPU has no other
+    /// concurrent work to fill the slot, but mixed workloads (decode
+    /// + compute + render) can race.
+    ///
+    /// When enabled, every [`crate::CommandEncoder::begin_encoding`]
+    /// pre-encodes a wait on an internal `MTLSharedEvent` at the start
+    /// of the new command buffer; every [`crate::Queue::submit`] then
+    /// signals that event after draining the staged external waits.
+    /// All command buffers since the previous submit are released in
+    /// lockstep once the foreign signals arrive, regardless of GPU
+    /// concurrency.
+    ///
+    /// This costs one extra `encodeWaitForEvent` per command buffer
+    /// plus one extra internal command buffer per submit on this
+    /// queue. Other queues are unaffected.
+    ///
+    /// This call is idempotent. Strict mode cannot be disabled: once
+    /// enabled, the queue stays in strict mode for its lifetime, since
+    /// already-encoded command buffers would be stranded if the relay
+    /// stopped firing.
+    pub fn enable_strict_event_sync(&self) -> Result<(), crate::DeviceError> {
+        if self.shared.relay.get().is_some() {
+            return Ok(());
+        }
+        let event = self
+            .shared
+            .raw
+            .device()
+            .newSharedEvent()
+            .ok_or(crate::DeviceError::OutOfMemory)?;
+        let _ = self.shared.relay.set(Relay {
+            event,
+            next_release_value: atomic::AtomicU64::new(1),
+            commit_lock: Mutex::new(()),
+        });
+        Ok(())
+    }
+
+    /// Stage an `MTLCommandBuffer::encodeWaitForEvent(event, value)`
+    /// for the next [`crate::Queue::submit`]. This lets the GPU wait
+    /// on external producers without blocking the CPU.
+    ///
+    /// By default the wait is encoded onto a dedicated internal command
+    /// buffer committed before the submit's user CBs; this is
+    /// best-effort under cross-CB GPU concurrency - see
+    /// [`Self::enable_strict_event_sync`] for strict gating. With
+    /// strict mode enabled, the wait is chained through an internal
+    /// relay event that gates every user command buffer encoded since
+    /// the previous submit.
+    ///
+    /// Staging is queue-wide, not per-thread or per-submit: any
+    /// `add_wait_event` call is consumed by whichever
+    /// [`crate::Queue::submit`] runs next on this queue. If you stage
+    /// events from multiple threads, coordinate the staging and the
+    /// submit yourself, or another thread's submit may drain your
+    /// pending waits.
+    pub fn add_wait_event(&self, event: Retained<ProtocolObject<dyn MTLSharedEvent>>, value: u64) {
+        self.shared.pending_waits.lock().push((event, value));
+    }
+
+    /// Remove `event` from the pending wait list if it is still present.
+    /// Returns `true` if it was found and removed.
+    pub fn remove_wait_event(&self, event: &ProtocolObject<dyn MTLSharedEvent>) -> bool {
+        let target: *const ProtocolObject<dyn MTLSharedEvent> = event;
+        let mut waits = self.shared.pending_waits.lock();
+        let before = waits.len();
+        waits.retain(|(e, _)| Retained::as_ptr(e) != target);
+        waits.len() != before
+    }
+
+    /// Stage an `MTLCommandBuffer::encodeSignalEvent(event, value)` for
+    /// the next [`crate::Queue::submit`]. The signal is encoded after
+    /// the submit's own completion signal, so a foreign API waiting on
+    /// `(event, value)` observes the wgpu work as done.
+    ///
+    /// Staging is queue-wide, not per-thread or per-submit: see
+    /// [`Self::add_wait_event`] for the threading caveat.
+    pub fn add_signal_event(
+        &self,
+        event: Retained<ProtocolObject<dyn MTLSharedEvent>>,
+        value: u64,
+    ) {
+        self.shared.pending_signals.lock().push((event, value));
+    }
+
+    /// Remove `event` from the pending signal list if it is still present.
+    /// Returns `true` if it was found and removed.
+    pub fn remove_signal_event(&self, event: &ProtocolObject<dyn MTLSharedEvent>) -> bool {
+        let target: *const ProtocolObject<dyn MTLSharedEvent> = event;
+        let mut signals = self.shared.pending_signals.lock();
+        let before = signals.len();
+        signals.retain(|(e, _)| Retained::as_ptr(e) != target);
+        signals.len() != before
+    }
+}
+
+type PendingEvents = Mutex<Vec<(Retained<ProtocolObject<dyn MTLSharedEvent>>, u64)>>;
+
+/// Internal relay used by [`Queue::enable_strict_event_sync`] to chain
+/// staged waits across all CBs in a submit.
+///
+/// `begin_encoding` reads `next_release_value` and pre-encodes
+/// `encodeWaitForEvent(event, expected)` at the start of each CB.
+/// `submit` claims the value via `fetch_add`, encodes the foreign
+/// waits + `encodeSignalEvent(event, claimed)` on a wait CB, and
+/// commits it. `commit_lock` serializes the claim+commit pair so
+/// concurrent submits land their signals in monotonic *commit* order
+/// on the CPU. GPU-side execution of the resulting wait CBs may still
+/// reorder under concurrency; see the comment in `submit` for why
+/// that's harmless.
+#[derive(Debug)]
+struct Relay {
+    event: Retained<ProtocolObject<dyn MTLSharedEvent>>,
+    next_release_value: atomic::AtomicU64,
+    commit_lock: Mutex<()>,
 }
 
 #[derive(Debug)]
@@ -480,6 +611,9 @@ pub struct QueueShared {
     // to create command buffers for internal purposes. In those cases we always
     // commit the buffer immediately, so we don't adjust the counter for them.)
     command_buffer_created_not_submitted: atomic::AtomicUsize,
+    pending_waits: PendingEvents,
+    pending_signals: PendingEvents,
+    relay: OnceLock<Relay>,
 }
 
 pub struct Device {
@@ -534,6 +668,55 @@ impl crate::Queue for Queue {
         (signal_fence, signal_value): (&mut Fence, crate::FenceValue),
     ) -> Result<(), crate::DeviceError> {
         autoreleasepool(|_| {
+            // Drain caller-staged waits onto a dedicated command buffer
+            // committed before the user CBs.
+            //
+            // When strict event sync is enabled, this CB also signals
+            // the relay event to release every user CB encoded since
+            // the previous submit (see `Queue::enable_strict_event_sync`).
+            //
+            // The `commit_lock` is held across `fetch_add` + `commit` so
+            // concurrent submits land their relay signals in monotonic
+            // *commit* order on the CPU side; otherwise a later-claimed
+            // signal could commit first, and the earlier-claimed signal
+            // committed after it would briefly regress the relay's
+            // `signaledValue`.
+            //
+            // GPU-side ordering across independent wait CBs remains
+            // best-effort: Metal may run them in parallel, so a wait CB
+            // with foreign waits can fire its signal after a later
+            // submit's wait-free signal. CBs already released stay
+            // released (`MTLSharedEvent` waits are `>=`), and future
+            // submits' signals catch the value back up, so the regression
+            // is harmless - but users wanting strict GPU-side ordering
+            // across concurrent submits must serialize submits themselves.
+            //
+            // Without strict mode, we only emit a wait CB when there are
+            // pending waits, which keeps the common-case submit overhead-free.
+            {
+                let relay = self.shared.relay.get();
+                let mut waits = self.shared.pending_waits.lock();
+                if relay.is_some() || !waits.is_empty() {
+                    let _commit_guard = relay.map(|r| r.commit_lock.lock());
+                    // We do not bother adjusting `command_buffer_created_not_submitted`
+                    // because we immediately commit this buffer.
+                    let wait_cb = self
+                        .shared
+                        .raw
+                        .commandBufferWithUnretainedReferences()
+                        .unwrap();
+                    wait_cb.setLabel(Some(ns_string!("(wgpu internal) Wait")));
+                    for (event, value) in waits.drain(..) {
+                        wait_cb.encodeWaitForEvent_value(event.as_ref(), value);
+                    }
+                    if let Some(relay) = relay {
+                        let release = relay
+                            .next_release_value
+                            .fetch_add(1, atomic::Ordering::AcqRel);
+                        wait_cb.encodeSignalEvent_value(relay.event.as_ref(), release);
+                    }
+                    wait_cb.commit();
+                }
+            }
+
             let extra_command_buffer = {
                 let completed_value = Arc::clone(&signal_fence.completed_value);
                 let block = block2::RcBlock::new(move |_cmd_buf| {
@@ -562,6 +745,16 @@ impl crate::Queue for Queue {
             if let Some(shared_event) = &signal_fence.shared_event {
                 raw.encodeSignalEvent_value(shared_event.as_ref(), signal_value);
             }
+
+            // Drain caller-staged signals after our own signal so each
+            // additional event value publishes once the submit completes.
+            {
+                let mut signals = self.shared.pending_signals.lock();
+                for (event, value) in signals.drain(..) {
+                    raw.encodeSignalEvent_value(event.as_ref(), value);
+                }
+            }
+
             // only return an extra one if it's extra
             match command_buffers.last() {
                 Some(_) => None,

From 6d662a85bcfe37f6cc78d5b142749247a040258a Mon Sep 17 00:00:00 2001
From: AdrianEddy
Date: Sat, 2 May 2026 00:14:48 +0200
Subject: [PATCH 2/2] Fix formatting

---
 wgpu-hal/src/metal/adapter.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs
index 34a159d899d..e3e24532ef6 100644
--- a/wgpu-hal/src/metal/adapter.rs
+++ b/wgpu-hal/src/metal/adapter.rs
@@ -11,8 +11,8 @@ use wgt::{AstcBlock, AstcChannel};
 
 use alloc::{string::ToString as _, sync::Arc, vec::Vec};
 use core::sync::atomic;
-use std::sync::OnceLock;
 use parking_lot::Mutex;
+use std::sync::OnceLock;
 
 use crate::metal::QueueShared;
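
Usage sketch, not part of the patch: staging a per-frame handoff from a
foreign Metal producer (e.g. a video decoder) around one wgpu-hal submit.
This assumes only the API added above plus the objc2 / objc2-metal types it
already uses; `stage_frame_interop`, `decoder_done`, and `frame_index` are
hypothetical names for illustration.

    use objc2::rc::Retained;
    use objc2::runtime::ProtocolObject;
    use objc2_metal::MTLSharedEvent;

    /// Stage GPU-side ordering for one frame: the next submit waits for
    /// the decoder to signal `frame_index`, then publishes `frame_index + 1`
    /// once the wgpu work completes.
    fn stage_frame_interop(
        queue: &wgpu_hal::metal::Queue,
        decoder_done: &Retained<ProtocolObject<dyn MTLSharedEvent>>,
        frame_index: u64,
    ) -> Result<(), wgpu_hal::DeviceError> {
        // Optional: gate every command buffer GPU-side instead of relying
        // on the best-effort wait CB. Idempotent, so calling per frame is fine.
        queue.enable_strict_event_sync()?;
        // Both staged entries are consumed by whichever `Queue::submit`
        // runs next on this queue, so stage and submit on one thread.
        queue.add_wait_event(decoder_done.clone(), frame_index);
        queue.add_signal_event(decoder_done.clone(), frame_index + 1);
        Ok(())
    }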
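
The relay value protocol, modeled with plain atomics (not part of the patch):
a minimal sketch of why the encode-time `load` pairs with the submit-time
`fetch_add`. Encoders snapshot the value the *next* submit will signal, so a
single signal releases exactly the command buffers encoded before that submit.

    use std::sync::atomic::{AtomicU64, Ordering};

    fn main() {
        let next_release = AtomicU64::new(1);

        // begin_encoding: each new CB snapshots the value the next submit
        // will signal and would wait for the relay event to reach it.
        let gate_a = next_release.load(Ordering::Acquire);
        let gate_b = next_release.load(Ordering::Acquire);

        // submit: claim the snapshot value and bump the counter; signaling
        // the claimed value releases every CB encoded before this point.
        let signaled = next_release.fetch_add(1, Ordering::AcqRel);
        assert!(signaled >= gate_a && signaled >= gate_b);

        // CBs encoded after this submit gate on 2, the value the *next*
        // submit will signal, so they stay blocked until it runs.
        assert_eq!(next_release.load(Ordering::Acquire), 2);
    }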