From 175d41561c8a9543fcd148a179a77d4828148690 Mon Sep 17 00:00:00 2001
From: AdrianEddy
Date: Fri, 1 May 2026 23:57:21 +0200
Subject: [PATCH 1/2] [Metal/hal] Add `metal::Queue::add_wait_event` /
 `add_signal_event` for interop with external APIs

---
 CHANGELOG.md                  |   1 +
 wgpu-hal/src/metal/adapter.rs |   5 +
 wgpu-hal/src/metal/command.rs |   8 ++
 wgpu-hal/src/metal/mod.rs     | 193 ++++++++++++++++++++++++++++++++++
 4 files changed, 207 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c94b7aa225d..e5e223c300d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -99,6 +99,7 @@ By @andyleiserson in [#9321](https://github.com/gfx-rs/wgpu/pull/9321).
 
 #### Metal
 
+- Add `metal::Queue::add_wait_event` / `add_signal_event` (with `remove_*` companions) to stage `MTLSharedEvent` waits/signals on the next `Queue::submit`, for GPU-side interop with foreign APIs. Waits are encoded on an internal command buffer committed before the user command buffers. By @AdrianEddy in [#9483](https://github.com/gfx-rs/wgpu/pull/9483).
 - Unconditionally enable `Features::CLIP_DISTANCES`. By @ErichDonGubler in [#9270](https://github.com/gfx-rs/wgpu/pull/9270).
 - Added full support for mesh shaders, including in WGSL shaders. By @inner-daemons in [#8739](https://github.com/gfx-rs/wgpu/pull/8739).
 - Fixed structure field names incorrectly ignoring reserved keywords in the Metal (MSL) backend. By @39ali [#9379](https://github.com/gfx-rs/wgpu/pull/9379).
diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs
index 2a6c2bd2d87..34a159d899d 100644
--- a/wgpu-hal/src/metal/adapter.rs
+++ b/wgpu-hal/src/metal/adapter.rs
@@ -11,6 +11,8 @@ use wgt::{AstcBlock, AstcChannel};
 
 use alloc::{string::ToString as _, sync::Arc, vec::Vec};
 use core::sync::atomic;
+use std::sync::OnceLock;
+use parking_lot::Mutex;
 
 use crate::metal::QueueShared;
 
@@ -124,6 +126,9 @@ impl crate::Adapter for super::Adapter {
                 shared: Arc::new(QueueShared {
                     raw: queue,
                     command_buffer_created_not_submitted: atomic::AtomicUsize::new(0),
+                    pending_waits: Mutex::new(Vec::new()),
+                    pending_signals: Mutex::new(Vec::new()),
+                    relay: OnceLock::new(),
                 }),
                 timestamp_period,
             },
diff --git a/wgpu-hal/src/metal/command.rs b/wgpu-hal/src/metal/command.rs
index 9f2560df72f..447f07df1bd 100644
--- a/wgpu-hal/src/metal/command.rs
+++ b/wgpu-hal/src/metal/command.rs
@@ -457,6 +457,7 @@ impl crate::CommandEncoder for super::CommandEncoder {
     unsafe fn begin_encoding(&mut self, label: crate::Label) -> Result<(), crate::DeviceError> {
         let queue = &self.queue_shared.raw;
         let retain_references = self.shared.settings.retain_command_buffer_references;
+        let relay = self.queue_shared.relay.get();
 
         // Guard against exhausting Metal's command buffer budget. Use the hard
         // limit (`MAX_COMMAND_BUFFERS`) so we fail before Metal can hang inside
@@ -487,6 +488,13 @@ impl crate::CommandEncoder for super::CommandEncoder {
             if let Some(label) = label {
                 cmd_buf_ref.setLabel(Some(&NSString::from_str(label)));
             }
+            // If strict event sync is enabled on this queue, gate the
+            // CB on the relay event at the value the next submit will
+            // signal. The CB pauses at the start until the relay fires.
+            if let Some(relay) = relay {
+                let expected = relay.next_release_value.load(atomic::Ordering::Acquire);
+                cmd_buf_ref.encodeWaitForEvent_value(relay.event.as_ref(), expected);
+            }
             cmd_buf_ref.to_owned()
         });
diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs
index d7edd7ce6e3..11f8aea5d4c 100644
--- a/wgpu-hal/src/metal/mod.rs
+++ b/wgpu-hal/src/metal/mod.rs
@@ -34,6 +34,7 @@ use alloc::{
     vec::Vec,
 };
 use core::{fmt, iter, ops, ptr::NonNull, sync::atomic};
+use std::sync::OnceLock;
 
 use bitflags::bitflags;
 use hashbrown::HashMap;
@@ -463,10 +464,140 @@ impl Queue {
             shared: Arc::new(QueueShared {
                 raw,
                 command_buffer_created_not_submitted: atomic::AtomicUsize::new(0),
+                pending_waits: Mutex::new(Vec::new()),
+                pending_signals: Mutex::new(Vec::new()),
+                relay: OnceLock::new(),
             }),
             timestamp_period,
         }
     }
+
+    /// Returns the underlying `MTLCommandQueue` for direct interop with
+    /// other Metal-aware libraries.
+    pub fn as_raw(&self) -> &ProtocolObject<dyn MTLCommandQueue> {
+        &self.shared.raw
+    }
+
+    /// Enable strict GPU-side ordering for [`Self::add_wait_event`].
+    ///
+    /// By default, `add_wait_event` encodes the wait on a separate
+    /// internal command buffer. Metal allows independent command
+    /// buffers in a queue to overlap on the GPU, so a wait CB does
+    /// not strictly gate subsequent user command buffers when those
+    /// CBs share no Metal-tracked resources with it. Single-stream
+    /// pipelines often serialize anyway because the GPU has no other
+    /// concurrent work to fill the slot, but mixed workloads (decode
+    /// + compute + render) can race.
+    ///
+    /// When enabled, every [`crate::CommandEncoder::begin_encoding`]
+    /// pre-encodes a wait on an internal `MTLSharedEvent` at the start
+    /// of the new command buffer; every [`crate::Queue::submit`] then
+    /// signals that event after draining the staged external waits.
+    /// All command buffers since the previous submit are released in
+    /// lockstep once the foreign signals arrive, regardless of GPU
+    /// concurrency.
+    ///
+    /// This costs one extra `encodeWaitForEvent` per command buffer
+    /// plus one extra internal command buffer per submit on this
+    /// queue. Other queues are unaffected.
+    ///
+    /// This call is idempotent. Strict mode cannot be disabled: once
+    /// enabled, the queue stays in strict mode for its lifetime, since
+    /// already-encoded command buffers would be stranded if the relay
+    /// stopped firing.
+    pub fn enable_strict_event_sync(&self) -> Result<(), crate::DeviceError> {
+        if self.shared.relay.get().is_some() {
+            return Ok(());
+        }
+        let event = self
+            .shared
+            .raw
+            .device()
+            .newSharedEvent()
+            .ok_or(crate::DeviceError::OutOfMemory)?;
+        let _ = self.shared.relay.set(Relay {
+            event,
+            next_release_value: atomic::AtomicU64::new(1),
+            commit_lock: Mutex::new(()),
+        });
+        Ok(())
+    }
+
+    /// Stage an `MTLCommandBuffer::encodeWaitForEvent(event, value)`
+    /// for the next [`crate::Queue::submit`]. This lets the GPU wait
+    /// on external producers without blocking the CPU.
+    ///
+    /// By default the wait is encoded onto a dedicated internal command
+    /// buffer committed before the submit's user CBs; this is
+    /// best-effort under cross-CB GPU concurrency - see
+    /// [`Self::enable_strict_event_sync`] for strict gating. With
+    /// strict mode enabled, the wait is chained through an internal
+    /// relay event that gates every user command buffer encoded since
+    /// the previous submit.
+    ///
+    /// Staging is queue-wide, not per-thread or per-submit: any
+    /// `add_wait_event` call is consumed by whichever
+    /// [`crate::Queue::submit`] runs next on this queue. If you stage
+    /// events from multiple threads, coordinate the staging and the
+    /// submit yourself, or another thread's submit may drain your
+    /// pending waits.
+    pub fn add_wait_event(&self, event: Retained<ProtocolObject<dyn MTLSharedEvent>>, value: u64) {
+        self.shared.pending_waits.lock().push((event, value));
+    }
+
+    /// Remove `event` from the pending wait list if it is still present.
+    /// Returns `true` if it was found and removed.
+    pub fn remove_wait_event(&self, event: &ProtocolObject<dyn MTLSharedEvent>) -> bool {
+        let target: *const ProtocolObject<dyn MTLSharedEvent> = event;
+        let mut waits = self.shared.pending_waits.lock();
+        let before = waits.len();
+        waits.retain(|(e, _)| Retained::as_ptr(e) != target);
+        waits.len() != before
+    }
+
+    /// Stage an `MTLCommandBuffer::encodeSignalEvent(event, value)` for
+    /// the next [`crate::Queue::submit`]. The signal is encoded after
+    /// the submit's own completion signal, so a foreign API waiting on
+    /// `(event, value)` observes the wgpu work as done.
+    ///
+    /// Staging is queue-wide, not per-thread or per-submit: see
+    /// [`Self::add_wait_event`] for the threading caveat.
+    pub fn add_signal_event(
+        &self,
+        event: Retained<ProtocolObject<dyn MTLSharedEvent>>,
+        value: u64,
+    ) {
+        self.shared.pending_signals.lock().push((event, value));
+    }
+
+    /// Remove `event` from the pending signal list if it is still present.
+    /// Returns `true` if it was found and removed.
+    pub fn remove_signal_event(&self, event: &ProtocolObject<dyn MTLSharedEvent>) -> bool {
+        let target: *const ProtocolObject<dyn MTLSharedEvent> = event;
+        let mut signals = self.shared.pending_signals.lock();
+        let before = signals.len();
+        signals.retain(|(e, _)| Retained::as_ptr(e) != target);
+        signals.len() != before
+    }
+}
+
+type PendingEvents = Mutex<Vec<(Retained<ProtocolObject<dyn MTLSharedEvent>>, u64)>>;
+
+/// Internal relay used by [`Queue::enable_strict_event_sync`] to chain
+/// staged waits across all CBs in a submit.
+///
+/// `begin_encoding` reads `next_release_value` and pre-encodes
+/// `encodeWaitForEvent(event, expected)` at the start of each CB.
+/// `submit` claims the value via `fetch_add`, encodes the foreign
+/// waits + `encodeSignalEvent(event, claimed)` on a wait CB, and
+/// commits it. `commit_lock` serializes the claim+commit pair so
+/// concurrent submits land their signals in monotonic *commit* order
+/// on the CPU. GPU-side execution of the resulting wait CBs may still
+/// reorder under concurrency; see the comment in `submit` for why
+/// that's harmless.
+#[derive(Debug)]
+struct Relay {
+    event: Retained<ProtocolObject<dyn MTLSharedEvent>>,
+    next_release_value: atomic::AtomicU64,
+    commit_lock: Mutex<()>,
 }
 
 #[derive(Debug)]
@@ -480,6 +611,9 @@ pub struct QueueShared {
     // to create command buffers for internal purposes. In those cases we always
     // commit the buffer immediately, so we don't adjust the counter for them.)
     command_buffer_created_not_submitted: atomic::AtomicUsize,
+    pending_waits: PendingEvents,
+    pending_signals: PendingEvents,
+    relay: OnceLock<Relay>,
 }
 
 pub struct Device {
@@ -534,6 +668,55 @@ impl crate::Queue for Queue {
         (signal_fence, signal_value): (&mut Fence, crate::FenceValue),
     ) -> Result<(), crate::DeviceError> {
         autoreleasepool(|_| {
+            // Drain caller-staged waits onto a dedicated command buffer
+            // committed before the user CBs.
+            //
+            // When strict event sync is enabled, this CB also signals
+            // the relay event to release every user CB encoded since
+            // the previous submit (see `Queue::enable_strict_event_sync`).
+            //
+            // The `commit_lock` is held across `fetch_add` + `commit` so
+            // concurrent submits land their relay signals in monotonic
+            // *commit* order on the CPU side; otherwise a later-claimed
+            // signal could commit first, and the earlier-claimed signal
+            // committed after it would briefly regress the relay's
+            // `signaledValue`.
+            //
+            // GPU-side ordering across independent wait CBs remains
+            // best-effort: Metal may run them in parallel, so a wait CB
+            // with foreign waits can fire its signal after a later
+            // submit's wait-free signal. CBs already released stay
+            // released (`MTLSharedEvent` waits are `>=`), and future
+            // submits' signals catch the value back up, so the regression
+            // is harmless - but users wanting strict GPU-side ordering
+            // across concurrent submits must serialize submits themselves.
+            //
+            // Without strict mode, we only emit a wait CB when there are
+            // pending waits, which keeps the common-case submit overhead-free.
+            {
+                let relay = self.shared.relay.get();
+                let mut waits = self.shared.pending_waits.lock();
+                if relay.is_some() || !waits.is_empty() {
+                    let _commit_guard = relay.map(|r| r.commit_lock.lock());
+                    // We do not bother adjusting `command_buffer_created_not_submitted`
+                    // because we immediately commit this buffer.
+                    let wait_cb = self
+                        .shared
+                        .raw
+                        .commandBufferWithUnretainedReferences()
+                        .unwrap();
+                    wait_cb.setLabel(Some(ns_string!("(wgpu internal) Wait")));
+                    for (event, value) in waits.drain(..) {
+                        wait_cb.encodeWaitForEvent_value(event.as_ref(), value);
+                    }
+                    if let Some(relay) = relay {
+                        let release = relay
+                            .next_release_value
+                            .fetch_add(1, atomic::Ordering::AcqRel);
+                        wait_cb.encodeSignalEvent_value(relay.event.as_ref(), release);
+                    }
+                    wait_cb.commit();
+                }
+            }
+
             let extra_command_buffer = {
                 let completed_value = Arc::clone(&signal_fence.completed_value);
                 let block = block2::RcBlock::new(move |_cmd_buf| {
@@ -562,6 +745,16 @@ impl crate::Queue for Queue {
             if let Some(shared_event) = &signal_fence.shared_event {
                 raw.encodeSignalEvent_value(shared_event.as_ref(), signal_value);
             }
+
+            // Drain caller-staged signals after our own signal so each
+            // additional event value publishes once the submit completes.
+            {
+                let mut signals = self.shared.pending_signals.lock();
+                for (event, value) in signals.drain(..) {
+                    raw.encodeSignalEvent_value(event.as_ref(), value);
+                }
+            }
+
             // only return an extra one if it's extra
             match command_buffers.last() {
                 Some(_) => None,

From 6d662a85bcfe37f6cc78d5b142749247a040258a Mon Sep 17 00:00:00 2001
From: AdrianEddy
Date: Sat, 2 May 2026 00:14:48 +0200
Subject: [PATCH 2/2] Fix formatting

---
 wgpu-hal/src/metal/adapter.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wgpu-hal/src/metal/adapter.rs b/wgpu-hal/src/metal/adapter.rs
index 34a159d899d..e3e24532ef6 100644
--- a/wgpu-hal/src/metal/adapter.rs
+++ b/wgpu-hal/src/metal/adapter.rs
@@ -11,8 +11,8 @@ use wgt::{AstcBlock, AstcChannel};
 
 use alloc::{string::ToString as _, sync::Arc, vec::Vec};
 use core::sync::atomic;
-use std::sync::OnceLock;
 use parking_lot::Mutex;
+use std::sync::OnceLock;
 
 use crate::metal::QueueShared;
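
Usage sketch, not part of the patch: staging a per-frame handoff from a
foreign Metal producer (e.g. a video decoder) around one wgpu-hal submit.
This assumes only the API added above plus the objc2 / objc2-metal types it
already uses; `stage_frame_interop`, `decoder_done`, and `frame_index` are
hypothetical names for illustration.

    use objc2::rc::Retained;
    use objc2::runtime::ProtocolObject;
    use objc2_metal::MTLSharedEvent;

    /// Stage GPU-side ordering for one frame: the next submit waits for
    /// the decoder to signal `frame_index`, then publishes `frame_index + 1`
    /// once the wgpu work completes.
    fn stage_frame_interop(
        queue: &wgpu_hal::metal::Queue,
        decoder_done: &Retained<ProtocolObject<dyn MTLSharedEvent>>,
        frame_index: u64,
    ) -> Result<(), wgpu_hal::DeviceError> {
        // Optional: gate every command buffer GPU-side instead of relying
        // on the best-effort wait CB. Idempotent, so calling per frame is fine.
        queue.enable_strict_event_sync()?;
        // Both staged entries are consumed by whichever `Queue::submit`
        // runs next on this queue, so stage and submit on one thread.
        queue.add_wait_event(decoder_done.clone(), frame_index);
        queue.add_signal_event(decoder_done.clone(), frame_index + 1);
        Ok(())
    }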
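
The relay value protocol, modeled with plain atomics (not part of the patch):
a minimal sketch of why the encode-time `load` pairs with the submit-time
`fetch_add`. Encoders snapshot the value the *next* submit will signal, so a
single signal releases exactly the command buffers encoded before that submit.

    use std::sync::atomic::{AtomicU64, Ordering};

    fn main() {
        let next_release = AtomicU64::new(1);

        // begin_encoding: each new CB snapshots the value the next submit
        // will signal and would wait for the relay event to reach it.
        let gate_a = next_release.load(Ordering::Acquire);
        let gate_b = next_release.load(Ordering::Acquire);

        // submit: claim the snapshot value and bump the counter; signaling
        // the claimed value releases every CB encoded before this point.
        let signaled = next_release.fetch_add(1, Ordering::AcqRel);
        assert!(signaled >= gate_a && signaled >= gate_b);

        // CBs encoded after this submit gate on 2, the value the *next*
        // submit will signal, so they stay blocked until it runs.
        assert_eq!(next_release.load(Ordering::Acquire), 2);
    }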