1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -99,6 +99,7 @@ By @andyleiserson in [#9321](https://github.com/gfx-rs/wgpu/pull/9321).

#### Metal

- Add `metal::Queue::add_wait_event` / `add_signal_event` (with `remove_*` companions) to stage `MTLSharedEvent` waits/signals on the next `Queue::submit`, for GPU-side interop with foreign APIs (see the usage sketch after this list). Waits run on an internal CB committed before user CBs. By @AdrianEddy in [#9483](https://github.com/gfx-rs/wgpu/pull/9483).
- Unconditionally enable `Features::CLIP_DISTANCES`. By @ErichDonGubler in [#9270](https://github.com/gfx-rs/wgpu/pull/9270).
- Added full support for mesh shaders, including in WGSL shaders. By @inner-daemons in [#8739](https://github.com/gfx-rs/wgpu/pull/8739).
- Fixed structure field names incorrectly ignoring reserved keywords in the Metal (MSL) backend. By @39ali in [#9379](https://github.com/gfx-rs/wgpu/pull/9379).
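
A minimal usage sketch of the staged-event API at the `wgpu-hal` layer. The event names and values are illustrative, and obtaining the hal queue from a higher wgpu layer is out of scope here; assumes the `metal` backend feature of `wgpu-hal`:

```rust
use objc2::rc::Retained;
use objc2::runtime::ProtocolObject;
use objc2_metal::MTLSharedEvent;

// `queue` is the wgpu-hal Metal queue; `decoder_event` is signaled by a
// foreign producer (e.g. a video decoder), and `done_event` is observed
// by a foreign consumer waiting on our work.
fn stage_interop(
    queue: &wgpu_hal::metal::Queue,
    decoder_event: Retained<ProtocolObject<dyn MTLSharedEvent>>,
    done_event: Retained<ProtocolObject<dyn MTLSharedEvent>>,
    frame: u64,
) {
    // Gate the next submit on the producer reaching `frame`.
    queue.add_wait_event(decoder_event, frame);
    // Publish `frame` on `done_event` once that submit's work completes,
    // so the consumer can wait GPU-side instead of blocking the CPU.
    queue.add_signal_event(done_event, frame);
    // ...encode and submit as usual; whichever submit runs next on this
    // queue drains both staged lists.
}
```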
5 changes: 5 additions & 0 deletions wgpu-hal/src/metal/adapter.rs
@@ -11,6 +11,8 @@ use wgt::{AstcBlock, AstcChannel};

use alloc::{string::ToString as _, sync::Arc, vec::Vec};
use core::sync::atomic;
use parking_lot::Mutex;
use std::sync::OnceLock;

use crate::metal::QueueShared;

@@ -124,6 +126,9 @@ impl crate::Adapter for super::Adapter {
shared: Arc::new(QueueShared {
raw: queue,
command_buffer_created_not_submitted: atomic::AtomicUsize::new(0),
pending_waits: Mutex::new(Vec::new()),
pending_signals: Mutex::new(Vec::new()),
relay: OnceLock::new(),
}),
timestamp_period,
},
8 changes: 8 additions & 0 deletions wgpu-hal/src/metal/command.rs
@@ -457,6 +457,7 @@ impl crate::CommandEncoder for super::CommandEncoder {
unsafe fn begin_encoding(&mut self, label: crate::Label) -> Result<(), crate::DeviceError> {
let queue = &self.queue_shared.raw;
let retain_references = self.shared.settings.retain_command_buffer_references;
let relay = self.queue_shared.relay.get();

// Guard against exhausting Metal's command buffer budget. Use the hard
// limit (`MAX_COMMAND_BUFFERS`) so we fail before Metal can hang inside
@@ -487,6 +488,13 @@
if let Some(label) = label {
cmd_buf_ref.setLabel(Some(&NSString::from_str(label)));
}
// If strict event sync is enabled on this queue, gate the
// CB on the relay event at the value the next submit will
// signal. The CB pauses at the start until the relay fires.
if let Some(relay) = relay {
let expected = relay.next_release_value.load(atomic::Ordering::Acquire);
cmd_buf_ref.encodeWaitForEvent_value(relay.event.as_ref(), expected);
}
cmd_buf_ref.to_owned()
});

193 changes: 193 additions & 0 deletions wgpu-hal/src/metal/mod.rs
@@ -34,6 +34,7 @@ use alloc::{
vec::Vec,
};
use core::{fmt, iter, ops, ptr::NonNull, sync::atomic};
use std::sync::OnceLock;

use bitflags::bitflags;
use hashbrown::HashMap;
@@ -463,10 +464,140 @@ impl Queue {
shared: Arc::new(QueueShared {
raw,
command_buffer_created_not_submitted: atomic::AtomicUsize::new(0),
pending_waits: Mutex::new(Vec::new()),
pending_signals: Mutex::new(Vec::new()),
relay: OnceLock::new(),
}),
timestamp_period,
}
}

/// Returns the underlying `MTLCommandQueue` for direct interop with
/// other Metal-aware libraries.
pub fn as_raw(&self) -> &ProtocolObject<dyn MTLCommandQueue> {
&self.shared.raw
}

/// Enable strict GPU-side ordering for [`Self::add_wait_event`].
///
/// By default, `add_wait_event` encodes the wait on a separate
/// internal command buffer. Metal allows independent command
/// buffers in a queue to overlap on the GPU, so a wait CB does
/// not strictly gate subsequent user command buffers when those
/// CBs share no Metal-tracked resources with it. Single-stream
/// pipelines often serialize anyway because the GPU has no other
/// concurrent work to fill the slot, but mixed workloads (decode
/// + compute + render) can race.
///
/// When enabled, every [`crate::CommandEncoder::begin_encoding`]
/// pre-encodes a wait on an internal `MTLSharedEvent` at the start
/// of the new command buffer; every [`crate::Queue::submit`] then
/// signals that event after draining the staged external waits.
/// All command buffers since the previous submit are released in
/// lockstep once the foreign signals arrive, regardless of GPU
/// concurrency.
///
/// Costs one extra `encodeWaitForEvent` per command buffer plus
/// one extra internal command buffer per submit on this queue.
/// Other queues are unaffected.
///
Idempotent, and cannot be disabled: once enabled, the queue stays
in strict mode for its lifetime, since already-encoded command
buffers would be stranded if the relay stopped firing.
pub fn enable_strict_event_sync(&self) -> Result<(), crate::DeviceError> {
if self.shared.relay.get().is_some() {
return Ok(());
}
let event = self
.shared
.raw
.device()
.newSharedEvent()
.ok_or(crate::DeviceError::OutOfMemory)?;
let _ = self.shared.relay.set(Relay {
event,
next_release_value: atomic::AtomicU64::new(1),
commit_lock: Mutex::new(()),
});
Ok(())
}
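
A sketch of opting a mixed-workload queue into strict gating, under the same assumptions as the changelog sketch above (`decoder_event` and `frame_index` are illustrative):

```rust
use objc2::rc::Retained;
use objc2::runtime::ProtocolObject;
use objc2_metal::MTLSharedEvent;

fn gate_frame(
    queue: &wgpu_hal::metal::Queue,
    decoder_event: Retained<ProtocolObject<dyn MTLSharedEvent>>,
    frame_index: u64,
) -> Result<(), wgpu_hal::DeviceError> {
    // Opt in once; idempotent, so calling it every frame is harmless.
    queue.enable_strict_event_sync()?;
    // This wait now holds back every user CB of the next submit, even
    // when Metal would otherwise overlap them on the GPU.
    queue.add_wait_event(decoder_event, frame_index);
    Ok(())
}
```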

/// Stage an `MTLCommandBuffer::encodeWaitForEvent(event, value)` for
the next [`crate::Queue::submit`], letting the GPU wait on external
producers without blocking the CPU.
///
By default the wait is encoded onto a dedicated internal command
buffer committed before the submit's user CBs; this is best-effort
under cross-CB GPU concurrency (see [`Self::enable_strict_event_sync`]
for strict gating). With strict mode enabled, the wait is chained
/// through an internal relay event that gates every user command
/// buffer encoded since the previous submit.
///
/// Staging is queue-wide, not per-thread or per-submit: any
/// `add_wait_event` call is consumed by whichever
/// [`crate::Queue::submit`] runs next on this queue. If you stage
/// events from multiple threads, coordinate the staging and the
/// submit yourself, or another thread's submit may drain your
/// pending waits.
pub fn add_wait_event(&self, event: Retained<ProtocolObject<dyn MTLSharedEvent>>, value: u64) {
self.shared.pending_waits.lock().push((event, value));
}

Remove every pending staged wait for `event`, matched by object
identity. Returns `true` if at least one entry was removed.
pub fn remove_wait_event(&self, event: &ProtocolObject<dyn MTLSharedEvent>) -> bool {
let target: *const ProtocolObject<dyn MTLSharedEvent> = event;
let mut waits = self.shared.pending_waits.lock();
let before = waits.len();
waits.retain(|(e, _)| Retained::as_ptr(e) != target);
waits.len() != before
}

/// Stage an `MTLCommandBuffer::encodeSignalEvent(event, value)` for
/// the next [`crate::Queue::submit`]. The signal is encoded after
/// the submit's own completion signal, so a foreign API waiting on
/// `(event, value)` observes the wgpu work as done.
///
/// Staging is queue-wide, not per-thread or per-submit: see
/// [`Self::add_wait_event`] for the threading caveat.
pub fn add_signal_event(
&self,
event: Retained<ProtocolObject<dyn MTLSharedEvent>>,
value: u64,
) {
self.shared.pending_signals.lock().push((event, value));
}

Remove every pending staged signal for `event`, matched by object
identity. Returns `true` if at least one entry was removed.
pub fn remove_signal_event(&self, event: &ProtocolObject<dyn MTLSharedEvent>) -> bool {
let target: *const ProtocolObject<dyn MTLSharedEvent> = event;
let mut signals = self.shared.pending_signals.lock();
let before = signals.len();
signals.retain(|(e, _)| Retained::as_ptr(e) != target);
signals.len() != before
}
}
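
Given the queue-wide staging above, a multi-threaded renderer can keep its waits paired with its own submit by holding one lock across both. A sketch, with `SubmitGate` and the closure-based submit hook invented for illustration:

```rust
use objc2::rc::Retained;
use objc2::runtime::ProtocolObject;
use objc2_metal::MTLSharedEvent;
use parking_lot::Mutex;

// One gate shared by every thread that stages events or submits on the
// queue; staging and the matching submit happen under the same lock, so
// another thread's submit cannot drain these waits first.
struct SubmitGate {
    lock: Mutex<()>,
}

impl SubmitGate {
    fn stage_and_submit(
        &self,
        queue: &wgpu_hal::metal::Queue,
        event: Retained<ProtocolObject<dyn MTLSharedEvent>>,
        value: u64,
        submit: impl FnOnce(),
    ) {
        let _guard = self.lock.lock();
        queue.add_wait_event(event, value);
        submit(); // performs `Queue::submit` on the same queue
    }
}
```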

type PendingEvents = Mutex<Vec<(Retained<ProtocolObject<dyn MTLSharedEvent>>, u64)>>;

/// Internal relay used by [`Queue::enable_strict_event_sync`] to chain
/// staged waits across all CBs in a submit.
///
/// `begin_encoding` reads `next_release_value` and pre-encodes
/// `encodeWaitForEvent(event, expected)` at the start of each CB.
/// `submit` claims the value via `fetch_add`, encodes the foreign
/// waits + `encodeSignalEvent(event, claimed)` on a wait CB, and
/// commits it. `commit_lock` serializes the claim+commit pair so
/// concurrent submits land their signals in monotonic *commit* order
/// on the CPU. GPU-side execution of the resulting wait CBs may still
/// reorder under concurrency; see the comment in `submit` for why
/// that's harmless.
#[derive(Debug)]
struct Relay {
event: Retained<ProtocolObject<dyn MTLSharedEvent>>,
next_release_value: atomic::AtomicU64,
commit_lock: Mutex<()>,
}
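
The value handshake in isolation, as a toy model rather than the wgpu code (the variable names mirror the fields above):

```rust
use std::sync::atomic::{AtomicU64, Ordering};

fn main() {
    let next_release_value = AtomicU64::new(1);

    // Two `begin_encoding` calls before the next submit: both CBs read
    // the same expected value and encode waitForEvent(event, 1).
    let expected_a = next_release_value.load(Ordering::Acquire); // 1
    let expected_b = next_release_value.load(Ordering::Acquire); // 1

    // `submit` then claims the value and bumps the counter; the wait CB
    // encodes signalEvent(event, 1), releasing both CBs at once. CBs
    // encoded from now on wait for value 2 instead.
    let claimed = next_release_value.fetch_add(1, Ordering::AcqRel);
    assert_eq!((claimed, expected_a, expected_b), (1, 1, 1));
    assert_eq!(next_release_value.load(Ordering::Acquire), 2);
}
```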

#[derive(Debug)]
@@ -480,6 +611,9 @@ pub struct QueueShared {
// to create command buffers for internal purposes. In those cases we always
// commit the buffer immediately, so we don't adjust the counter for them.)
command_buffer_created_not_submitted: atomic::AtomicUsize,
pending_waits: PendingEvents,
pending_signals: PendingEvents,
relay: OnceLock<Relay>,
}

pub struct Device {
@@ -534,6 +668,55 @@ impl crate::Queue for Queue {
(signal_fence, signal_value): (&mut Fence, crate::FenceValue),
) -> Result<(), crate::DeviceError> {
autoreleasepool(|_| {
// Drain caller-staged waits onto a dedicated command buffer
// committed before the user CBs.
//
// When strict event sync is enabled, this CB also signals
// the relay event to release every user CB encoded since
// the previous submit (see `Queue::enable_strict_event_sync`).
// The `commit_lock` is held across `fetch_add` + `commit` so
// concurrent submits land their relay signals in monotonic
// *commit* order on the CPU side; otherwise a later-claimed
// (higher) signal could commit first, and the earlier (lower)
// signal committing after it would momentarily regress the
// relay's signaledValue.
//
// GPU-side ordering across independent wait CBs remains
// best-effort: Metal may run them in parallel, so a wait CB
// with foreign waits can fire its signal after a later
// submit's wait-free signal. CBs already released stay
// released (`MTLSharedEvent` waits are `>=`), and future
// submits' signals catch the value back up, so the regression
// is harmless, but users wanting strict GPU-side ordering
// across concurrent submits must serialize submits themselves.
//
// Without strict mode, we only emit a wait CB when there are
// pending waits, which keeps the common-case submit overhead-free.
{
let relay = self.shared.relay.get();
let mut waits = self.shared.pending_waits.lock();
if relay.is_some() || !waits.is_empty() {
let _commit_guard = relay.map(|r| r.commit_lock.lock());
// We do not bother adjusting `command_buffer_created_not_submitted`
// because we immediately commit this buffer.
let wait_cb = self
.shared
.raw
.commandBufferWithUnretainedReferences()
.unwrap();
wait_cb.setLabel(Some(ns_string!("(wgpu internal) Wait")));
for (event, value) in waits.drain(..) {
wait_cb.encodeWaitForEvent_value(event.as_ref(), value);
}
if let Some(relay) = relay {
let release = relay
.next_release_value
.fetch_add(1, atomic::Ordering::AcqRel);
wait_cb.encodeSignalEvent_value(relay.event.as_ref(), release);
}
wait_cb.commit();
}
}

let extra_command_buffer = {
let completed_value = Arc::clone(&signal_fence.completed_value);
let block = block2::RcBlock::new(move |_cmd_buf| {
@@ -562,6 +745,16 @@
if let Some(shared_event) = &signal_fence.shared_event {
raw.encodeSignalEvent_value(shared_event.as_ref(), signal_value);
}

// Drain caller-staged signals after our own signal so each
// additional event value publishes once the submit completes.
{
let mut signals = self.shared.pending_signals.lock();
for (event, value) in signals.drain(..) {
raw.encodeSignalEvent_value(event.as_ref(), value);
}
}

// only return an extra one if it's extra
match command_buffers.last() {
Some(_) => None,