diff --git a/CHANGELOG.md b/CHANGELOG.md
index 088a818edda..0ae654af548 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -160,6 +160,7 @@ By @andyleiserson in [#9321](https://github.com/gfx-rs/wgpu/pull/9321).
 
 - Fix `SYNC-HAZARD-WRITE-AFTER-PRESENT` on Vulkan when a surface texture is presented without being rendered to. By @inner-daemons and @atlv24 in [#9361](https://github.com/gfx-rs/wgpu/pull/9361).
 - Fix incorrect checks for dynamic binding bounds when calling an encoder's `set_bind_group` in passes and bundles. By @ErichDonGubler in [#9308](https://github.com/gfx-rs/wgpu/pull/9308).
+- Writes from `Queue::write_buffer` are now flushed by calls to `Buffer::map_async` for that same buffer, to prevent reading stale data. `on_submitted_work_done` also now flushes pending writes. By @andyleiserson in [#9307](https://github.com/gfx-rs/wgpu/pull/9307).
 
 #### naga
 
diff --git a/cts_runner/test.lst b/cts_runner/test.lst
index 8c3f0e5eff8..9104b25305c 100644
--- a/cts_runner/test.lst
+++ b/cts_runner/test.lst
@@ -12,6 +12,8 @@ unittests:*
 
 webgpu:api,operation,adapter,requestAdapter:*
 webgpu:api,operation,buffers,createBindGroup:buffer_binding_resource:*
+webgpu:api,operation,buffers,map:mapAsync,read,*
+webgpu:api,operation,buffers,map:mapAsync,write,*
 webgpu:api,operation,command_buffer,basic:*
 webgpu:api,operation,command_buffer,copyBufferToBuffer:*
 fails-if(vulkan) webgpu:api,operation,command_buffer,copyTextureToTexture:copy_depth_stencil:format="depth16unorm"
diff --git a/tests/tests/wgpu-gpu/buffer.rs b/tests/tests/wgpu-gpu/buffer.rs
index 598c4326025..c1c592d0251 100644
--- a/tests/tests/wgpu-gpu/buffer.rs
+++ b/tests/tests/wgpu-gpu/buffer.rs
@@ -6,6 +6,7 @@ pub fn all_tests(vec: &mut Vec<GpuTestInitializer>) {
     vec.extend([
         EMPTY_BUFFER,
         MAP_OFFSET,
+        MAP_WITHOUT_SUBMIT,
         MINIMUM_BUFFER_BINDING_SIZE_LAYOUT,
         MINIMUM_BUFFER_BINDING_SIZE_DISPATCH,
         CLEAR_OFFSET_OUTSIDE_RESOURCE_BOUNDS,
@@ -184,6 +185,48 @@ static MAP_OFFSET: GpuTestConfiguration = GpuTestConfiguration::new()
         }
     });
 
+/// Mapping a buffer should see data previously written to the buffer, even if there was no
+/// intervening submit.
+///
+/// Regression test for [#5173](https://github.com/gfx-rs/wgpu/issues/5173).
+#[gpu_test]
+static MAP_WITHOUT_SUBMIT: GpuTestConfiguration =
+    GpuTestConfiguration::new().run_async(|ctx| async move {
+        let buffer = ctx.device.create_buffer(&wgpu::BufferDescriptor {
+            label: None,
+            size: 12,
+            usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
+            mapped_at_creation: true,
+        });
+
+        {
+            let data = (0..12).map(|i| (i % 255) as u8).collect::<Vec<u8>>();
+            let mut mapped = buffer.slice(0..12).get_mapped_range_mut().unwrap();
+            assert!(mapped.len() == 12);
+            mapped.copy_from_slice(&data);
+        }
+
+        buffer.unmap();
+
+        buffer
+            .slice(0..12)
+            .map_async(wgpu::MapMode::Read, Result::unwrap);
+
+        ctx.async_poll(wgpu::PollType::wait_indefinitely())
+            .await
+            .unwrap();
+
+        {
+            let mapped = buffer.slice(0..12).get_mapped_range().unwrap();
+            assert!(mapped.len() == 12);
+            for (i, elt) in mapped.iter().enumerate() {
+                assert_eq!(*elt, (i % 255) as u8);
+            }
+        }
+
+        buffer.unmap();
+    });
+
 /// The WebGPU algorithm [validating shader binding][vsb] requires
 /// implementations to check that buffer bindings are large enough to
 /// hold the WGSL `storage` or `uniform` variables they're bound to.
diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs
index 864803aa107..ab3045b77d7 100644
--- a/wgpu-core/src/device/mod.rs
+++ b/wgpu-core/src/device/mod.rs
@@ -28,6 +28,7 @@ pub mod ray_tracing;
 pub mod resource;
 #[cfg(any(feature = "trace", feature = "replay"))]
 pub mod trace;
+pub(crate) use resource::{FenceReadGuard, FenceWriteGuard};
 pub use {life::WaitIdleError, resource::Device};
 
 pub const SHADER_STAGE_COUNT: usize = hal::MAX_CONCURRENT_SHADER_STAGES;
diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs
index 4d41081e832..602a1b107a0 100644
--- a/wgpu-core/src/device/queue.rs
+++ b/wgpu-core/src/device/queue.rs
@@ -24,7 +24,7 @@ use crate::{
         CommandAllocator, CommandBuffer, CommandEncoder, CommandEncoderError, CopySide,
         TransferError,
     },
-    device::{DeviceError, WaitIdleError},
+    device::{DeviceError, FenceReadGuard, FenceWriteGuard, WaitIdleError},
     get_lowest_common_denom,
     global::Global,
     hal_label,
@@ -125,8 +125,9 @@ impl Queue {
         &self,
         texture: &Arc<Texture>,
     ) -> Result<(), DeviceError> {
+        let snatch_guard = self.device.snatchable_lock.read();
+        let (mut submission, _index) = self.allocate_submission(snatch_guard);
         let device = &self.device;
-        let snatch_guard = device.snatchable_lock.read();
 
         // If the texture is uninitialized it needs to be cleared before presenting
         let needs_clear = {
@@ -137,8 +138,6 @@ impl Queue {
                 .is_some_and(|mip| mip.check(0..1).is_some())
         };
 
-        // Note required lock order: snatch -> fence -> pending writes
-        let mut fence = device.fence.write();
         let mut pending_writes = self.pending_writes.lock();
 
         if needs_clear {
@@ -154,7 +153,7 @@ impl Queue {
                 &mut trackers.textures,
                 &device.alignments,
                 device.zero_buffer.as_ref(),
-                &snatch_guard,
+                &submission.snatch_guard,
                 device.instance_flags,
             )
             .map_err(|e| match e {
@@ -189,7 +188,7 @@ impl Queue {
         // Emit the transition barriers to PRESENT.
         {
             let raw_texture = texture
-                .try_raw(&snatch_guard)
+                .try_raw(&submission.snatch_guard)
                 .map_err(|_| DeviceError::Lost)?;
             let barriers: Vec<hal::TextureBarrier<'_, dyn hal::DynTexture>> = pending
                 .into_iter()
@@ -210,23 +209,13 @@ impl Queue {
         pending_writes.insert_texture(texture);
 
         // Flush pending writes through the standard submission path.
-        let mut surface_textures = FastHashMap::default();
-        surface_textures.insert(Arc::as_ptr(texture), texture.clone());
+        submission
+            .surface_textures
+            .insert(Arc::as_ptr(texture), texture.clone());
 
-        let submit_index = {
-            let mut indices = device.command_indices.write();
-            indices.active_submission_index += 1;
-            indices.active_submission_index
-        };
+        submission.submit(pending_writes)?;
 
-        self.submit_with_pending_writes(
-            pending_writes,
-            Vec::new(),
-            surface_textures,
-            fence.as_mut(),
-            submit_index,
-            &snatch_guard,
-        )
+        Ok(())
     }
 
     pub(crate) fn maintain(
@@ -359,9 +348,13 @@ pub(crate) struct EncoderInFlight {
 ///
 /// Instead, `Device::pending_writes` owns one of these values, which
 /// has its own hal command encoder and resource lists. The commands
-/// accumulated here are automatically submitted to the queue the next
-/// time the user submits a wgpu command buffer, ahead of the user's
-/// commands.
+/// accumulated here are automatically submitted to the queue at the
+/// sooner of:
+///
+/// 1. The user's next wgpu command buffer submission. (Pending writes
+///    are inserted ahead of the user's commands.)
+/// 2. The next `mapAsync` request for a buffer that has pending
+///    writes.
 ///
 /// Important:
 /// When locking pending_writes be sure that tracker is not locked
@@ -541,8 +534,6 @@ pub enum QueueSubmitError {
     Queue(#[from] DeviceError),
     #[error(transparent)]
     DestroyedResource(#[from] DestroyedResourceError),
-    #[error(transparent)]
-    Unmap(#[from] BufferAccessError),
     #[error("{0} is still mapped")]
     BufferStillMapped(ResourceErrorIdent),
     #[error(transparent)]
@@ -557,7 +548,6 @@ impl WebGpuError for QueueSubmitError {
     fn webgpu_error_type(&self) -> ErrorType {
         match self {
             Self::Queue(e) => e.webgpu_error_type(),
-            Self::Unmap(e) => e.webgpu_error_type(),
             Self::CommandEncoder(e) => e.webgpu_error_type(),
             Self::ValidateAsActionsError(e) => e.webgpu_error_type(),
             Self::InvalidResource(e) => e.webgpu_error_type(),
@@ -566,6 +556,41 @@ impl WebGpuError for QueueSubmitError {
     }
 }
 
+/// A partially-assembled submission.
+///
+/// Returned from [`Queue::allocate_submission`] and consumed by [`submit`].
+/// These are internal APIs used in `Queue::submit` and other places within
+/// `wgpu-core` that need to submit work.
+///
+/// [`submit`]: `PendingSubmission::submit`
+pub(crate) struct PendingSubmission<'a> {
+    queue: &'a Queue,
+    snatch_guard: SnatchGuard<'a>,
+    fence: FenceWriteGuard<'a>,
+    command_index_guard: RwLockWriteGuard<'a, CommandIndices>,
+    // Command buffers to be executed, along with trackers for the resources they use.
+    pub executions: Vec<EncoderInFlight>,
+    // Surface textures referenced by command buffers in this submission. These need to be
+    // passed to the HAL `submit` call. Deduplicated using a hashmap to avoid vulkan
+    // deadlocking from the same surface texture being submitted multiple times.
+    pub surface_textures: FastHashMap<*const Texture, Arc<Texture>>,
+    pub index: SubmissionIndex,
+}
+
+pub(crate) struct SubmissionResult<'a> {
+    pub fence: FenceReadGuard<'a>,
+    pub snatch_guard: SnatchGuard<'a>,
+}
+
+impl<'a> PendingSubmission<'a> {
+    fn submit(
+        self,
+        pending_writes: MutexGuard<'a, PendingWrites>,
+    ) -> Result<SubmissionResult<'a>, DeviceError> {
+        self.queue.submit_pending_submission(pending_writes, self)
+    }
+}
+
 //TODO: move out common parts of write_xxx.
 
 impl Queue {
@@ -1222,6 +1247,36 @@ impl Queue {
         Ok(())
     }
 
+    /// Flush `PendingWrites` if it contains a write to `buffer`.
+    pub fn flush_writes_for_buffer(
+        &self,
+        buffer: &Arc<Buffer>,
+        snatch_guard: SnatchGuard,
+    ) -> Result<(), BufferAccessError> {
+        let (submission, _index) = self.allocate_submission(snatch_guard);
+
+        let pending_writes = self.pending_writes.lock();
+        if !pending_writes.contains_buffer(buffer) {
+            return Ok(());
+        }
+
+        submission.submit(pending_writes)?;
+
+        Ok(())
+    }
+
+    fn flush_pending_writes(&self) -> Result<Option<SubmissionIndex>, DeviceError> {
+        let snatch_guard = self.device.snatchable_lock.read();
+        let (submission, submit_index) = self.allocate_submission(snatch_guard);
+        let pending_writes = self.pending_writes.lock();
+        if pending_writes.is_recording {
+            submission.submit(pending_writes)?;
+            Ok(Some(submit_index))
+        } else {
+            Ok(None)
+        }
+    }
+
     #[cfg(feature = "trace")]
     fn trace_submission(
         &self,
@@ -1256,30 +1311,16 @@ impl Queue {
         profiling::scope!("Queue::submit");
         api_log!("Queue::submit");
 
-        let submit_index;
+        let snatch_guard = self.device.snatchable_lock.read();
+        let (mut submission, submit_index) = self.allocate_submission(snatch_guard);
 
         let res = 'error: {
-            let snatch_guard = self.device.snatchable_lock.read();
-
-            // Fence lock must be acquired after the snatch lock everywhere to avoid deadlocks.
-            let mut fence = self.device.fence.write();
-
-            let mut command_index_guard = self.device.command_indices.write();
-            command_index_guard.active_submission_index += 1;
-            submit_index = command_index_guard.active_submission_index;
-
             if let Err(e) = self.device.check_is_valid() {
                 break 'error Err(e.into());
             }
 
-            let mut active_executions = Vec::new();
-
             let mut used_surface_textures = track::TextureUsageScope::default();
 
-            // Use a hashmap here to deduplicate the surface textures that are used in the command buffers.
-            // This avoids vulkan deadlocking from the same surface texture being submitted multiple times.
-            let mut submit_surface_textures_owned = FastHashMap::default();
-
             {
                 if !command_buffers.is_empty() {
                     profiling::scope!("prepare");
@@ -1319,10 +1360,10 @@ impl Queue {
                                     command_buffer,
                                     self,
                                     &cmd_buf_data,
-                                    &snatch_guard,
-                                    &mut submit_surface_textures_owned,
+                                    &submission.snatch_guard,
+                                    &mut submission.surface_textures,
                                     &mut used_surface_textures,
-                                    &mut command_index_guard,
+                                    &mut submission.command_index_guard,
                                 );
                                 if let Err(err) = res {
                                     #[cfg(feature = "trace")]
@@ -1340,7 +1381,9 @@ impl Queue {
                                     self.trace_submission(submit_index, commands);
                                 }
 
-                                cmd_buf_data.set_acceleration_structure_dependencies(&snatch_guard);
+                                cmd_buf_data.set_acceleration_structure_dependencies(
+                                    &submission.snatch_guard,
+                                );
                                 cmd_buf_data.into_baked_commands()
                             }
                             Err(err) => {
@@ -1365,14 +1408,15 @@ impl Queue {
 
                         //Note: locking the trackers has to be done after the storages
                         let mut trackers = self.device.trackers.lock();
-                        if let Err(e) = baked.initialize_buffer_memory(&mut trackers, &snatch_guard)
+                        if let Err(e) =
+                            baked.initialize_buffer_memory(&mut trackers, &submission.snatch_guard)
                         {
                             break 'error Err(e.into());
                         }
                         if let Err(e) = baked.initialize_texture_memory(
                             &mut trackers,
                             &self.device,
-                            &snatch_guard,
+                            &submission.snatch_guard,
                         ) {
                             break 'error Err(e.into());
                         }
@@ -1383,7 +1427,7 @@ impl Queue {
                             baked.encoder.raw.as_mut(),
                             &mut trackers,
                             &baked.trackers,
-                            &snatch_guard,
+                            &submission.snatch_guard,
                         );
 
                         if let Err(e) = baked.encoder.close_and_push_front() {
@@ -1404,7 +1448,7 @@ impl Queue {
                                 .textures
                                 .set_from_usage_scope_and_drain_transitions(
                                     &used_surface_textures,
-                                    &snatch_guard,
+                                    &submission.snatch_guard,
                                 )
                                 .collect::<Vec<_>>();
                             unsafe {
@@ -1417,7 +1461,7 @@ impl Queue {
                         }
 
                         // done
-                        active_executions.push(EncoderInFlight {
+                        submission.executions.push(EncoderInFlight {
                             inner: baked.encoder,
                             trackers: baked.trackers,
                             temp_resources: baked.temp_resources,
@@ -1437,27 +1481,22 @@ impl Queue {
 
             let pending_writes = self.pending_writes.lock();
 
-            if let Err(e) = self.submit_with_pending_writes(
-                pending_writes,
-                active_executions,
-                submit_surface_textures_owned,
-                fence.as_mut(),
-                submit_index,
-                &snatch_guard,
-            ) {
-                break 'error Err(e.into());
-            }
-
-            drop(command_index_guard);
+            let SubmissionResult {
+                fence,
+                snatch_guard,
+            } = match submission.submit(pending_writes) {
+                Ok(result) => result,
+                Err(e) => break 'error Err(e.into()),
+            };
 
             profiling::scope!("cleanup");
 
             // This will schedule destruction of all resources that are no longer needed
             // by the user but used in the command stream, among other things.
-            let fence_guard = RwLockWriteGuard::downgrade(fence);
-            let (closures, result) =
-                self.device
-                    .maintain(fence_guard, wgt::PollType::Poll, snatch_guard);
+            // `device.maintain` consumes and will release the snatch guard.
+            let (closures, result) = self
+                .device
+                .maintain(fence, wgt::PollType::Poll, snatch_guard);
             match result {
                 Ok(status) => {
                     debug_assert!(matches!(
@@ -1490,22 +1529,88 @@ impl Queue {
         Ok(submit_index)
     }
 
-    /// Flush pending writes and any additional command encoders as a HAL submission.
+    /// Allocate a submission index and prepare for a submission.
+    ///
+    /// This is an internal API used in [`Queue::submit`] and other places within
+    /// `wgpu-core` that need to submit work.
+    ///
+    /// Returns the index and a [`PendingSubmission`].
+    ///
+    /// The caller passes in the already-acquired [`SnatchGuard`]. This function acquires
+    /// the fence lock and the command index lock.
+    ///
+    /// The caller should update the [`PendingSubmission`] members `executions` and
+    /// `surface_textures` with details of the submission.
+    ///
+    /// To finalize and submit the submission, call [`PendingSubmission::submit`] (which is
+    /// a convenience wrapper around [`Queue::submit_pending_submission`]).
+    ///
+    /// After calling this function and before submitting, the caller must acquire the
+    /// pending writes lock, and pass it to `submit`.
+    ///
+    /// It is also acceptable to drop the `PendingSubmission` without submitting. This may
+    /// be necessary when locks are required to access the state that determines whether a
+    /// submission is needed.
+    fn allocate_submission<'a>(
+        &'a self,
+        snatch_guard: SnatchGuard<'a>,
+    ) -> (PendingSubmission<'a>, SubmissionIndex) {
+        // Lock ordering requires that the fence lock be acquired after the snatch lock and
+        // before the command index lock.
+        let fence = self.device.fence.write();
+
+        let mut command_index_guard = self.device.command_indices.write();
+        command_index_guard.active_submission_index += 1;
+        let index = command_index_guard.active_submission_index;
+
+        let submission = PendingSubmission {
+            queue: self,
+            snatch_guard,
+            fence,
+            command_index_guard,
+            executions: Vec::new(),
+            surface_textures: FastHashMap::default(),
+            index,
+        };
+
+        (submission, index)
+    }
+
+    /// Finalize and submit a [`PendingSubmission`] that was returned by
+    /// [`Queue::allocate_submission`].
+    ///
+    /// This is an internal API used in `Queue::submit` and other places within
+    /// `wgpu-core` that need to submit work. See [`Queue::allocate_submission`]
+    /// for more details.
+    ///
+    /// This function:
     ///
-    /// Advances `last_successful_submission_index` and registers the submission with the lifetime tracker.
-    fn submit_with_pending_writes(
+    /// - Performs a HAL submission of the pending writes command
+    ///   encoder and any other command encoders that were added to the
+    ///   [`PendingSubmission`].
+    /// - Advances `last_successful_submission_index` and registers the
+    ///   submission with the lifetime tracker.
+    /// - Returns a [`SubmissionResult`], which contains the snatch guard
+    ///   and a downgraded [`FenceReadGuard`].
+    fn submit_pending_submission<'a>(
         &self,
         mut pending_writes: MutexGuard<'_, PendingWrites>,
-        mut active_executions: Vec<EncoderInFlight>,
-        mut surface_textures: FastHashMap<*const Texture, Arc<Texture>>,
-        fence: &mut dyn hal::DynFence,
-        submit_index: SubmissionIndex,
-        snatch_guard: &SnatchGuard,
-    ) -> Result<(), DeviceError> {
+        prepared: PendingSubmission<'a>,
+    ) -> Result<SubmissionResult<'a>, DeviceError> {
+        let PendingSubmission {
+            queue: _,
+            snatch_guard,
+            mut fence,
+            command_index_guard,
+            mut executions,
+            mut surface_textures,
+            index: submit_index,
+        } = prepared;
+
         let mut used_surface_textures = track::TextureUsageScope::default();
         used_surface_textures.set_size(self.device.tracker_indices.textures.size());
         for texture in pending_writes.dst_textures.values() {
-            match texture.try_inner(snatch_guard) {
+            match texture.try_inner(&snatch_guard) {
                 Ok(TextureInner::Native { .. }) => {}
                 Ok(TextureInner::Surface { .. }) => {
                     // Compare the Arcs by pointer as Textures don't implement Eq
@@ -1530,7 +1635,7 @@ impl Queue {
 
             let texture_barriers = trackers
                 .textures
-                .set_from_usage_scope_and_drain_transitions(&used_surface_textures, snatch_guard)
+                .set_from_usage_scope_and_drain_transitions(&used_surface_textures, &snatch_guard)
                 .collect::<Vec<_>>();
             unsafe {
                 pending_writes
@@ -1541,12 +1646,12 @@ impl Queue {
 
         match pending_writes.pre_submit(&self.device.command_allocator, &self.device, self) {
             Ok(Some(pending_execution)) => {
-                active_executions.insert(0, pending_execution);
+                executions.insert(0, pending_execution);
             }
             Ok(None) => {}
             Err(e) => return Err(e),
         }
-        let hal_command_buffers = active_executions
+        let hal_command_buffers = executions
             .iter()
             .flat_map(|e| e.inner.list.iter().map(|b| b.as_ref()))
             .collect::<Vec<_>>();
@@ -1555,7 +1660,7 @@ impl Queue {
             let mut submit_surface_textures =
                 SmallVec::<[&dyn hal::DynSurfaceTexture; 2]>::with_capacity(surface_textures.len());
             for texture in surface_textures.values() {
-                let raw = match texture.inner.get(snatch_guard) {
+                let raw = match texture.inner.get(&snatch_guard) {
                     Some(TextureInner::Surface { raw, .. }) => raw.as_ref(),
                     _ => unreachable!(),
                 };
@@ -1566,24 +1671,30 @@ impl Queue {
                 self.raw().submit(
                     &hal_command_buffers,
                     &submit_surface_textures,
-                    (fence, submit_index),
+                    (fence.as_mut(), submit_index),
                 )
             }
             .map_err(|e| self.device.handle_hal_error(e))?;
 
+            // Submissions must have strictly increasing indices, so we must hold the
+            // command index guard until we have submitted, to prevent another submission
+            // from claiming the next index and reaching `submit` before we do.
+            drop(pending_writes);
+            drop(command_index_guard);
+
             // Advance the successful submission index.
             self.device
                 .last_successful_submission_index
                 .fetch_max(submit_index, Ordering::SeqCst);
         }
 
-        drop(pending_writes);
-
         // this will register the new submission to the life time tracker
-        self.lock_life()
-            .track_submission(submit_index, active_executions);
+        self.lock_life().track_submission(submit_index, executions);
 
-        Ok(())
+        Ok(SubmissionResult {
+            fence: RwLockWriteGuard::downgrade(fence),
+            snatch_guard,
+        })
     }
 
     pub fn get_timestamp_period(&self) -> f32 {
@@ -1596,7 +1707,12 @@ impl Queue {
         closure: SubmittedWorkDoneClosure,
     ) -> Option<SubmissionIndex> {
         api_log!("Queue::on_submitted_work_done");
-        //TODO: flush pending writes
+
+        // A `DeviceError` means we're losing the device anyways, so we can ignore it here
+        // (mostly to avoid a breaking change to the `on_submitted_work_done` signature
+        // for an error case that it is unlikely the caller will be able to handle).
+        let _: Result<_, DeviceError> = self.flush_pending_writes();
+
         self.lock_life().add_work_done_closure(closure)
     }
 
@@ -1834,7 +1950,6 @@ impl Global {
     ) -> SubmissionIndex {
         api_log!("Queue::on_submitted_work_done {queue_id:?}");
 
-        //TODO: flush pending writes
         let queue = self.hub.queues.get(queue_id);
         let result = queue.on_submitted_work_done(closure);
         result.unwrap_or(0) // '0' means no wait is necessary
@@ -1895,7 +2010,7 @@ fn validate_command_buffer(
     queue: &Queue,
     cmd_buf_data: &crate::command::CommandBufferMutable,
     snatch_guard: &SnatchGuard,
-    submit_surface_textures_owned: &mut FastHashMap<*const Texture, Arc<Texture>>,
+    surface_textures: &mut FastHashMap<*const Texture, Arc<Texture>>,
     used_surface_textures: &mut track::TextureUsageScope,
     command_index_guard: &mut RwLockWriteGuard<CommandIndices>,
 ) -> Result<(), QueueSubmitError> {
@@ -1922,7 +2037,7 @@ fn validate_command_buffer(
                     TextureInner::Native { .. } => false,
                     TextureInner::Surface { .. } => {
                         // Compare the Arcs by pointer as Textures don't implement Eq.
-                        submit_surface_textures_owned.insert(Arc::as_ptr(texture), texture.clone());
+                        surface_textures.insert(Arc::as_ptr(texture), texture.clone());
 
                         true
                     }
diff --git a/wgpu-core/src/device/resource.rs b/wgpu-core/src/device/resource.rs
index b8809a41e7b..67f03bf9135 100644
--- a/wgpu-core/src/device/resource.rs
+++ b/wgpu-core/src/device/resource.rs
@@ -41,7 +41,7 @@ use crate::{
         TextureInitTrackerAction,
     },
     instance::{Adapter, RequestDeviceError},
-    lock::{rank, Mutex, RwLock},
+    lock::{rank, Mutex, RwLock, RwLockReadGuard, RwLockWriteGuard},
     pipeline::{self, ColorStateError},
     pool::ResourcePool,
     present,
@@ -288,6 +288,9 @@ pub struct Device {
     pub(crate) trace: Mutex<Option<Box<dyn trace::Trace + Send + Sync + 'static>>>,
 }
 
+pub(crate) type FenceReadGuard<'a> = RwLockReadGuard<'a, ManuallyDrop<Box<dyn hal::DynFence>>>;
+pub(crate) type FenceWriteGuard<'a> = RwLockWriteGuard<'a, ManuallyDrop<Box<dyn hal::DynFence>>>;
+
 pub(crate) enum DeferredDestroy {
     TextureViews(WeakVec<TextureView>),
     BindGroups(WeakVec<BindGroup>),
@@ -826,7 +829,7 @@ impl Device {
     ///   if there was a timeout or a validation error.
     pub(crate) fn maintain<'this>(
         &'this self,
-        fence: crate::lock::RwLockReadGuard<ManuallyDrop<Box<dyn hal::DynFence>>>,
+        fence: FenceReadGuard<'_>,
         poll_type: wgt::PollType<crate::SubmissionIndex>,
         snatch_guard: SnatchGuard,
     ) -> (UserClosures, Result<wgt::PollStatus, WaitIdleError>) {
diff --git a/wgpu-core/src/resource.rs b/wgpu-core/src/resource.rs
index 682c59b54a8..aa672dc2661 100644
--- a/wgpu-core/src/resource.rs
+++ b/wgpu-core/src/resource.rs
@@ -629,9 +629,8 @@ impl Buffer {
     ///   with the error, and the caller is responsible for calling the
     ///   callback.
     ///
-    /// A return value of `Ok(0)` roughly means that no wait is necessary,
-    /// but it does not necessarily mean that the buffer has already been
-    /// mapped.
+    /// A return value of `Ok(0)` means that mapping does not need to wait on the queue, but
+    /// it does not mean that the buffer has already been mapped.
     fn try_map_async(
         self: &Arc<Self>,
         offset: wgt::BufferAddress,
@@ -693,50 +692,79 @@ impl Buffer {
             return Err((op, e.into()));
         }
 
-        {
+        let submit_index = {
             let snatch_guard = device.snatchable_lock.read();
             if let Err(e) = self.check_destroyed(&snatch_guard) {
                 return Err((op, e.into()));
             }
-        }
 
-        {
-            let map_state = &mut *self.map_state.lock();
-            *map_state = match *map_state {
-                BufferMapState::Init { .. } | BufferMapState::Active { .. } => {
-                    return Err((op, BufferAccessError::AlreadyMapped));
-                }
-                BufferMapState::Waiting(_) => {
-                    return Err((op, BufferAccessError::MapAlreadyPending));
+            {
+                let map_state = &mut *self.map_state.lock();
+                *map_state = match *map_state {
+                    BufferMapState::Init { .. } | BufferMapState::Active { .. } => {
+                        return Err((op, BufferAccessError::AlreadyMapped));
+                    }
+                    BufferMapState::Waiting(_) => {
+                        return Err((op, BufferAccessError::MapAlreadyPending));
+                    }
+                    BufferMapState::Idle => BufferMapState::Waiting(BufferPendingMapping {
+                        range: offset..end_offset,
+                        op,
+                        _parent_buffer: self.clone(),
+                    }),
+                };
+            }
+
+            if let Some(queue) = device.get_queue().as_ref() {
+                match queue.flush_writes_for_buffer(self, snatch_guard) {
+                    Err(err) => {
+                        let state = mem::replace(&mut *self.map_state.lock(), BufferMapState::Idle);
+                        let BufferMapState::Waiting(BufferPendingMapping { op, .. }) = state else {
+                            unreachable!();
+                        };
+                        return Err((op, err));
+                    }
+                    Ok(()) => {
+                        // Schedule the buffer map in the  lifetime tracker.
+                        //
+                        // This call searches for use of the buffer by pending submissions.
+                        // If we just flushed pending writes, that search is redundant; we
+                        // already know that mapping needs to wait for the latest submission
+                        // and could implement a special case to directly attach it to that
+                        // submission. However, the queue is searched in reverse, so finding
+                        // that the buffer is used by the latest submission will be fast.
+                        Some(queue.lock_life().map(self).unwrap_or(0))
+                    }
                 }
-                BufferMapState::Idle => BufferMapState::Waiting(BufferPendingMapping {
-                    range: offset..end_offset,
-                    op,
-                    _parent_buffer: self.clone(),
-                }),
-            };
-        }
+            } else {
+                None
+            }
+        };
 
-        // TODO: we are ignoring the transition here, I think we need to add a barrier
-        // at the end of the submission
+        // At this point, `submit_index` is:
+        // - `Some(index)`, if there is a submission the mapping operation must wait for.
+        // - `Some(0)`, if we have a queue and there is no submission to wait for.
+        // - `None`, if we don't have a queue.
+        //
+        // TODO(https://github.com/gfx-rs/wgpu/issues/9306): we are ignoring the transition
+        // here, I think we need to add a barrier at the end of the submission
         device
             .trackers
             .lock()
             .buffers
             .set_single(self, internal_use);
 
-        let submit_index = if let Some(queue) = device.get_queue() {
-            queue.lock_life().map(self).unwrap_or(0) // '0' means no wait is necessary
+        if let Some(index) = submit_index {
+            Ok(index)
         } else {
+            // We don't have a queue, so go ahead and map the buffer.
             // We can safely unwrap below since we just set the `map_state` to `BufferMapState::Waiting`.
             let (mut operation, status) = self.map(&device.snatchable_lock.read()).unwrap();
             if let Some(callback) = operation.callback.take() {
                 callback(status);
             }
-            0
-        };
-
-        Ok(submit_index)
+            Ok(0)
+        }
     }
 
     pub fn get_mapped_range(
@@ -820,6 +848,7 @@ impl Buffer {
         }
     }
     /// This function returns [`None`] only if [`Self::map_state`] is not [`BufferMapState::Waiting`].
+    /// Other errors are returned within `BufferMapPendingClosure`.
     #[must_use]
     pub(crate) fn map(&self, snatch_guard: &SnatchGuard) -> Option<BufferMapPendingClosure> {
         // This _cannot_ be inlined into the match. If it is, the lock will be held