diff --git a/CHANGELOG.md b/CHANGELOG.md
index 27e60624b32..76ae2c424ae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -78,6 +78,7 @@ Bottom level categories:
 
 #### General
 
+- `SurfaceTexture::present()` has been replaced by `Queue::present(surface_texture)`. By @inner-daemons and @atlv24 in [#9361](https://github.com/gfx-rs/wgpu/pull/9361).
 - `Features::CLIP_DISTANCE`, `naga::Capabilities::CLIP_DISTANCE`, and `naga::BuiltIn::ClipDistance` have been renamed to `CLIP_DISTANCES` and `ClipDistances` (viz., pluralized) as appropriate, to match the WebGPU spec. By @ErichDonGubler in [#9267](https://github.com/gfx-rs/wgpu/pull/9267).
 - Added more granular limits for mesh shaders. By @inner-daemons in [#8739](https://github.com/gfx-rs/wgpu/pull/8739).
 - Added new `InvalidWorkgroupSizeError`, which is now used by `DrawError::InvalidGroupSize` and `StageError::InvalidWorkgroupSize`. By @andyleiserson in [#9357](https://github.com/gfx-rs/wgpu/pull/9357).
@@ -99,6 +100,7 @@ Bottom level categories:
 
 #### General
 
+- Fix `SYNC-HAZARD-WRITE-AFTER-PRESENT` on Vulkan when a surface texture is presented without being rendered to. By @inner-daemons and @atlv24 in [#9361](https://github.com/gfx-rs/wgpu/pull/9361).
 - Fix incorrect checks for dynamic binding bounds when calling an encoder's `set_bind_group` in passes and bundles. By @ErichDonGubler in [#9308](https://github.com/gfx-rs/wgpu/pull/9308).
 
 #### naga
diff --git a/Cargo.lock b/Cargo.lock
index c91689ee892..c3d03ce5ee6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4693,6 +4693,16 @@ dependencies = [
  "winit",
 ]
 
+[[package]]
+name = "wgpu-bug-repro-02-present-bugs"
+version = "0.0.0"
+dependencies = [
+ "env_logger",
+ "pollster",
+ "wgpu",
+ "winit",
+]
+
 [[package]]
 name = "wgpu-core"
 version = "29.0.0"
diff --git a/examples/bug-repro/01_texture_atomic_bug/src/main.rs b/examples/bug-repro/01_texture_atomic_bug/src/main.rs
index 3d93df4eea3..1f9263b4d0c 100644
--- a/examples/bug-repro/01_texture_atomic_bug/src/main.rs
+++ b/examples/bug-repro/01_texture_atomic_bug/src/main.rs
@@ -340,6 +340,6 @@ impl State {
         }
 
         self.queue.submit([enc.finish()]);
-        frame.present();
+        self.queue.present(frame);
     }
 }
diff --git a/examples/bug-repro/02_present_bugs/Cargo.toml b/examples/bug-repro/02_present_bugs/Cargo.toml
new file mode 100644
index 00000000000..72c9c50259c
--- /dev/null
+++ b/examples/bug-repro/02_present_bugs/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "wgpu-bug-repro-02-present-bugs"
+edition = "2021"
+rust-version = "1.87"
+publish = false
+
+[dependencies]
+env_logger = "0.11"
+pollster = "0.4"
+wgpu.workspace = true
+winit = "0.30.8"
diff --git a/examples/bug-repro/02_present_bugs/src/main.rs b/examples/bug-repro/02_present_bugs/src/main.rs
new file mode 100644
index 00000000000..191ce86ce5d
--- /dev/null
+++ b/examples/bug-repro/02_present_bugs/src/main.rs
@@ -0,0 +1,145 @@
+//! Repro for various issues with queue presentation
+//!
+//! The 2 current bugs being tested are presentation after no usage of surface texture
+//! and queue destruction immediately after present
+
+use std::sync::Arc;
+
+use winit::application::ApplicationHandler;
+use winit::event::WindowEvent;
+use winit::event_loop::{ActiveEventLoop, EventLoop};
+use winit::window::{Window, WindowId};
+
+fn main() {
+    env_logger::init();
+    let event_loop = EventLoop::new().unwrap();
+    event_loop.set_control_flow(winit::event_loop::ControlFlow::Poll);
+    let mut app = App::default();
+    event_loop.run_app(&mut app).unwrap();
+}
+
+#[derive(Default)]
+struct App {
+    state: Option<State>,
+}
+
+struct State {
+    window: Arc<Window>,
+    instance: wgpu::Instance,
+    device: wgpu::Device,
+    queue: Option<wgpu::Queue>,
+    surface: wgpu::Surface<'static>,
+    surface_config: wgpu::SurfaceConfiguration,
+}
+
+impl ApplicationHandler for App {
+    fn resumed(&mut self, event_loop: &ActiveEventLoop) {
+        if self.state.is_some() {
+            return;
+        }
+        let window = Arc::new(
+            event_loop
+                .create_window(Window::default_attributes().with_title("Presentation bugs"))
+                .unwrap(),
+        );
+        self.state = Some(State::new(window));
+    }
+
+    fn window_event(
+        &mut self,
+        event_loop: &ActiveEventLoop,
+        _window_id: WindowId,
+        event: WindowEvent,
+    ) {
+        let Some(state) = &mut self.state else { return };
+        match event {
+            WindowEvent::CloseRequested => event_loop.exit(),
+            WindowEvent::Resized(size) if size.width > 0 && size.height > 0 => {
+                state.surface_config.width = size.width;
+                state.surface_config.height = size.height;
+                state
+                    .surface
+                    .configure(&state.device, &state.surface_config);
+            }
+            WindowEvent::RedrawRequested => {
+                let frame = match state.surface.get_current_texture() {
+                    wgpu::CurrentSurfaceTexture::Success(f) => f,
+                    wgpu::CurrentSurfaceTexture::Suboptimal(_)
+                    | wgpu::CurrentSurfaceTexture::Outdated => {
+                        state
+                            .surface
+                            .configure(&state.device, &state.surface_config);
+                        return;
+                    }
+                    wgpu::CurrentSurfaceTexture::Lost => {
+                        state.surface =
+                            state.instance.create_surface(state.window.clone()).unwrap();
+                        state
+                            .surface
+                            .configure(&state.device, &state.surface_config);
+                        return;
+                    }
+                    _ => return,
+                };
+                let Some(queue) = state.queue.take() else {
+                    return;
+                };
+                // Immediately present the surface texture (with nothing on it) and then drop the queue, which should cause a full wait.
+                queue.present(frame);
+                event_loop.exit();
+            }
+            _ => {}
+        }
+    }
+
+    fn about_to_wait(&mut self, _event_loop: &ActiveEventLoop) {
+        if let Some(state) = &self.state {
+            state.window.request_redraw();
+        }
+    }
+}
+
+impl State {
+    fn new(window: Arc<Window>) -> Self {
+        let size = window.inner_size();
+        let width = size.width.max(1);
+        let height = size.height.max(1);
+
+        let mut instance_desc = wgpu::InstanceDescriptor::new_without_display_handle_from_env();
+        instance_desc.flags |= wgpu::InstanceFlags::advanced_debugging();
+        let instance = wgpu::Instance::new(instance_desc);
+        let surface = instance.create_surface(window.clone()).unwrap();
+        let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
+            compatible_surface: Some(&surface),
+            ..Default::default()
+        }))
+        .expect("No adapter");
+
+        println!("Adapter: {:?}", adapter.get_info().name);
+
+        let (device, queue) =
+            pollster::block_on(adapter.request_device(&Default::default())).unwrap();
+
+        let surface_format = surface.get_capabilities(&adapter).formats[0];
+        let surface_config = wgpu::SurfaceConfiguration {
+            usage: wgpu::TextureUsages::RENDER_ATTACHMENT,
+            format: surface_format,
+            width,
+            height,
+            present_mode: wgpu::PresentMode::AutoVsync,
+            alpha_mode: wgpu::CompositeAlphaMode::Auto,
+            view_formats: vec![],
+            desired_maximum_frame_latency: 2,
+        };
+        surface.configure(&device, &surface_config);
+
+        State {
+            window,
+            instance,
+            device,
+            queue: Some(queue),
+            surface,
+            surface_config,
+        }
+    }
+}
diff --git a/examples/features/src/framework.rs b/examples/features/src/framework.rs
index a3c662e7516..27b9a0118b4 100644
--- a/examples/features/src/framework.rs
+++ b/examples/features/src/framework.rs
@@ -557,7 +557,7 @@ impl<E: Example> ApplicationHandler<AppAction> for App<E> {
                     if let Some(window) = &self.window {
                         window.pre_present_notify();
                     }
-                    frame.present();
+                    context.queue.present(frame);
                 }
 
                 if let Some(window) = &self.window {
diff --git a/examples/features/src/hello_triangle/mod.rs b/examples/features/src/hello_triangle/mod.rs
index 71cf19e6aeb..6d6c5bb85d8 100644
--- a/examples/features/src/hello_triangle/mod.rs
+++ b/examples/features/src/hello_triangle/mod.rs
@@ -300,7 +300,7 @@ impl ApplicationHandler<TriangleAction> for App {
                 if let Some(window) = &self.window {
                     window.pre_present_notify();
                 }
-                frame.present();
+                wgpu_state.queue.present(frame);
             }
             WindowEvent::Occluded(is_occluded) => {
                 if !is_occluded {
diff --git a/examples/features/src/hello_windows/mod.rs b/examples/features/src/hello_windows/mod.rs
index 825b3e3dd90..9d8c5f4cf6b 100644
--- a/examples/features/src/hello_windows/mod.rs
+++ b/examples/features/src/hello_windows/mod.rs
@@ -255,7 +255,7 @@ impl ApplicationHandler for App {
 
                     queue.submit(Some(encoder.finish()));
                     viewport.desc.window.pre_present_notify();
-                    frame.present();
+                    queue.present(frame);
                 }
             }
             WindowEvent::Occluded(is_occluded) => {
diff --git a/examples/features/src/uniform_values/mod.rs b/examples/features/src/uniform_values/mod.rs
index 672468deb1f..d9d5def4d0f 100644
--- a/examples/features/src/uniform_values/mod.rs
+++ b/examples/features/src/uniform_values/mod.rs
@@ -481,7 +481,7 @@ impl ApplicationHandler<UniformAction> for App {
                 if let Some(window) = &self.window {
                     window.pre_present_notify();
                 }
-                frame.present();
+                wgpu_ctx.queue.present(frame);
             }
             WindowEvent::Occluded(is_occluded) => {
                 if !is_occluded {
diff --git a/examples/standalone/02_hello_window/src/main.rs b/examples/standalone/02_hello_window/src/main.rs
index 1b5f6db43ca..a0326a4314b 100644
--- a/examples/standalone/02_hello_window/src/main.rs
+++ b/examples/standalone/02_hello_window/src/main.rs
@@ -141,7 +141,7 @@ impl State {
         // Submit the command in the queue to execute
         self.queue.submit([encoder.finish()]);
         self.window.pre_present_notify();
-        surface_texture.present();
+        self.queue.present(surface_texture);
     }
 }
 
diff --git a/examples/standalone/custom_backend/src/custom.rs b/examples/standalone/custom_backend/src/custom.rs
index 676158e83fb..a643ee8bd4b 100644
--- a/examples/standalone/custom_backend/src/custom.rs
+++ b/examples/standalone/custom_backend/src/custom.rs
@@ -380,6 +380,10 @@ impl QueueInterface for CustomQueue {
     fn compact_blas(&self, _blas: &DispatchBlas) -> (Option<u64>, DispatchBlas) {
         unimplemented!()
     }
+
+    fn present(&self, _detail: &wgpu::custom::DispatchSurfaceOutputDetail) {
+        unimplemented!()
+    }
 }
 
 #[derive(Debug)]
diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs
index 9aed35e9704..a66ff71bea7 100644
--- a/wgpu-core/src/device/queue.rs
+++ b/wgpu-core/src/device/queue.rs
@@ -117,6 +117,119 @@ impl Queue {
         self.life_tracker.lock()
     }
 
+    /// Ensure the surface texture is in the PRESENT state, clearing it if it was never rendered to.
+    /// Submits any necessary work to the GPU before the HAL present call.
+    ///
+    /// See <https://github.com/gfx-rs/wgpu/issues/6748>
+    pub(crate) fn prepare_surface_texture_for_present(
+        &self,
+        texture: &Arc<Texture>,
+    ) -> Result<(), DeviceError> {
+        let device = &self.device;
+        let snatch_guard = device.snatchable_lock.read();
+
+        // If the texture is uninitialized it needs to be cleared before presenting
+        let needs_clear = {
+            let status = texture.initialization_status.read();
+            status
+                .mips
+                .first()
+                .is_some_and(|mip| mip.check(0..1).is_some())
+        };
+
+        // Fence lock must be acquired after the snatch lock and before
+        // pending_writes to match the lock ordering in Queue::submit.
+        let mut fence = device.fence.write();
+        let mut pending_writes = self.pending_writes.lock();
+
+        if needs_clear {
+            let encoder = pending_writes.activate();
+            let mut trackers = device.trackers.lock();
+            crate::command::clear_texture(
+                texture,
+                TextureInitRange {
+                    mip_range: 0..1,
+                    layer_range: 0..1,
+                },
+                encoder,
+                &mut trackers.textures,
+                &device.alignments,
+                device.zero_buffer.as_ref(),
+                &snatch_guard,
+                device.instance_flags,
+            )
+            .map_err(|e| match e {
+                ClearError::Device(e) => e,
+                _ => DeviceError::Lost,
+            })?;
+            texture.initialization_status.write().mips[0].drain(0..1);
+        }
+
+        // Transition the texture to PRESENT in the device tracker.
+        // If it's already in PRESENT, this produces no barriers and we can skip the submission.
+        //
+        // This has to be after any clear_texture call because clear_texture modifies the tracker state internally.
+        // Computing transitions afterward ensures they reflect the actual current state.
+        let pending = {
+            let mut trackers = device.trackers.lock();
+            let pending: Vec<track::PendingTransition<wgt::TextureUses>> = trackers
+                .textures
+                .set_single(
+                    texture,
+                    texture.full_range.clone(),
+                    wgt::TextureUses::PRESENT,
+                )
+                .collect();
+            pending
+        };
+
+        if pending.is_empty() {
+            return Ok(());
+        }
+
+        // Emit the transition barriers to PRESENT.
+        {
+            let raw_texture = texture
+                .try_raw(&snatch_guard)
+                .map_err(|_| DeviceError::Lost)?;
+            let barriers: Vec<hal::TextureBarrier<'_, dyn hal::DynTexture>> = pending
+                .into_iter()
+                .map(|pt| pt.into_hal(raw_texture))
+                .collect();
+
+            let encoder = pending_writes.activate();
+            // SAFETY:
+            // - The encoder is in the recording state after `activate()`
+            // - The texture is kept alive by the Arc from `acquired_texture`
+            unsafe {
+                encoder.transition_textures(&barriers);
+            }
+        }
+
+        // Keep the texture alive in the submission so its clear_view isn't
+        // destroyed before the GPU finishes the submitted commands.
+        pending_writes.insert_texture(texture);
+
+        // Flush pending writes through the standard submission path.
+        let mut surface_textures = FastHashMap::default();
+        surface_textures.insert(Arc::as_ptr(texture), texture.clone());
+
+        let submit_index = {
+            let mut indices = device.command_indices.write();
+            indices.active_submission_index += 1;
+            indices.active_submission_index
+        };
+
+        self.submit_with_pending_writes(
+            &mut pending_writes,
+            Vec::new(),
+            surface_textures,
+            fence.as_mut(),
+            submit_index,
+            &snatch_guard,
+        )
+    }
+
     pub(crate) fn maintain(
         &self,
         submission_index: u64,
@@ -158,81 +271,23 @@ impl Drop for Queue {
     fn drop(&mut self) {
         resource_log!("Drop {}", self.error_ident());
 
+        // On Vulkan, pending presents are not tracked by fences.
+        // wait_for_idle covers both fence-tracked submissions and pending presents.
+        match unsafe { self.raw.wait_for_idle() } {
+            Ok(()) => {}
+            Err(hal::DeviceError::Lost) => {
+                self.device.handle_hal_error(hal::DeviceError::Lost);
+            }
+            Err(e) => {
+                panic!("Unexpected error while waiting for queue idle on drop: {e:?}");
+            }
+        }
+
         let last_successful_submission_index = self
             .device
             .last_successful_submission_index
             .load(Ordering::Acquire);
 
-        let fence = self.device.fence.read();
-
-        // Try waiting on the last submission using the following sequence of timeouts
-        let timeouts_in_ms = [100, 200, 400, 800, 1600, 3200];
-
-        for (i, timeout_ms) in timeouts_in_ms.into_iter().enumerate() {
-            let is_last_iter = i == timeouts_in_ms.len() - 1;
-
-            api_log!(
-                "Waiting on last submission. try: {}/{}. timeout: {}ms",
-                i + 1,
-                timeouts_in_ms.len(),
-                timeout_ms
-            );
-
-            let wait_res = unsafe {
-                self.device.raw().wait(
-                    fence.as_ref(),
-                    last_successful_submission_index,
-                    #[cfg(not(target_arch = "wasm32"))]
-                    Some(core::time::Duration::from_millis(timeout_ms)),
-                    #[cfg(target_arch = "wasm32")]
-                    Some(core::time::Duration::ZERO), // WebKit and Chromium don't support a non-0 timeout
-                )
-            };
-            // Note: If we don't panic below we are in UB land (destroying resources while they are still in use by the GPU).
-            match wait_res {
-                Ok(true) => break,
-                Ok(false) => {
-                    // It's fine that we timed out on WebGL; GL objects can be deleted early as they
-                    // will be kept around by the driver if GPU work hasn't finished.
-                    // Moreover, the way we emulate read mappings on WebGL allows us to execute map_buffer earlier than on other
-                    // backends since getBufferSubData is synchronous with respect to the other previously enqueued GL commands.
-                    // Relying on this behavior breaks the clean abstraction wgpu-hal tries to maintain and
-                    // we should find ways to improve this. See https://github.com/gfx-rs/wgpu/issues/6538.
-                    #[cfg(target_arch = "wasm32")]
-                    {
-                        break;
-                    }
-                    #[cfg(not(target_arch = "wasm32"))]
-                    {
-                        if is_last_iter {
-                            panic!(
-                                "We timed out while waiting on the last successful submission to complete!"
-                            );
-                        }
-                    }
-                }
-                Err(e) => match e {
-                    hal::DeviceError::OutOfMemory => {
-                        if is_last_iter {
-                            panic!(
-                                "We ran into an OOM error while waiting on the last successful submission to complete!"
-                            );
-                        }
-                    }
-                    hal::DeviceError::Lost => {
-                        self.device.handle_hal_error(e); // will lose the device
-                        break;
-                    }
-                    hal::DeviceError::Unexpected => {
-                        panic!(
-                            "We ran into an unexpected error while waiting on the last successful submission to complete!"
-                        );
-                    }
-                },
-            }
-        }
-        drop(fence);
-
         let snatch_guard = self.device.snatchable_lock.read();
         let (submission_closures, mapping_closures, blas_compact_ready_closures, queue_empty) =
             self.maintain(last_successful_submission_index, &snatch_guard);
@@ -1383,101 +1438,23 @@ impl Queue {
 
             let mut pending_writes = self.pending_writes.lock();
 
-            {
-                used_surface_textures.set_size(self.device.tracker_indices.textures.size());
-                for texture in pending_writes.dst_textures.values() {
-                    match texture.try_inner(&snatch_guard) {
-                        Ok(TextureInner::Native { .. }) => {}
-                        Ok(TextureInner::Surface { .. }) => {
-                            // Compare the Arcs by pointer as Textures don't implement Eq
-                            submit_surface_textures_owned
-                                .insert(Arc::as_ptr(texture), texture.clone());
-
-                            unsafe {
-                                used_surface_textures
-                                    .merge_single(texture, None, wgt::TextureUses::PRESENT)
-                                    .unwrap()
-                            };
-                        }
-                        // The texture must not have been destroyed when its usage here was
-                        // encoded. If it was destroyed after that, then it was transferred
-                        // to `pending_writes.temp_resources` at the time of destruction, so
-                        // we are still okay to use it.
-                        Err(DestroyedResourceError(_)) => {}
-                    }
-                }
-
-                if !used_surface_textures.is_empty() {
-                    let mut trackers = self.device.trackers.lock();
-
-                    let texture_barriers = trackers
-                        .textures
-                        .set_from_usage_scope_and_drain_transitions(
-                            &used_surface_textures,
-                            &snatch_guard,
-                        )
-                        .collect::<Vec<_>>();
-                    unsafe {
-                        pending_writes
-                            .command_encoder
-                            .transition_textures(&texture_barriers);
-                    };
-                }
-            }
-
-            match pending_writes.pre_submit(&self.device.command_allocator, &self.device, self) {
-                Ok(Some(pending_execution)) => {
-                    active_executions.insert(0, pending_execution);
-                }
-                Ok(None) => {}
-                Err(e) => break 'error Err(e.into()),
+            if let Err(e) = self.submit_with_pending_writes(
+                &mut pending_writes,
+                active_executions,
+                submit_surface_textures_owned,
+                fence.as_mut(),
+                submit_index,
+                &snatch_guard,
+            ) {
+                break 'error Err(e.into());
             }
-            let hal_command_buffers = active_executions
-                .iter()
-                .flat_map(|e| e.inner.list.iter().map(|b| b.as_ref()))
-                .collect::<Vec<_>>();
 
-            {
-                let mut submit_surface_textures =
-                    SmallVec::<[&dyn hal::DynSurfaceTexture; 2]>::with_capacity(
-                        submit_surface_textures_owned.len(),
-                    );
-
-                for texture in submit_surface_textures_owned.values() {
-                    let raw = match texture.inner.get(&snatch_guard) {
-                        Some(TextureInner::Surface { raw, .. }) => raw.as_ref(),
-                        _ => unreachable!(),
-                    };
-                    submit_surface_textures.push(raw);
-                }
+            drop(command_index_guard);
 
-                if let Err(e) = unsafe {
-                    self.raw().submit(
-                        &hal_command_buffers,
-                        &submit_surface_textures,
-                        (fence.as_mut(), submit_index),
-                    )
-                }
-                .map_err(|e| self.device.handle_hal_error(e))
-                {
-                    break 'error Err(e.into());
-                }
-
-                drop(command_index_guard);
-
-                // Advance the successful submission index.
-                self.device
-                    .last_successful_submission_index
-                    .fetch_max(submit_index, Ordering::SeqCst);
-            }
+            drop(pending_writes);
 
             profiling::scope!("cleanup");
 
-            // this will register the new submission to the life time tracker
-            self.lock_life()
-                .track_submission(submit_index, active_executions);
-            drop(pending_writes);
-
             // This will schedule destruction of all resources that are no longer needed
             // by the user but used in the command stream, among other things.
             let fence_guard = RwLockWriteGuard::downgrade(fence);
@@ -1516,6 +1493,99 @@ impl Queue {
         Ok(submit_index)
     }
 
+    /// Flush pending writes and any additional command encoders as a HAL submission.
+    ///
+    /// Advances `last_successful_submission_index` and registers the submission with the lifetime tracker.
+    fn submit_with_pending_writes(
+        &self,
+        pending_writes: &mut PendingWrites,
+        mut active_executions: Vec<EncoderInFlight>,
+        mut surface_textures: FastHashMap<*const Texture, Arc<Texture>>,
+        fence: &mut dyn hal::DynFence,
+        submit_index: SubmissionIndex,
+        snatch_guard: &SnatchGuard,
+    ) -> Result<(), DeviceError> {
+        let mut used_surface_textures = track::TextureUsageScope::default();
+        used_surface_textures.set_size(self.device.tracker_indices.textures.size());
+        for texture in pending_writes.dst_textures.values() {
+            match texture.try_inner(snatch_guard) {
+                Ok(TextureInner::Native { .. }) => {}
+                Ok(TextureInner::Surface { .. }) => {
+                    // Compare the Arcs by pointer as Textures don't implement Eq
+                    surface_textures.insert(Arc::as_ptr(texture), texture.clone());
+
+                    unsafe {
+                        used_surface_textures
+                            .merge_single(texture, None, wgt::TextureUses::PRESENT)
+                            .unwrap()
+                    };
+                }
+                // The texture must not have been destroyed when its usage here was
+                // encoded. If it was destroyed after that, then it was transferred
+                // to `pending_writes.temp_resources` at the time of destruction, so
+                // we are still okay to use it.
+                Err(DestroyedResourceError(_)) => {}
+            }
+        }
+
+        if !used_surface_textures.is_empty() {
+            let mut trackers = self.device.trackers.lock();
+
+            let texture_barriers = trackers
+                .textures
+                .set_from_usage_scope_and_drain_transitions(&used_surface_textures, snatch_guard)
+                .collect::<Vec<_>>();
+            unsafe {
+                pending_writes
+                    .command_encoder
+                    .transition_textures(&texture_barriers);
+            };
+        }
+
+        match pending_writes.pre_submit(&self.device.command_allocator, &self.device, self) {
+            Ok(Some(pending_execution)) => {
+                active_executions.insert(0, pending_execution);
+            }
+            Ok(None) => {}
+            Err(e) => return Err(e),
+        }
+        let hal_command_buffers = active_executions
+            .iter()
+            .flat_map(|e| e.inner.list.iter().map(|b| b.as_ref()))
+            .collect::<Vec<_>>();
+
+        {
+            let mut submit_surface_textures =
+                SmallVec::<[&dyn hal::DynSurfaceTexture; 2]>::with_capacity(surface_textures.len());
+            for texture in surface_textures.values() {
+                let raw = match texture.inner.get(snatch_guard) {
+                    Some(TextureInner::Surface { raw, .. }) => raw.as_ref(),
+                    _ => unreachable!(),
+                };
+                submit_surface_textures.push(raw);
+            }
+
+            unsafe {
+                self.raw().submit(
+                    &hal_command_buffers,
+                    &submit_surface_textures,
+                    (fence, submit_index),
+                )
+            }
+            .map_err(|e| self.device.handle_hal_error(e))?;
+
+            // Advance the successful submission index.
+            self.device
+                .last_successful_submission_index
+                .fetch_max(submit_index, Ordering::SeqCst);
+        }
+        // this will register the new submission to the life time tracker
+        self.lock_life()
+            .track_submission(submit_index, active_executions);
+
+        Ok(())
+    }
+
     pub fn get_timestamp_period(&self) -> f32 {
         unsafe { self.raw().get_timestamp_period() }
     }
diff --git a/wgpu-core/src/present.rs b/wgpu-core/src/present.rs
index 86e7f453f18..e010e687fad 100644
--- a/wgpu-core/src/present.rs
+++ b/wgpu-core/src/present.rs
@@ -9,18 +9,18 @@ When this texture is presented, we remove it from the device tracker as well as
 extract it from the hub.
 !*/
 
-use alloc::{sync::Arc, vec::Vec};
+use alloc::{boxed::Box, sync::Arc, vec::Vec};
 use core::mem::ManuallyDrop;
 
 #[cfg(feature = "trace")]
 use crate::device::trace::{Action, IntoTrace};
 use crate::{
     conv,
-    device::{Device, DeviceError, MissingDownlevelFlags, WaitIdleError},
+    device::{queue::Queue, Device, DeviceError, MissingDownlevelFlags, WaitIdleError},
     global::Global,
     hal_label, id,
     instance::Surface,
-    resource,
+    resource::{self, Labeled},
 };
 
 use thiserror::Error;
@@ -49,6 +49,8 @@ pub enum SurfaceError {
     Device(#[from] DeviceError),
     #[error("Surface image is already acquired")]
     AlreadyAcquired,
+    #[error("No surface image is currently acquired to present")]
+    NothingToPresent,
     #[error("Texture has been destroyed")]
     TextureDestroyed,
 }
@@ -60,6 +62,7 @@ impl WebGpuError for SurfaceError {
             Self::Invalid
             | Self::NotConfigured
             | Self::AlreadyAcquired
+            | Self::NothingToPresent
             | Self::TextureDestroyed => ErrorType::Validation,
         }
     }
@@ -276,21 +279,60 @@ impl Surface {
     pub fn present(&self) -> Result<Status, SurfaceError> {
         profiling::scope!("Surface::present");
 
-        let mut presentation = self.presentation.lock();
-        let present = match presentation.as_mut() {
+        let presentation = self.presentation.lock();
+        let present = match presentation.as_ref() {
             Some(present) => present,
             None => return Err(SurfaceError::NotConfigured),
         };
 
-        let device = &present.device;
+        present.device.check_is_valid()?;
+        let queue = present
+            .device
+            .get_queue()
+            .ok_or(SurfaceError::Device(DeviceError::Lost))?;
+        drop(presentation);
 
-        device.check_is_valid()?;
-        let queue = device.get_queue().unwrap();
+        queue.present(self)
+    }
+}
 
-        let texture = present
-            .acquired_texture
-            .take()
-            .ok_or(SurfaceError::AlreadyAcquired)?;
+impl Queue {
+    pub fn present(&self, surface: &Surface) -> Result<Status, SurfaceError> {
+        profiling::scope!("Queue::present");
+
+        let texture = {
+            let mut presentation = surface.presentation.lock();
+            let present = match presentation.as_mut() {
+                Some(present) => present,
+                None => return Err(SurfaceError::NotConfigured),
+            };
+
+            let device = &self.device;
+
+            // Check the surface is configured for this device.
+            if !Arc::ptr_eq(&present.device, device) {
+                return Err(SurfaceError::Device(DeviceError::DeviceMismatch(Box::new(
+                    crate::device::DeviceMismatch {
+                        res: self.error_ident(),
+                        res_device: device.error_ident(),
+                        target: None,
+                        target_device: present.device.error_ident(),
+                    },
+                ))));
+            }
+
+            present
+                .acquired_texture
+                .take()
+                .ok_or(SurfaceError::NothingToPresent)?
+        };
+
+        // If the texture was never rendered to, clear it and transition to
+        // PRESENT state before presenting.
+        // Fixes <https://github.com/gfx-rs/wgpu/issues/6748>
+        self.prepare_surface_texture_for_present(&texture)?;
+
+        let device = &self.device;
 
         let mut exclusive_snatch_guard = device.snatchable_lock.write();
         let inner = texture.inner.snatch(&mut exclusive_snatch_guard);
@@ -299,8 +341,8 @@ impl Surface {
         let result = match inner {
             None => return Err(SurfaceError::TextureDestroyed),
             Some(resource::TextureInner::Surface { raw }) => {
-                let raw_surface = self.raw(device.backend()).unwrap();
-                let raw_queue = queue.raw();
+                let raw_surface = surface.raw(device.backend()).unwrap();
+                let raw_queue = self.raw();
                 let _fence_lock = device.fence.write();
                 unsafe { raw_queue.present(raw_surface, raw) }
             }
@@ -324,7 +366,9 @@ impl Surface {
             },
         }
     }
+}
 
+impl Surface {
     pub fn discard(&self) -> Result<(), SurfaceError> {
         profiling::scope!("Surface::discard");
 
@@ -341,7 +385,7 @@ impl Surface {
         let texture = present
             .acquired_texture
             .take()
-            .ok_or(SurfaceError::AlreadyAcquired)?;
+            .ok_or(SurfaceError::NothingToPresent)?;
 
         let mut exclusive_snatch_guard = device.snatchable_lock.write();
         let inner = texture.inner.snatch(&mut exclusive_snatch_guard);
diff --git a/wgpu-hal/src/dx12/adapter.rs b/wgpu-hal/src/dx12/adapter.rs
index 0117b7c2d96..e21e303c1aa 100644
--- a/wgpu-hal/src/dx12/adapter.rs
+++ b/wgpu-hal/src/dx12/adapter.rs
@@ -1,5 +1,5 @@
 use alloc::{string::String, sync::Arc, vec::Vec};
-use core::ptr;
+use core::{ptr, sync::atomic::AtomicU64};
 use std::thread;
 
 use parking_lot::Mutex;
@@ -1073,11 +1073,21 @@ impl crate::Adapter for super::Adapter {
             self.compiler_container.clone(),
             self.options.clone(),
         )?;
+        let idle_fence: Direct3D12::ID3D12Fence = unsafe {
+            self.device
+                .CreateFence(0, Direct3D12::D3D12_FENCE_FLAG_NONE)
+        }
+        .into_device_result("Queue idle fence creation")?;
+        let idle_event = super::Event::create(false, false)?;
+
         Ok(crate::OpenDevice {
             device,
             queue: super::Queue {
                 raw: queue,
                 temp_lists: Mutex::new(Vec::new()),
+                idle_fence,
+                idle_event,
+                idle_fence_value: AtomicU64::new(0),
             },
         })
     }
diff --git a/wgpu-hal/src/dx12/mod.rs b/wgpu-hal/src/dx12/mod.rs
index 2afefbe4a6f..b2ae383bf77 100644
--- a/wgpu-hal/src/dx12/mod.rs
+++ b/wgpu-hal/src/dx12/mod.rs
@@ -88,7 +88,7 @@ mod types;
 mod view;
 
 use alloc::{borrow::ToOwned as _, string::String, sync::Arc, vec::Vec};
-use core::{ffi, fmt, mem, ops::Deref};
+use core::{ffi, fmt, mem, ops::Deref, sync::atomic::AtomicU64};
 
 use arrayvec::ArrayVec;
 use hashbrown::HashMap;
@@ -788,6 +788,9 @@ unsafe impl Sync for Device {}
 pub struct Queue {
     raw: Direct3D12::ID3D12CommandQueue,
     temp_lists: Mutex<Vec<Option<Direct3D12::ID3D12CommandList>>>,
+    idle_fence: Direct3D12::ID3D12Fence,
+    idle_event: Event,
+    idle_fence_value: AtomicU64,
 }
 
 impl Queue {
@@ -1660,6 +1663,22 @@ impl crate::Queue for Queue {
         let frequency = unsafe { self.raw.GetTimestampFrequency() }.expect("GetTimestampFrequency");
         (1_000_000_000.0 / frequency as f64) as f32
     }
+
+    unsafe fn wait_for_idle(&self) -> Result<(), crate::DeviceError> {
+        let value = self
+            .idle_fence_value
+            .fetch_add(1, core::sync::atomic::Ordering::Relaxed)
+            + 1;
+        unsafe { self.raw.Signal(&self.idle_fence, value) }
+            .into_device_result("Signal idle fence")?;
+        unsafe {
+            self.idle_fence
+                .SetEventOnCompletion(value, self.idle_event.0)
+        }
+        .into_device_result("SetEventOnCompletion")?;
+        unsafe { Threading::WaitForSingleObject(self.idle_event.0, Threading::INFINITE) };
+        Ok(())
+    }
 }
 #[derive(Debug)]
 pub struct DxilPassthroughShader {
diff --git a/wgpu-hal/src/dynamic/queue.rs b/wgpu-hal/src/dynamic/queue.rs
index 2af6d1f9c77..04744a9b4fa 100644
--- a/wgpu-hal/src/dynamic/queue.rs
+++ b/wgpu-hal/src/dynamic/queue.rs
@@ -20,6 +20,7 @@ pub trait DynQueue: DynResource {
         texture: Box<dyn DynSurfaceTexture>,
     ) -> Result<(), SurfaceError>;
     unsafe fn get_timestamp_period(&self) -> f32;
+    unsafe fn wait_for_idle(&self) -> Result<(), DeviceError>;
 }
 
 impl<Q: Queue + DynResource> DynQueue for Q {
@@ -53,4 +54,8 @@ impl<Q: Queue + DynResource> DynQueue for Q {
     unsafe fn get_timestamp_period(&self) -> f32 {
         unsafe { Q::get_timestamp_period(self) }
     }
+
+    unsafe fn wait_for_idle(&self) -> Result<(), DeviceError> {
+        unsafe { Q::wait_for_idle(self) }
+    }
 }
diff --git a/wgpu-hal/src/gles/queue.rs b/wgpu-hal/src/gles/queue.rs
index a67af60044c..09ce9da9cac 100644
--- a/wgpu-hal/src/gles/queue.rs
+++ b/wgpu-hal/src/gles/queue.rs
@@ -1963,6 +1963,12 @@ impl crate::Queue for super::Queue {
     unsafe fn get_timestamp_period(&self) -> f32 {
         1.0
     }
+
+    unsafe fn wait_for_idle(&self) -> Result<(), crate::DeviceError> {
+        let gl = &self.shared.context.lock();
+        unsafe { gl.finish() };
+        Ok(())
+    }
 }
 
 #[cfg(send_sync)]
diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs
index 33991f8bd5d..d8dc3a7045c 100644
--- a/wgpu-hal/src/lib.rs
+++ b/wgpu-hal/src/lib.rs
@@ -1245,11 +1245,45 @@ pub trait Queue: WasmNotSendSync {
         surface_textures: &[&<Self::A as Api>::SurfaceTexture],
         signal_fence: (&mut <Self::A as Api>::Fence, FenceValue),
     ) -> Result<(), DeviceError>;
+    /// Present a surface texture to the screen.
+    ///
+    /// This consumes the surface texture, returning it to the swapchain.
+    ///
+    /// # Safety
+    ///
+    /// - `texture` must have been acquired from `surface` via
+    ///   [`Surface::acquire_texture`] and not yet presented or discarded.
+    /// - `surface` must be configured for use with the [`Device`][d] associated
+    ///   with this [`Queue`].
+    /// - `texture` must be in the "present" state. Either:
+    ///   - It was passed in [`submit`][s]'s `surface_textures` argument
+    ///     (which transitions it to the present state), or
+    ///   - The caller has otherwise transitioned it (e.g. via a clear +
+    ///     barrier to `PRESENT` for textures that were never rendered to).
+    /// - Any command buffers that write to `texture` must have been submitted
+    ///   via [`submit`][s] before this call. The submissions do not need to
+    ///   have completed on the GPU; platform-level synchronization handles the
+    ///   ordering between rendering and display.
+    /// - Must be externally synchronized with all other queue operations
+    ///   ([`submit`][s], [`present`][Queue::present],
+    ///   [`wait_for_idle`][Queue::wait_for_idle]) on the same queue.
+    ///
+    /// [d]: Api::Device
+    /// [s]: Queue::submit
     unsafe fn present(
         &self,
         surface: &<Self::A as Api>::Surface,
         texture: <Self::A as Api>::SurfaceTexture,
     ) -> Result<(), SurfaceError>;
+    /// Block until all previously submitted work on this queue has completed,
+    /// including any pending presentations.
+    ///
+    /// # Safety
+    ///
+    /// - Must be externally synchronized with all other queue operations
+    ///   ([`submit`][Queue::submit], [`present`][Queue::present],
+    ///   [`wait_for_idle`][Queue::wait_for_idle]) on the same queue.
+    unsafe fn wait_for_idle(&self) -> Result<(), DeviceError>;
     unsafe fn get_timestamp_period(&self) -> f32;
 }
 
diff --git a/wgpu-hal/src/metal/mod.rs b/wgpu-hal/src/metal/mod.rs
index 13974241180..c482b9a2a48 100644
--- a/wgpu-hal/src/metal/mod.rs
+++ b/wgpu-hal/src/metal/mod.rs
@@ -614,6 +614,16 @@ impl crate::Queue for Queue {
     unsafe fn get_timestamp_period(&self) -> f32 {
         self.timestamp_period
     }
+
+    unsafe fn wait_for_idle(&self) -> Result<(), crate::DeviceError> {
+        autoreleasepool(|_| {
+            let command_buffer = self.shared.raw.commandBuffer().unwrap();
+            command_buffer.setLabel(Some(ns_string!("(wgpu internal) wait_for_idle")));
+            command_buffer.commit();
+            command_buffer.waitUntilCompleted();
+        });
+        Ok(())
+    }
 }
 
 #[derive(Debug)]
diff --git a/wgpu-hal/src/noop/mod.rs b/wgpu-hal/src/noop/mod.rs
index 703e0b58d9c..563b1b27de9 100644
--- a/wgpu-hal/src/noop/mod.rs
+++ b/wgpu-hal/src/noop/mod.rs
@@ -286,6 +286,10 @@ impl crate::Queue for Context {
     unsafe fn get_timestamp_period(&self) -> f32 {
         1.0
     }
+
+    unsafe fn wait_for_idle(&self) -> Result<(), crate::DeviceError> {
+        Ok(())
+    }
 }
 
 impl crate::Device for Context {
diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs
index 43915a7a0de..f90ad45d4b4 100644
--- a/wgpu-hal/src/vulkan/mod.rs
+++ b/wgpu-hal/src/vulkan/mod.rs
@@ -1365,6 +1365,11 @@ impl crate::Queue for Queue {
     unsafe fn get_timestamp_period(&self) -> f32 {
         self.device.timestamp_period
     }
+
+    unsafe fn wait_for_idle(&self) -> Result<(), crate::DeviceError> {
+        unsafe { self.device.raw.queue_wait_idle(self.raw) }
+            .map_err(map_host_device_oom_and_lost_err)
+    }
 }
 
 impl Queue {
diff --git a/wgpu/src/api/queue.rs b/wgpu/src/api/queue.rs
index aa5a0f24eda..2d0fbd70ce9 100644
--- a/wgpu/src/api/queue.rs
+++ b/wgpu/src/api/queue.rs
@@ -343,6 +343,21 @@ impl Queue {
         unsafe { queue.context.queue_as_hal::<A>(queue) }
     }
 
+    /// Schedule a surface texture to be presented on the owning surface.
+    ///
+    /// Should be called after any work on the texture is submitted via [`Queue::submit`].
+    /// If no work was submitted, the texture will be cleared automatically before presenting.
+    ///
+    /// # Platform dependent behavior
+    ///
+    /// On Wayland, `present` will attach a `wl_buffer` to the underlying `wl_surface` and commit the new surface
+    /// state. If it is desired to do things such as request a frame callback, scale the surface using the viewporter
+    /// or synchronize other double buffered state, then these operations should be done before the call to `present`.
+    pub fn present(&self, mut surface_texture: SurfaceTexture) {
+        surface_texture.presented = true;
+        self.inner.present(&surface_texture.detail);
+    }
+
     /// Compact a BLAS, it must have had [`Blas::prepare_compaction_async`] called on it and had the
     /// callback provided called.
     ///
diff --git a/wgpu/src/api/surface.rs b/wgpu/src/api/surface.rs
index 45ffd91e5d9..11f6f64564e 100644
--- a/wgpu/src/api/surface.rs
+++ b/wgpu/src/api/surface.rs
@@ -108,9 +108,8 @@ impl Surface<'_> {
 
     /// Returns the next texture to be presented by the surface for drawing.
     ///
-    /// In order to present the [`SurfaceTexture`] returned by this method,
-    /// first a [`Queue::submit`] needs to be done with some work rendering to this texture.
-    /// Then [`SurfaceTexture::present`] needs to be called.
+    /// After rendering to the returned [`SurfaceTexture`], submit work via [`Queue::submit`]
+    /// and then call [`Queue::present`] to display it.
     ///
     /// If a [`SurfaceTexture`] referencing this surface is alive when [`Surface::configure()`]
     /// is called, the configure call will panic.
diff --git a/wgpu/src/api/surface_texture.rs b/wgpu/src/api/surface_texture.rs
index f08c462127c..5126c10bb85 100644
--- a/wgpu/src/api/surface_texture.rs
+++ b/wgpu/src/api/surface_texture.rs
@@ -19,20 +19,6 @@ static_assertions::assert_impl_all!(SurfaceTexture: Send, Sync);
 crate::cmp::impl_eq_ord_hash_proxy!(SurfaceTexture => .texture.inner);
 
 impl SurfaceTexture {
-    /// Schedule this texture to be presented on the owning surface.
-    ///
-    /// Needs to be called after any work on the texture is scheduled via [`Queue::submit`].
-    ///
-    /// # Platform dependent behavior
-    ///
-    /// On Wayland, `present` will attach a `wl_buffer` to the underlying `wl_surface` and commit the new surface
-    /// state. If it is desired to do things such as request a frame callback, scale the surface using the viewporter
-    /// or synchronize other double buffered state, then these operations should be done before the call to `present`.
-    pub fn present(mut self) {
-        self.presented = true;
-        self.detail.present();
-    }
-
     #[cfg(custom)]
     /// Returns custom implementation of SurfaceTexture (if custom backend and is internally T)
     pub fn as_custom<T: crate::custom::SurfaceOutputDetailInterface>(&self) -> Option<&T> {
diff --git a/wgpu/src/backend/webgpu.rs b/wgpu/src/backend/webgpu.rs
index 2bb0913eb72..8512966c67c 100644
--- a/wgpu/src/backend/webgpu.rs
+++ b/wgpu/src/backend/webgpu.rs
@@ -2780,6 +2780,10 @@ impl dispatch::QueueInterface for WebQueue {
     ) -> (Option<u64>, dispatch::DispatchBlas) {
         unimplemented!("Raytracing not implemented for web")
     }
+
+    fn present(&self, _detail: &dispatch::DispatchSurfaceOutputDetail) {
+        // Swapchain is presented automatically on the web.
+    }
 }
 impl Drop for WebQueue {
     fn drop(&mut self) {
@@ -4006,10 +4010,6 @@ impl Drop for WebSurface {
 }
 
 impl dispatch::SurfaceOutputDetailInterface for WebSurfaceOutputDetail {
-    fn present(&self) {
-        // Swapchain is presented automatically on the web.
-    }
-
     fn texture_discard(&self) {
         // Can't really discard the texture on the web.
     }
diff --git a/wgpu/src/backend/wgpu_core.rs b/wgpu/src/backend/wgpu_core.rs
index 5cdad618278..04f116805e4 100644
--- a/wgpu/src/backend/wgpu_core.rs
+++ b/wgpu/src/backend/wgpu_core.rs
@@ -2142,6 +2142,17 @@ impl dispatch::QueueInterface for CoreQueue {
             .into(),
         )
     }
+
+    fn present(&self, detail: &dispatch::DispatchSurfaceOutputDetail) {
+        let detail = detail.as_core();
+        match self.context.0.surface_present(detail.surface_id) {
+            Ok(_status) => (),
+            Err(err) => {
+                self.context
+                    .handle_error_nolabel(&self.error_sink, err, "Queue::present");
+            }
+        }
+    }
 }
 
 impl Drop for CoreQueue {
@@ -3991,24 +4002,15 @@ impl Drop for CoreSurface {
 }
 
 impl dispatch::SurfaceOutputDetailInterface for CoreSurfaceOutputDetail {
-    fn present(&self) {
-        match self.context.0.surface_present(self.surface_id) {
+    fn texture_discard(&self) {
+        match self.context.0.surface_texture_discard(self.surface_id) {
             Ok(_status) => (),
             Err(err) => {
                 self.context
-                    .handle_error_nolabel(&self.error_sink, err, "Surface::present");
+                    .handle_error_nolabel(&self.error_sink, err, "Surface::discard_texture")
             }
         }
     }
-
-    fn texture_discard(&self) {
-        match self.context.0.surface_texture_discard(self.surface_id) {
-            Ok(_status) => (),
-            Err(err) => self
-                .context
-                .handle_error_fatal(err, "Surface::discard_texture"),
-        }
-    }
 }
 impl Drop for CoreSurfaceOutputDetail {
     fn drop(&mut self) {
diff --git a/wgpu/src/dispatch.rs b/wgpu/src/dispatch.rs
index 7df5b8c9882..099ad81b5a1 100644
--- a/wgpu/src/dispatch.rs
+++ b/wgpu/src/dispatch.rs
@@ -261,6 +261,8 @@ pub trait QueueInterface: CommonTraits {
     fn on_submitted_work_done(&self, callback: BoxSubmittedWorkDoneCallback);
 
     fn compact_blas(&self, blas: &DispatchBlas) -> (Option<u64>, DispatchBlas);
+
+    fn present(&self, detail: &DispatchSurfaceOutputDetail);
 }
 
 pub trait ShaderModuleInterface: CommonTraits {
@@ -588,7 +590,6 @@ pub trait SurfaceInterface: CommonTraits {
 }
 
 pub trait SurfaceOutputDetailInterface: CommonTraits {
-    fn present(&self);
     fn texture_discard(&self);
 }