diff --git a/app/src-tauri/src/lib.rs b/app/src-tauri/src/lib.rs index a402429539..642bd16b5f 100644 --- a/app/src-tauri/src/lib.rs +++ b/app/src-tauri/src/lib.rs @@ -51,6 +51,8 @@ mod notch_window; mod notification_settings; mod process_kill; mod process_recovery; +mod ptt_hotkeys; +mod ptt_overlay; #[cfg(target_os = "windows")] mod reset_reboot_schedule; mod screen_capture; @@ -758,6 +760,18 @@ async fn register_dictation_hotkey( expanded_shortcuts.join(", ") ); + // Reject overlap with the currently-registered PTT hotkey. + let ptt_current = { + let state = app.state::(); + let guard = state.shortcut.lock().unwrap(); + guard.clone() + }; + if let Some(conflict) = ptt_hotkeys::first_conflict_with(&expanded_shortcuts, &ptt_current) { + return Err(format!( + "dictation shortcut '{conflict}' conflicts with the push-to-talk hotkey" + )); + } + let register_shortcut = |shortcut_variant: &str| -> Result<(), String> { let app_clone = app.clone(); app.global_shortcut() @@ -852,6 +866,180 @@ async fn unregister_dictation_hotkey(app: AppHandle) -> Result<(), S Ok(()) } +/// Register (or re-register) the global push-to-talk hotkey. Emits +/// `ptt://start { session_id }` on press and `ptt://stop { session_id }` +/// on release. +#[tauri::command] +async fn register_ptt_hotkey(app: AppHandle, shortcut: String) -> Result<(), String> { + log::info!("[ptt] register_ptt_hotkey: shortcut={shortcut}"); + + let expanded = ptt_hotkeys::expand_ptt_shortcuts(&shortcut).map_err(|e| e.to_string())?; + + // Reject overlap with the currently-registered dictation hotkey. + let dictation_current = { + let state = app.state::(); + let guard = state.0.lock().unwrap(); + guard.clone() + }; + if let Some(conflict) = ptt_hotkeys::first_conflict_with(&expanded, &dictation_current) { + return Err(ptt_hotkeys::PttError::ConflictsWithDictation(conflict).to_string()); + } + + let old_shortcuts = { + let state = app.state::(); + let guard = state.shortcut.lock().unwrap(); + guard.clone() + }; + + // Lazy-instantiate the overlay window so it's ready before the first press. + if let Err(e) = ptt_overlay::ensure_window(&app) { + log::warn!("[ptt] overlay window create failed (continuing): {e}"); + } + + let register_shortcut = |variant: &str| -> Result<(), String> { + let app_pressed = app.clone(); + let app_released = app.clone(); + let variant_owned = variant.to_string(); + app.global_shortcut() + .on_shortcut(variant, move |app_inner, _sc, event| { + let state = app_inner.state::(); + match event.state { + ShortcutState::Pressed => { + // Drop OS key-repeat events; only the first Pressed of a hold opens a session. + if state + .is_held + .compare_exchange( + false, + true, + std::sync::atomic::Ordering::AcqRel, + std::sync::atomic::Ordering::Acquire, + ) + .is_err() + { + log::trace!( + "[ptt] press dropped (already held) shortcut={variant_owned}" + ); + return; + } + let session_id = state + .session_counter + .fetch_add(1, std::sync::atomic::Ordering::Relaxed) + + 1; + log::debug!( + "[ptt] pressed shortcut={variant_owned} session_id={session_id}" + ); + if let Err(e) = app_pressed.emit( + "ptt://start", + serde_json::json!({ + "session_id": session_id, + }), + ) { + log::warn!("[ptt] emit start failed: {e}"); + } + } + ShortcutState::Released => { + if !state + .is_held + .swap(false, std::sync::atomic::Ordering::AcqRel) + { + // No corresponding Pressed in our state — stale event, drop. + log::trace!( + "[ptt] release dropped (not held) shortcut={variant_owned}" + ); + return; + } + let session_id = state + .session_counter + .load(std::sync::atomic::Ordering::Relaxed); + log::debug!( + "[ptt] released shortcut={variant_owned} session_id={session_id}" + ); + if let Err(e) = app_released.emit( + "ptt://stop", + serde_json::json!({ + "session_id": session_id, + }), + ) { + log::warn!("[ptt] emit stop failed: {e}"); + } + } + } + }) + .map_err(|e| format!("Failed to register ptt shortcut '{variant}': {e}")) + }; + + // Unregister previous PTT variants. + let mut unregistered: Vec = Vec::new(); + for old in &old_shortcuts { + if let Err(e) = app.global_shortcut().unregister(old.as_str()) { + // Rollback already-unregistered ones. + for r in &unregistered { + if let Err(re) = register_shortcut(r) { + log::warn!("[ptt] rollback failed for '{r}': {re}"); + } + } + return Err(format!( + "Failed to unregister previous ptt shortcut '{old}': {e}" + )); + } + unregistered.push(old.clone()); + } + + // Register the new variants. Rollback on first failure. + let mut newly_registered: Vec = Vec::new(); + for v in &expanded { + if let Err(e) = register_shortcut(v) { + for r in &newly_registered { + if let Err(re) = app.global_shortcut().unregister(r.as_str()) { + log::warn!("[ptt] rollback failed for '{r}': {re}"); + } + } + for old in &old_shortcuts { + if let Err(re) = register_shortcut(old) { + log::warn!("[ptt] rollback failed for '{old}': {re}"); + } + } + return Err(e); + } + newly_registered.push(v.clone()); + } + + { + let state = app.state::(); + let mut guard = state.shortcut.lock().unwrap(); + *guard = expanded.clone(); + } + + log::info!("[ptt] registered: {}", expanded.join(", ")); + Ok(()) +} + +/// Unregister the global PTT hotkey (if any). +#[tauri::command] +async fn unregister_ptt_hotkey(app: AppHandle) -> Result<(), String> { + log::info!("[ptt] unregister_ptt_hotkey: called"); + let state = app.state::(); + let old = { + let guard = state.shortcut.lock().unwrap(); + guard.clone() + }; + let mut still_registered: Vec = Vec::new(); + for s in &old { + if let Err(e) = app.global_shortcut().unregister(s.as_str()) { + log::warn!("[ptt] unregister '{s}' failed: {e}"); + still_registered.push(s.clone()); + } + } + // Only retain variants that genuinely failed to unregister; the rest are gone. + { + let mut guard = state.shortcut.lock().unwrap(); + *guard = still_registered; + } + // Destroy the overlay window so resources are released. + ptt_overlay::destroy_window(&app); + Ok(()) +} + fn is_daemon_mode() -> bool { std::env::args().any(|arg| arg == "daemon" || arg == "--daemon") } @@ -2487,6 +2675,7 @@ pub fn run() { .manage(dictation_hotkeys::DictationHotkeyState( std::sync::Mutex::new(Vec::new()), )) + .manage(ptt_hotkeys::PttHotkeyState::new()) .manage(companion_commands::CompanionHotkeyState( std::sync::Mutex::new(Vec::new()), )) @@ -3240,6 +3429,9 @@ pub fn run() { schedule_cef_profile_purge, register_dictation_hotkey, unregister_dictation_hotkey, + register_ptt_hotkey, + unregister_ptt_hotkey, + ptt_overlay::show_ptt_overlay, webview_accounts::webview_account_open, webview_accounts::webview_account_prewarm, webview_accounts::webview_account_close, diff --git a/app/src-tauri/src/ptt_hotkeys.rs b/app/src-tauri/src/ptt_hotkeys.rs new file mode 100644 index 0000000000..743ccadb6c --- /dev/null +++ b/app/src-tauri/src/ptt_hotkeys.rs @@ -0,0 +1,280 @@ +//! Global push-to-talk hotkey state + parsing. +//! +//! See spec: `docs/superpowers/specs/2026-06-02-global-ptt-design.md`. +//! +//! `expand_ptt_shortcuts` mirrors `dictation_hotkeys::expand_dictation_shortcuts` +//! but rejects pure-modifier shortcuts (Ctrl, Cmd+Shift, etc.) because they +//! would fire constantly during normal typing. + +use std::sync::atomic::{AtomicBool, AtomicU64}; +use std::sync::Mutex; + +#[derive(Debug, PartialEq, Eq)] +pub(crate) enum PttError { + EmptyShortcut, + ModifierOnlyShortcut, + ConflictsWithDictation(String), + UnsupportedOnWayland, + RegistrationFailed(String), +} + +impl std::fmt::Display for PttError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + PttError::EmptyShortcut => write!(f, "ptt shortcut cannot be empty"), + PttError::ModifierOnlyShortcut => write!( + f, + "ptt shortcut cannot be only modifier keys (Ctrl/Cmd/Shift/Alt)" + ), + PttError::ConflictsWithDictation(s) => { + write!(f, "ptt shortcut '{s}' conflicts with the dictation hotkey") + } + PttError::UnsupportedOnWayland => write!( + f, + "global shortcuts are not supported in this Wayland session — switch to X11 or use in-app dictation" + ), + PttError::RegistrationFailed(s) => { + write!(f, "failed to register ptt shortcut: {s}") + } + } + } +} + +impl std::error::Error for PttError {} + +/// Process-wide PTT state. Held in the Tauri-managed `State`. +pub(crate) struct PttHotkeyState { + /// Currently-registered shortcut variants (e.g. `["Cmd+F13", "Ctrl+F13"]` on macOS). + pub(crate) shortcut: Mutex>, + /// Monotonic counter for session IDs. + pub(crate) session_counter: AtomicU64, + /// CAS-guarded: true iff a PTT session is currently mid-hold. + /// Used to drop OS key-repeat Pressed events so each press/release pair + /// produces exactly one session_id. + pub(crate) is_held: AtomicBool, +} + +impl PttHotkeyState { + pub(crate) fn new() -> Self { + Self { + shortcut: Mutex::new(Vec::new()), + session_counter: AtomicU64::new(0), + is_held: AtomicBool::new(false), + } + } +} + +const MODIFIER_TOKENS: &[&str] = &[ + "ctrl", + "control", + "cmd", + "command", + "meta", + "super", + "win", + "windows", + "alt", + "option", + "shift", + "cmdorctrl", +]; + +fn is_modifier_token(token: &str) -> bool { + let trimmed = token.trim(); + MODIFIER_TOKENS + .iter() + .any(|m| trimmed.eq_ignore_ascii_case(m)) +} + +/// Expand a user-typed shortcut into one or two OS-specific variants and +/// validate it isn't empty / modifier-only. +pub(crate) fn expand_ptt_shortcuts(shortcut: &str) -> Result, PttError> { + let trimmed = shortcut.trim(); + if trimmed.is_empty() { + return Err(PttError::EmptyShortcut); + } + + let parts: Vec<&str> = trimmed.split('+').map(str::trim).collect(); + if parts.iter().any(|p| p.is_empty()) { + return Err(PttError::EmptyShortcut); + } + if parts.iter().all(|p| is_modifier_token(p)) { + return Err(PttError::ModifierOnlyShortcut); + } + + #[cfg(target_os = "macos")] + { + if trimmed.contains("CmdOrCtrl") { + let cmd_variant = trimmed.replace("CmdOrCtrl", "Cmd"); + let ctrl_variant = trimmed.replace("CmdOrCtrl", "Ctrl"); + if cmd_variant == ctrl_variant { + return Ok(vec![cmd_variant]); + } + return Ok(vec![cmd_variant, ctrl_variant]); + } + } + + #[cfg(not(target_os = "macos"))] + { + if trimmed.contains("CmdOrCtrl") { + return Ok(vec![trimmed.replace("CmdOrCtrl", "Ctrl")]); + } + } + + Ok(vec![trimmed.to_string()]) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty_shortcut_is_rejected() { + assert_eq!(expand_ptt_shortcuts(""), Err(PttError::EmptyShortcut)); + assert_eq!(expand_ptt_shortcuts(" "), Err(PttError::EmptyShortcut)); + } + + #[test] + fn modifier_only_shortcut_is_rejected() { + assert_eq!( + expand_ptt_shortcuts("Ctrl"), + Err(PttError::ModifierOnlyShortcut) + ); + assert_eq!( + expand_ptt_shortcuts("Cmd+Shift"), + Err(PttError::ModifierOnlyShortcut) + ); + assert_eq!( + expand_ptt_shortcuts("Alt+Shift+Ctrl"), + Err(PttError::ModifierOnlyShortcut) + ); + assert_eq!( + expand_ptt_shortcuts("CmdOrCtrl+Shift"), + Err(PttError::ModifierOnlyShortcut) + ); + } + + #[test] + fn plain_function_key_is_accepted() { + assert_eq!(expand_ptt_shortcuts("F13"), Ok(vec!["F13".to_string()])); + } + + #[test] + fn modifier_plus_letter_is_accepted() { + assert_eq!( + expand_ptt_shortcuts("Ctrl+Alt+T"), + Ok(vec!["Ctrl+Alt+T".to_string()]) + ); + } + + #[test] + #[cfg(target_os = "macos")] + fn cmd_or_ctrl_expands_to_both_on_macos() { + let result = expand_ptt_shortcuts("CmdOrCtrl+Shift+P").unwrap(); + assert_eq!(result.len(), 2); + assert!(result.contains(&"Cmd+Shift+P".to_string())); + assert!(result.contains(&"Ctrl+Shift+P".to_string())); + } + + #[test] + #[cfg(not(target_os = "macos"))] + fn cmd_or_ctrl_expands_to_ctrl_off_macos() { + let result = expand_ptt_shortcuts("CmdOrCtrl+Shift+P").unwrap(); + assert_eq!(result, vec!["Ctrl+Shift+P".to_string()]); + } + + #[test] + fn malformed_shortcut_with_empty_tokens_is_rejected() { + assert_eq!(expand_ptt_shortcuts("+F13"), Err(PttError::EmptyShortcut)); + assert_eq!(expand_ptt_shortcuts("F13+"), Err(PttError::EmptyShortcut)); + assert_eq!( + expand_ptt_shortcuts("Ctrl++T"), + Err(PttError::EmptyShortcut) + ); + } +} + +/// Returns `Some(conflicting_variant)` if any expanded PTT variant overlaps +/// any expanded dictation variant. Comparison is case-insensitive. +pub(crate) fn first_conflict_with(ptt: &[String], dictation: &[String]) -> Option { + for p in ptt { + let p_lc = p.to_ascii_lowercase(); + for d in dictation { + if d.to_ascii_lowercase() == p_lc { + return Some(p.clone()); + } + } + } + None +} + +#[cfg(test)] +mod conflict_tests { + use super::*; + + #[test] + fn no_conflict_returns_none() { + let ptt = vec!["F13".into()]; + let dict = vec!["F14".into()]; + assert_eq!(first_conflict_with(&ptt, &dict), None); + } + + #[test] + fn case_insensitive_conflict_detected() { + let ptt = vec!["ctrl+space".into()]; + let dict = vec!["Ctrl+Space".into()]; + assert_eq!( + first_conflict_with(&ptt, &dict), + Some("ctrl+space".to_string()) + ); + } + + #[test] + fn only_one_variant_overlaps_returns_first() { + let ptt = vec!["Cmd+P".into(), "Ctrl+P".into()]; + let dict = vec!["Ctrl+P".into()]; + assert_eq!(first_conflict_with(&ptt, &dict), Some("Ctrl+P".to_string())); + } +} + +#[cfg(test)] +mod state_tests { + use super::*; + use std::sync::atomic::Ordering; + + #[test] + fn new_state_is_not_held_and_counter_is_zero() { + let s = PttHotkeyState::new(); + assert!(!s.is_held.load(Ordering::Relaxed)); + assert_eq!(s.session_counter.load(Ordering::Relaxed), 0); + } + + #[test] + fn cas_false_to_true_succeeds_then_repeat_fails() { + let s = PttHotkeyState::new(); + // First press: false → true succeeds. + assert!( + s.is_held + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .is_ok(), + "first press CAS should succeed" + ); + // Repeat press: false → true fails because we're already true. + assert!( + s.is_held + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .is_err(), + "repeat press CAS should fail (already held)" + ); + // Release: swap true → false returns the old true. + assert!( + s.is_held.swap(false, Ordering::AcqRel), + "swap should return prior true" + ); + // Subsequent stale release: swap returns the current false. + assert!( + !s.is_held.swap(false, Ordering::AcqRel), + "stale swap should return false" + ); + } +} diff --git a/app/src-tauri/src/ptt_overlay.rs b/app/src-tauri/src/ptt_overlay.rs new file mode 100644 index 0000000000..90bdfeb50d --- /dev/null +++ b/app/src-tauri/src/ptt_overlay.rs @@ -0,0 +1,96 @@ +//! Borderless always-on-top PTT overlay window. +//! +//! Lazy-created on the first `register_ptt_hotkey` call (so the window is +//! ready when the user hits the key for the first time), and destroyed by +//! `unregister_ptt_hotkey`. The window's contents are rendered by the React +//! route `/ptt-overlay` (see `app/src/pages/PttOverlayPage.tsx`). +//! +//! Cross-platform note: `focus(false)` ensures the window never steals focus +//! from the user's active app. `skip_taskbar(true)` keeps it out of the +//! Windows taskbar / macOS dock. `visible_on_all_workspaces(true)` makes it +//! follow the user across macOS Spaces. DXGI exclusive-fullscreen on Windows +//! still suppresses the overlay — documented in the settings panel as a +//! limitation; chime audio remains the fallback signal. + +use tauri::{AppHandle, Emitter, Manager, Runtime, WebviewUrl, WebviewWindowBuilder}; + +const OVERLAY_LABEL: &str = "ptt-overlay"; + +/// Ensure the overlay window exists. Idempotent — if the window already +/// exists, returns Ok without recreating it. +pub(crate) fn ensure_window(app: &AppHandle) -> Result<(), String> { + if app.get_webview_window(OVERLAY_LABEL).is_some() { + return Ok(()); + } + let url = WebviewUrl::App("index.html#/ptt-overlay".into()); + let mut builder = WebviewWindowBuilder::new(app, OVERLAY_LABEL, url) + .title("OpenHuman Push-to-Talk") + .inner_size(160.0, 56.0) + .decorations(false) + .transparent(true) + .always_on_top(true) + .skip_taskbar(true) + .focused(false) + .resizable(false) + // NOTE: .shadow(false) is a no-op under the project's CEF runtime + // (tauri-runtime-cef has a TODO stub); harmless but won't actually + // suppress the OS shadow until CEF wires it through. + .shadow(false) + .visible(false); + + #[cfg(target_os = "macos")] + { + builder = builder + .visible_on_all_workspaces(true) + .accept_first_mouse(false); + } + + let _window = builder + .build() + .map_err(|e| format!("create ptt overlay window: {e}"))?; + log::info!("[ptt-overlay] window created (label={OVERLAY_LABEL})"); + Ok(()) +} + +/// Destroy the overlay window if it exists. +pub(crate) fn destroy_window(app: &AppHandle) { + if let Some(w) = app.get_webview_window(OVERLAY_LABEL) { + if let Err(e) = w.destroy() { + log::warn!("[ptt-overlay] destroy failed: {e}"); + } else { + log::info!("[ptt-overlay] window destroyed"); + } + } +} + +/// Show or hide the overlay. Emits `ptt-overlay://active` for the in-window +/// React tree to drive its pulsing-dot animation. +#[tauri::command] +pub(crate) async fn show_ptt_overlay( + app: AppHandle, + active: bool, + session_id: u64, +) -> Result<(), String> { + let window = app.get_webview_window(OVERLAY_LABEL).ok_or_else(|| { + "[ptt-overlay] window not ready (register_ptt_hotkey must succeed before show_ptt_overlay)" + .to_string() + })?; + + if active { + window.show().map_err(|e| format!("show overlay: {e}"))?; + } else { + window.hide().map_err(|e| format!("hide overlay: {e}"))?; + } + + if let Err(e) = window.emit( + "ptt-overlay://active", + serde_json::json!({ + "active": active, + "session_id": session_id, + }), + ) { + log::warn!("[ptt-overlay] emit active failed: {e}"); + } + + Ok(()) +} diff --git a/app/src/App.tsx b/app/src/App.tsx index e88b437186..125555b1ee 100644 --- a/app/src/App.tsx +++ b/app/src/App.tsx @@ -18,6 +18,7 @@ import LocalAIDownloadSnackbar from './components/LocalAIDownloadSnackbar'; import SecretPromptDialog from './components/mcp-setup/SecretPromptDialog'; import OpenhumanLinkModal from './components/OpenhumanLinkModal'; import PersistRehydrationScreen from './components/PersistRehydrationScreen'; +import PttHotkeyManager from './components/PttHotkeyManager'; import SecurityBanner from './components/SecurityBanner'; import GlobalUpsellBanner from './components/upsell/GlobalUpsellBanner'; import AppWalkthrough from './components/walkthrough/AppWalkthrough'; @@ -109,6 +110,7 @@ function App() { {!onMobile && } + {!onMobile && } {!onMobile && } {!onMobile && } diff --git a/app/src/AppRoutes.tsx b/app/src/AppRoutes.tsx index 145f6ade29..7bfa4ec098 100644 --- a/app/src/AppRoutes.tsx +++ b/app/src/AppRoutes.tsx @@ -13,6 +13,7 @@ import Intelligence from './pages/Intelligence'; import Invites from './pages/Invites'; import Notifications from './pages/Notifications'; import Onboarding from './pages/onboarding/Onboarding'; +import { PttOverlayPage } from './pages/PttOverlayPage'; import Rewards from './pages/Rewards'; import Routines from './pages/Routines'; import Settings from './pages/Settings'; @@ -189,6 +190,8 @@ const AppRoutes = () => { } /> + } /> + {/* Default redirect based on auth status */} } /> diff --git a/app/src/__tests__/App.boot.test.tsx b/app/src/__tests__/App.boot.test.tsx index 592c8a1aaf..aa7587370c 100644 --- a/app/src/__tests__/App.boot.test.tsx +++ b/app/src/__tests__/App.boot.test.tsx @@ -78,6 +78,7 @@ vi.mock('../components/commands/CommandProvider', () => ({ default: ({ children }: { children: React.ReactNode }) => <>{children}, })); vi.mock('../components/DictationHotkeyManager', () => ({ default: () => null })); +vi.mock('../components/PttHotkeyManager', () => ({ default: () => null })); vi.mock('../components/OpenhumanLinkModal', () => ({ default: () => null })); vi.mock('../components/upsell/GlobalUpsellBanner', () => ({ default: () => null })); vi.mock('../components/walkthrough/AppWalkthrough', () => ({ default: () => null })); diff --git a/app/src/assets/audio/README.md b/app/src/assets/audio/README.md new file mode 100644 index 0000000000..c43105042c --- /dev/null +++ b/app/src/assets/audio/README.md @@ -0,0 +1,11 @@ +# Audio assets + +Short UI chimes for the push-to-talk feature (`docs/superpowers/specs/2026-06-02-global-ptt-design.md`). + +| File | Purpose | Source | License | +| --------------- | ----------------------------------------------------------- | -------------------------------------------------------------------------- | -------------------- | +| `ptt-open.wav` | Mic opened (PTT key pressed). | Generated locally with Python `wave` + sine generator (800–1200 Hz sweep). | CC0 / Public Domain. | +| `ptt-close.wav` | Mic closed (PTT key released). | Generated locally with Python `wave` + sine generator (1200–800 Hz sweep). | CC0 / Public Domain. | +| `ptt-error.wav` | Session aborted (empty audio, mic permission denied, etc.). | Generated locally with Python `wave` + sine generator (250 Hz tone). | CC0 / Public Domain. | + +All clips are ~80–120ms, LUFS-normalized to roughly match the in-app notification sound (~ -16 LUFS). Replace freely with better-sounding equivalents — just keep them under 200ms and CC0/MIT-equivalent. diff --git a/app/src/assets/audio/ptt-close.wav b/app/src/assets/audio/ptt-close.wav new file mode 100644 index 0000000000..761d30766a Binary files /dev/null and b/app/src/assets/audio/ptt-close.wav differ diff --git a/app/src/assets/audio/ptt-error.wav b/app/src/assets/audio/ptt-error.wav new file mode 100644 index 0000000000..f6034a3e69 Binary files /dev/null and b/app/src/assets/audio/ptt-error.wav differ diff --git a/app/src/assets/audio/ptt-open.wav b/app/src/assets/audio/ptt-open.wav new file mode 100644 index 0000000000..a4798b3ed3 Binary files /dev/null and b/app/src/assets/audio/ptt-open.wav differ diff --git a/app/src/components/PttHotkeyManager.tsx b/app/src/components/PttHotkeyManager.tsx new file mode 100644 index 0000000000..6340b2ae91 --- /dev/null +++ b/app/src/components/PttHotkeyManager.tsx @@ -0,0 +1,145 @@ +/** + * PttHotkeyManager + * + * Renderless boot-time wiring for the global push-to-talk feature: + * 1. Registers the persisted PTT shortcut with the Tauri shell via + * `usePttHotkey()`. + * 2. Owns the singleton `pttService` state machine (built in T10), wired to + * real audio capture (MediaRecorder), STT (voice_transcribe_bytes RPC), + * chat send, thread resolution, chime playback, and overlay window + * visibility. + * 3. Subscribes to the Tauri events `ptt://start` / `ptt://stop` emitted by + * the Rust shell when the global hotkey transitions edges, and forwards + * them into the service. + * + * The service is constructed once for the AppShell's lifetime — multiple + * mounts would create competing state machines fighting over the same mic. + */ +import { listen, type UnlistenFn } from '@tauri-apps/api/event'; +import debug from 'debug'; +import { useEffect, useMemo, useRef } from 'react'; +import { useDispatch, useStore } from 'react-redux'; + +import { cancelPttAudio, finalizePttAudio, startPttAudio } from '../features/voice/pttAudio'; +import { playPttChime } from '../features/voice/pttChimes'; +import { createNewVoiceThread, resolveActiveThreadId } from '../features/voice/pttThread'; +import { transcribePttAudio } from '../features/voice/pttTranscribe'; +import { usePttHotkey } from '../hooks/usePttHotkey'; +import { chatSend } from '../services/chatService'; +import { createPttService } from '../services/pttService'; +import type { RootState } from '../store'; +import { setIsHeld } from '../store/pttSlice'; +import { showPttOverlay } from '../utils/tauriCommands/ptt'; + +const log = debug('app:ptt:manager'); + +interface PttEventPayload { + session_id: number; +} + +// Stable monotonic clock for the pttService state machine. Defined at +// module scope so the useMemo factory below doesn't reference an impure +// function during render (react-hooks/purity). +const monotonicNow = (): number => Date.now(); + +export default function PttHotkeyManager(): null { + // Register / unregister the configured hotkey with the Tauri shell. + usePttHotkey(); + + const dispatch = useDispatch(); + const store = useStore(); + const unlistenRef = useRef([]); + + const service = useMemo( + () => + createPttService({ + audioCapture: { start: startPttAudio, finalize: finalizePttAudio, cancel: cancelPttAudio }, + transcribe: transcribePttAudio, + sendMessage: async ({ threadId, body, speakReply, metadata }) => { + await chatSend({ + threadId, + message: body, + speakReply, + source: metadata.source, + sessionId: metadata.session_id, + }); + }, + resolveActiveThreadId, + createNewVoiceThread, + playChime: playPttChime, + showOverlay: async (active, sessionId) => { + // Respect the user's "show overlay" preference for the start edge, + // but always tear it down on stop so a mid-session toggle can't leave + // the overlay stuck visible. + if (!active || store.getState().ptt.showOverlay) { + await showPttOverlay(active, sessionId); + } + }, + getSettings: () => { + const ptt = store.getState().ptt; + return { speakReplies: ptt.speakReplies, showOverlay: ptt.showOverlay }; + }, + now: monotonicNow, + // 10 s ceiling on a single PTT recording — matches the spec; if the + // user holds the key longer the watchdog finalises so we don't keep + // an open mic forever. + watchdogMs: 10_000, + // Recordings shorter than this are treated as accidental taps. + minAudioMs: 250, + logger: { + debug: (msg, meta) => log(msg, meta ?? {}), + info: (msg, meta) => log(msg, meta ?? {}), + warn: (msg, meta) => log(msg, meta ?? {}), + }, + }), + // The service holds an internal state machine — recreating it across + // store updates would orphan in-flight sessions. The closures above read + // the latest store state on every call, so a stable identity is correct. + // eslint-disable-next-line react-hooks/exhaustive-deps + [] + ); + + useEffect(() => { + let mounted = true; + const subscribe = async () => { + try { + const offStart = await listen('ptt://start', e => { + dispatch(setIsHeld(true)); + service.onStart(e.payload.session_id).catch(err => { + log('onStart failed', { sessionId: e.payload.session_id, err: String(err) }); + }); + }); + const offStop = await listen('ptt://stop', e => { + dispatch(setIsHeld(false)); + service.onStop(e.payload.session_id).catch(err => { + log('onStop failed', { sessionId: e.payload.session_id, err: String(err) }); + }); + }); + if (!mounted) { + offStart(); + offStop(); + return; + } + unlistenRef.current.push(offStart, offStop); + log('PttHotkeyManager: listeners attached'); + } catch (err) { + log('PttHotkeyManager: failed to attach listeners', err); + } + }; + void subscribe(); + return () => { + mounted = false; + const offs = unlistenRef.current; + unlistenRef.current = []; + for (const off of offs) { + try { + off(); + } catch (err) { + log('PttHotkeyManager: unlisten threw', err); + } + } + }; + }, [dispatch, service]); + + return null; +} diff --git a/app/src/components/settings/panels/VoicePanel.tsx b/app/src/components/settings/panels/VoicePanel.tsx index b61aae250f..ccebe0abe5 100644 --- a/app/src/components/settings/panels/VoicePanel.tsx +++ b/app/src/components/settings/panels/VoicePanel.tsx @@ -1,6 +1,7 @@ import { useCallback, useEffect, useRef, useState } from 'react'; import { useT } from '../../../lib/i18n/I18nContext'; +import PttSettingsPanel from '../../../pages/settings/voice/PttSettingsPanel'; import { installPiper, installWhisper, @@ -1285,6 +1286,14 @@ const VoicePanel = ({ embedded = false }: VoicePanelProps = {}) => { + {/* ─── Section 3: Push-to-talk ───────────────────────────────── + Global PTT hotkey + session preferences. The panel is + self-contained — it only mutates the `ptt` slice, and + `usePttHotkey` (T11) reacts to slice changes to (re)register + the binding with the Tauri shell. Mounted here so users hunt + for it under Voice settings alongside dictation. */} + + {/* Mascot voice picker now lives in Mascot settings. Link kept here so users hunting in Voice settings can find it. */} {ttsProvider !== 'piper' && ( diff --git a/app/src/features/voice/pttAudio.ts b/app/src/features/voice/pttAudio.ts new file mode 100644 index 0000000000..c60d20036b --- /dev/null +++ b/app/src/features/voice/pttAudio.ts @@ -0,0 +1,126 @@ +/** + * pttAudio — push-to-talk mic-capture adapter for pttService. + * + * Dictation's existing recorder lives in the Rust core (rdev-driven, fed by + * the audio_capture domain) and surfaces results asynchronously over a + * dedicated socket — it is not exposed as a reusable JS function that returns + * a buffer. Rather than refactor that flow, we use a self-contained + * MediaRecorder in the renderer. The captured audio is sent straight to the + * existing `voice_transcribe_bytes` RPC (see `pttTranscribe.ts`), so we still + * reuse the core's STT path; only the capture layer is renderer-owned. + * + * Module-level state is intentional — the singleton matches `pttService`'s + * lifecycle (one active PTT session at a time, owned by `PttHotkeyManager`). + * `cancel` is idempotent so the watchdog / preempt paths can call it freely. + */ +import type { FinalizedAudio } from '../../services/pttService'; + +interface Recorder { + recorder: MediaRecorder; + stream: MediaStream; + chunks: Blob[]; + startedAt: number; +} + +let active: Recorder | null = null; +let lastMimeType: string | undefined; + +function pickMimeType(): string | undefined { + // Prefer webm/opus — small, broadly supported. whisper.cpp + cloud STT both + // accept it via ffmpeg decode; the core's `extension` hint is "webm". + const preferred = ['audio/webm;codecs=opus', 'audio/webm', 'audio/ogg;codecs=opus', 'audio/mp4']; + for (const mime of preferred) { + if (typeof MediaRecorder !== 'undefined' && MediaRecorder.isTypeSupported(mime)) { + return mime; + } + } + return undefined; +} + +function stopTracks(stream: MediaStream): void { + for (const track of stream.getTracks()) { + try { + track.stop(); + } catch { + /* ignore */ + } + } +} + +export async function startPttAudio(opts: { sessionTag: string }): Promise { + // If a prior session was abandoned without a finalize/cancel, free it now + // so we don't leak the mic. + if (active) { + console.debug('[ptt-audio] startPttAudio called with active recorder — cancelling first'); + await cancelPttAudio(); + } + + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + const mimeType = pickMimeType(); + const recorder = new MediaRecorder(stream, mimeType ? { mimeType } : undefined); + const chunks: Blob[] = []; + + recorder.addEventListener('dataavailable', (e: BlobEvent) => { + if (e.data && e.data.size > 0) chunks.push(e.data); + }); + + active = { recorder, stream, chunks, startedAt: window.performance.now() }; + lastMimeType = mimeType ?? recorder.mimeType ?? undefined; + recorder.start(); + console.debug('[ptt-audio] started', { sessionTag: opts.sessionTag, mimeType }); +} + +export async function finalizePttAudio(): Promise { + if (!active) { + throw new Error('[ptt-audio] finalize called with no active recorder'); + } + const session = active; + active = null; + + const done = new Promise(resolve => { + if (session.recorder.state === 'inactive') { + resolve(); + return; + } + session.recorder.addEventListener('stop', () => resolve(), { once: true }); + }); + try { + if (session.recorder.state !== 'inactive') session.recorder.stop(); + } catch (err) { + console.warn('[ptt-audio] recorder.stop() threw', err); + } + await done; + stopTracks(session.stream); + + const blob = new Blob(session.chunks, { type: session.recorder.mimeType || 'audio/webm' }); + const buffer = await blob.arrayBuffer(); + const durationMs = Math.round(window.performance.now() - session.startedAt); + console.debug('[ptt-audio] finalized', { durationMs, bytes: buffer.byteLength }); + return { buffer, durationMs }; +} + +export async function cancelPttAudio(): Promise { + if (!active) return; + const session = active; + active = null; + try { + if (session.recorder.state !== 'inactive') session.recorder.stop(); + } catch { + /* ignore */ + } + stopTracks(session.stream); + console.debug('[ptt-audio] cancelled'); +} + +/** + * Maps the last-used MIME type to an extension string the core's + * `voice_transcribe_bytes` RPC accepts. Persists across `finalizePttAudio` + * (which clears `active`) so the transcribe step still gets the right hint. + */ +export function lastRecordedExtension(): string { + const mime = lastMimeType ?? ''; + if (mime.includes('webm')) return 'webm'; + if (mime.includes('ogg')) return 'ogg'; + if (mime.includes('mp4')) return 'm4a'; + return 'webm'; +} diff --git a/app/src/features/voice/pttChimes.ts b/app/src/features/voice/pttChimes.ts new file mode 100644 index 0000000000..dc7c7b22ce --- /dev/null +++ b/app/src/features/voice/pttChimes.ts @@ -0,0 +1,42 @@ +/** + * pttChimes — short audio cue playback for push-to-talk session boundaries. + * + * The three WAVs (open/close/error) live in `app/src/assets/audio/`. Vite + * resolves binary assets imported with a string URL out-of-the-box, so a + * standard `import openSrc from '...wav'` returns a URL the browser can fetch. + * + * HTMLAudioElement instances are cached per kind so repeat playback doesn't + * re-decode the WAV on every press. `play()` may reject under the autoplay + * policy (no user gesture yet) — we swallow that since PTT is triggered by + * a global hotkey, not a click, and the chime is non-critical. + */ +import closeSrc from '../../assets/audio/ptt-close.wav'; +import errorSrc from '../../assets/audio/ptt-error.wav'; +import openSrc from '../../assets/audio/ptt-open.wav'; + +export type ChimeKind = 'open' | 'close' | 'error'; + +const sources: Record = { open: openSrc, close: closeSrc, error: errorSrc }; + +const cache: Partial> = {}; + +function getElement(kind: ChimeKind): HTMLAudioElement { + const cached = cache[kind]; + if (cached) return cached; + const el = new window.Audio(sources[kind]); + el.preload = 'auto'; + cache[kind] = el; + return el; +} + +export async function playPttChime(kind: ChimeKind): Promise { + try { + const el = getElement(kind); + el.currentTime = 0; + await el.play(); + } catch (err) { + // Autoplay policy can reject silently for the first chime if no gesture + // has been observed. PTT is non-critical UX feedback so we just log. + console.debug('[ptt-chime] play failed', { kind, err: String(err) }); + } +} diff --git a/app/src/features/voice/pttThread.ts b/app/src/features/voice/pttThread.ts new file mode 100644 index 0000000000..9b7c0a6409 --- /dev/null +++ b/app/src/features/voice/pttThread.ts @@ -0,0 +1,26 @@ +/** + * pttThread — thread-resolution adapter for pttService. + * + * Resolves which thread a PTT-captured message lands in: + * 1. The currently-selected thread (state.thread.selectedThreadId) if any. + * 2. Otherwise create a fresh thread via `threads_create_new`. + * + * Keeping this in its own module keeps `PttHotkeyManager` declarative — the + * service interface only needs two thunks (`resolveActiveThreadId`, + * `createNewVoiceThread`), and the redux access stays out of React render + * scope. + */ +import { threadApi } from '../../services/api/threadApi'; +import { store } from '../../store'; + +export async function resolveActiveThreadId(): Promise { + const state = store.getState(); + return state.thread.selectedThreadId ?? null; +} + +export async function createNewVoiceThread(): Promise { + // No special "voice" label yet — the core auto-generates a title from the + // first user message, which gives a useful label for PTT sessions too. + const thread = await threadApi.createNewThread(); + return thread.id; +} diff --git a/app/src/features/voice/pttTranscribe.ts b/app/src/features/voice/pttTranscribe.ts new file mode 100644 index 0000000000..92c4613091 --- /dev/null +++ b/app/src/features/voice/pttTranscribe.ts @@ -0,0 +1,47 @@ +/** + * pttTranscribe — speech-to-text adapter for pttService. + * + * Reuses the existing `openhuman.voice_transcribe_bytes` RPC (see + * `src/openhuman/voice/ops.rs`). The Rust side handles cloud + whisper.cpp + * routing based on the user's `stt_provider` setting and applies optional + * LLM cleanup, so the renderer only needs to push raw bytes. + * + * The `extension` hint comes from `pttAudio.lastRecordedExtension()` — + * MediaRecorder negotiates webm/opus on every modern desktop browser. + */ +import { openhumanVoiceTranscribeBytes } from '../../utils/tauriCommands/voice'; +import { lastRecordedExtension } from './pttAudio'; + +/** + * Encode the buffer as a byte array for JSON-RPC transport. The wire format + * expects `Vec` deserialized from a number array; serde-json doesn't + * support binary natively over JSON-RPC. + * + * This is O(N) memory and CPU. For a 10s @ ~16 kbps opus blob (~20 KB) it's + * cheap; if PTT recordings grow past ~5 MB we should swap to base64 or a + * dedicated upload endpoint. + */ +function bufferToByteArray(buf: ArrayBuffer): number[] { + const view = new Uint8Array(buf); + const out = new Array(view.byteLength); + for (let i = 0; i < view.byteLength; i++) { + out[i] = view[i]; + } + return out; +} + +export async function transcribePttAudio(buf: ArrayBuffer): Promise { + if (buf.byteLength === 0) return ''; + const extension = lastRecordedExtension(); + const bytes = bufferToByteArray(buf); + const result = await openhumanVoiceTranscribeBytes( + bytes, + extension, + /* context */ undefined, + /* skipCleanup */ false + ); + // `result.text` is the cleaned-up version (LLM-polished when enabled); + // `raw_text` is the unfiltered whisper output. Prefer text but fall back. + const text = (result?.text ?? result?.raw_text ?? '').trim(); + return text; +} diff --git a/app/src/hooks/usePttHotkey.ts b/app/src/hooks/usePttHotkey.ts new file mode 100644 index 0000000000..be65d3e9e7 --- /dev/null +++ b/app/src/hooks/usePttHotkey.ts @@ -0,0 +1,52 @@ +/** + * usePttHotkey + * + * Subscribes the configured push-to-talk shortcut to the Tauri shell whenever + * the persisted `shortcut` field on the `ptt` slice changes. Resets the + * transient `isHeld` flag on mount so a stale rehydrated value (left over from + * a crash mid-press) can never leave the UI thinking the PTT key is held. + * + * Wired into the renderer once via `PttHotkeyManager` (T11), mounted in + * `App.tsx` alongside the dictation manager. + */ +import { useEffect } from 'react'; +import { useDispatch, useSelector } from 'react-redux'; + +import { selectPttShortcut, setIsHeld, setPttRegistrationError } from '../store/pttSlice'; +import { registerPttHotkey, unregisterPttHotkey } from '../utils/tauriCommands/ptt'; + +export function usePttHotkey(): void { + const dispatch = useDispatch(); + const shortcut = useSelector(selectPttShortcut); + + // Clear the transient isHeld flag on mount — a crash mid-press could + // otherwise rehydrate to "held forever". + useEffect(() => { + dispatch(setIsHeld(false)); + }, [dispatch]); + + useEffect(() => { + let cancelled = false; + const apply = async () => { + try { + if (shortcut && shortcut.trim().length > 0) { + await registerPttHotkey(shortcut); + if (!cancelled) dispatch(setPttRegistrationError(null)); + } else { + await unregisterPttHotkey(); + if (!cancelled) dispatch(setPttRegistrationError(null)); + } + } catch (err) { + if (!cancelled) { + const msg = err instanceof Error ? err.message : String(err); + console.warn('[ptt] hotkey (un)register failed', err); + dispatch(setPttRegistrationError(msg)); + } + } + }; + void apply(); + return () => { + cancelled = true; + }; + }, [shortcut, dispatch]); +} diff --git a/app/src/lib/i18n/ar.ts b/app/src/lib/i18n/ar.ts index 5f8d622ae3..355d16fe2a 100644 --- a/app/src/lib/i18n/ar.ts +++ b/app/src/lib/i18n/ar.ts @@ -1516,6 +1516,28 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'مفتاح API', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + 'pttSettings.title': 'اضغط للتحدث', + 'pttSettings.description': + 'اضغط مفتاحًا باستمرار للتحدث إلى OpenHuman أثناء وجودك في تطبيق آخر. تحرير المفتاح يُرسل التسجيل؛ إذا كان «قراءة الردود» مفعَّلًا، ينطق OpenHuman الرد بصوت عالٍ.', + 'pttSettings.shortcutLabel': 'اختصار المفتاح', + 'pttSettings.shortcutPlaceholder': 'اضغط مفتاحًا (مثل F13)', + 'pttSettings.shortcutUnsetHint': 'اضغط للتحدث متوقف — اختر اختصارًا لتفعيله.', + 'pttSettings.speakRepliesLabel': 'نطق ردود الوكيل', + 'pttSettings.showOverlayLabel': 'إظهار التراكب أثناء الضغط', + 'pttSettings.errorConflictsWithDictation': + 'هذا الاختصار مُستخدم بالفعل لخاصية الإملاء. اختر مفتاحًا مختلفًا.', + 'pttSettings.errorModifierOnly': + 'اختر مفتاحًا عاديًا (مثل F13) — الاختصارات المكونة من مفاتيح تعديل فقط لا تعمل مع اضغط للتحدث.', + 'pttSettings.errorEmpty': 'اختر مفتاحًا للربط.', + 'pttSettings.errorAccessibility': + 'يحتاج macOS إلى إذن إمكانية الوصول لهذا الاختصار. افتح إعدادات النظام ← الخصوصية والأمان ← إمكانية الوصول وفعّل OpenHuman.', + 'pttSettings.errorShortcutInUse': 'يستخدم تطبيق آخر هذا الاختصار بالفعل. اختر اختصارًا مختلفًا.', + 'pttSettings.errorUnsupportedWayland': + 'جلسات Wayland لا تدعم بعد الاختصارات العامة في OpenHuman — انتقل إلى جلسة X11 أو استخدم زر الإملاء داخل التطبيق.', + 'pttSettings.exclusiveFullscreenHint': + 'في ألعاب وضع ملء الشاشة الحصري لن يظهر التراكب — ستسمع التنبيه الصوتي فقط. انتقل إلى وضع ملء الشاشة بلا إطار لرؤية التراكب.', + 'pttOverlay.listening': 'يستمع…', + 'pttOverlay.idle': 'في الانتظار', 'autocomplete.title': 'الإكمال التلقائي', 'autocomplete.settings': 'الإعدادات', 'autocomplete.acceptWithTab': 'قبول بـ Tab', diff --git a/app/src/lib/i18n/bn.ts b/app/src/lib/i18n/bn.ts index c6266dc76f..5b45f4b712 100644 --- a/app/src/lib/i18n/bn.ts +++ b/app/src/lib/i18n/bn.ts @@ -1547,6 +1547,30 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'API কী', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + 'pttSettings.title': 'চাপ দিয়ে কথা বলুন', + 'pttSettings.description': + 'অন্য একটি অ্যাপে থাকার সময় OpenHuman-এর সাথে কথা বলতে একটি কী চেপে ধরে রাখুন। কী ছেড়ে দিলে রেকর্ডিং পাঠানো হয়; «উত্তর পড়ে শোনাও» চালু থাকলে OpenHuman উত্তরটি জোরে পড়ে শোনায়।', + 'pttSettings.shortcutLabel': 'হটকি', + 'pttSettings.shortcutPlaceholder': 'একটি কী চাপুন (যেমন F13)', + 'pttSettings.shortcutUnsetHint': + 'চাপ দিয়ে কথা বলুন বন্ধ আছে — চালু করতে একটি হটকি নির্বাচন করুন।', + 'pttSettings.speakRepliesLabel': 'এজেন্টের উত্তর জোরে পড়ুন', + 'pttSettings.showOverlayLabel': 'চেপে ধরা অবস্থায় ওভারলে দেখান', + 'pttSettings.errorConflictsWithDictation': + 'এই শর্টকাটটি ইতিমধ্যে ডিকটেশনের জন্য ব্যবহৃত হচ্ছে। অন্য একটি কী বেছে নিন।', + 'pttSettings.errorModifierOnly': + 'একটি সাধারণ কী বেছে নিন (যেমন F13) — শুধু মডিফায়ার নিয়ে গঠিত শর্টকাট চাপ দিয়ে কথা বলুনের জন্য কাজ করে না।', + 'pttSettings.errorEmpty': 'বাঁধার জন্য একটি কী বেছে নিন।', + 'pttSettings.errorAccessibility': + 'এই শর্টকাটের জন্য macOS-এর অ্যাক্সেসিবিলিটি অনুমতি দরকার। System Settings → Privacy & Security → Accessibility খুলুন এবং OpenHuman চালু করুন।', + 'pttSettings.errorShortcutInUse': + 'অন্য একটি অ্যাপ ইতিমধ্যে এই শর্টকাট ব্যবহার করছে। ভিন্ন একটি বেছে নিন।', + 'pttSettings.errorUnsupportedWayland': + 'Wayland সেশন এখনও OpenHuman-এ গ্লোবাল শর্টকাট সমর্থন করে না — একটি X11 সেশনে চলে যান অথবা অ্যাপের ভেতরের ডিকটেশন টগল ব্যবহার করুন।', + 'pttSettings.exclusiveFullscreenHint': + 'এক্সক্লুসিভ ফুলস্ক্রিন গেমে ওভারলে রেন্ডার হবে না — আপনি শুধু সাউন্ড সংকেতটি শুনতে পাবেন। ওভারলে দেখতে বর্ডারলেস ফুলস্ক্রিনে যান।', + 'pttOverlay.listening': 'শুনছে…', + 'pttOverlay.idle': 'অপেক্ষায়', 'autocomplete.title': 'অটোকমপ্লিট', 'autocomplete.settings': 'সেটিংস', 'autocomplete.acceptWithTab': 'Tab দিয়ে গ্রহণ করুন', diff --git a/app/src/lib/i18n/de.ts b/app/src/lib/i18n/de.ts index a8ebccfebd..1232724681 100644 --- a/app/src/lib/i18n/de.ts +++ b/app/src/lib/i18n/de.ts @@ -1587,6 +1587,30 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'API-Schlüssel', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + 'pttSettings.title': 'Push-to-Talk', + 'pttSettings.description': + 'Halte eine Taste gedrückt, um mit OpenHuman zu sprechen, während du eine andere App nutzt. Loslassen sendet die Aufnahme; wenn „Antworten vorlesen" aktiviert ist, antwortet OpenHuman per Sprachausgabe.', + 'pttSettings.shortcutLabel': 'Tastenkürzel', + 'pttSettings.shortcutPlaceholder': 'Taste drücken (z. B. F13)', + 'pttSettings.shortcutUnsetHint': + 'Push-to-Talk ist aus — wähle ein Tastenkürzel, um es zu aktivieren.', + 'pttSettings.speakRepliesLabel': 'Antworten des Agenten vorlesen', + 'pttSettings.showOverlayLabel': 'Overlay während des Haltens anzeigen', + 'pttSettings.errorConflictsWithDictation': + 'Dieses Tastenkürzel wird bereits für die Diktierfunktion verwendet. Wähle eine andere Taste.', + 'pttSettings.errorModifierOnly': + 'Wähle eine normale Taste (z. B. F13) — reine Modifikator-Kürzel funktionieren bei Push-to-Talk nicht.', + 'pttSettings.errorEmpty': 'Wähle eine Taste zum Binden.', + 'pttSettings.errorAccessibility': + 'macOS benötigt für dieses Tastenkürzel die Bedienungshilfen-Berechtigung. Öffne Systemeinstellungen → Datenschutz & Sicherheit → Bedienungshilfen und aktiviere OpenHuman.', + 'pttSettings.errorShortcutInUse': + 'Eine andere App verwendet dieses Tastenkürzel bereits. Wähle ein anderes.', + 'pttSettings.errorUnsupportedWayland': + 'Wayland-Sitzungen unterstützen globale Tastenkürzel in OpenHuman noch nicht — wechsle zu einer X11-Sitzung oder nutze den In-App-Diktierschalter.', + 'pttSettings.exclusiveFullscreenHint': + 'In exklusivem Vollbild rendert das Overlay nicht — du hörst nur den Hinweiston. Wechsle zu randlosem Vollbild für das Overlay.', + 'pttOverlay.listening': 'Höre zu…', + 'pttOverlay.idle': 'Bereit', 'autocomplete.title': 'Automatische Vervollständigung', 'autocomplete.settings': 'Einstellungen', 'autocomplete.acceptWithTab': 'Mit Tab akzeptieren', diff --git a/app/src/lib/i18n/en.ts b/app/src/lib/i18n/en.ts index 0d62339eab..6ab0e6b067 100644 --- a/app/src/lib/i18n/en.ts +++ b/app/src/lib/i18n/en.ts @@ -1898,6 +1898,30 @@ const en: TranslationMap = { 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + // Push-to-talk (PTT) + 'pttSettings.title': 'Push-to-talk', + 'pttSettings.description': + "Hold a key to talk to OpenHuman while you're in another app. Release the key to send; OpenHuman speaks the reply if 'Speak agent replies' is on.", + 'pttSettings.shortcutLabel': 'Hotkey', + 'pttSettings.shortcutPlaceholder': 'Press a key (e.g. F13)', + 'pttSettings.shortcutUnsetHint': 'Push-to-talk is off — pick a hotkey to enable.', + 'pttSettings.speakRepliesLabel': 'Speak agent replies', + 'pttSettings.showOverlayLabel': 'Show overlay while held', + 'pttSettings.errorConflictsWithDictation': + 'This shortcut is already used by dictation. Pick a different key.', + 'pttSettings.errorModifierOnly': + "Pick a regular key (e.g. F13) — modifier-only shortcuts don't work for push-to-talk.", + 'pttSettings.errorEmpty': 'Pick a key to bind.', + 'pttSettings.errorAccessibility': + 'macOS needs Accessibility permission for this shortcut. Open System Settings → Privacy & Security → Accessibility and enable OpenHuman.', + 'pttSettings.errorShortcutInUse': 'Another app already uses this shortcut. Pick a different one.', + 'pttSettings.errorUnsupportedWayland': + "Wayland sessions don't support global shortcuts in OpenHuman yet — switch to an X11 session or use the in-app dictation toggle.", + 'pttSettings.exclusiveFullscreenHint': + "In exclusive-fullscreen games the overlay won't render — you'll only hear the chime. Switch to borderless fullscreen for the overlay.", + 'pttOverlay.listening': 'Listening…', + 'pttOverlay.idle': 'Idle', + // Autocomplete 'autocomplete.title': 'Autocomplete', 'autocomplete.settings': 'Settings', diff --git a/app/src/lib/i18n/es.ts b/app/src/lib/i18n/es.ts index cdf6a62a87..60ae57da3f 100644 --- a/app/src/lib/i18n/es.ts +++ b/app/src/lib/i18n/es.ts @@ -1581,6 +1581,29 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'Clave de API', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + 'pttSettings.title': 'Pulsa para hablar', + 'pttSettings.description': + 'Mantén pulsada una tecla para hablar con OpenHuman mientras estás en otra aplicación. Al soltarla se envía la grabación; si tienes activada la opción «Leer las respuestas», OpenHuman las dice en voz alta.', + 'pttSettings.shortcutLabel': 'Atajo de teclado', + 'pttSettings.shortcutPlaceholder': 'Pulsa una tecla (p. ej. F13)', + 'pttSettings.shortcutUnsetHint': + 'Pulsa para hablar está desactivado — elige un atajo para activarlo.', + 'pttSettings.speakRepliesLabel': 'Leer las respuestas del agente en voz alta', + 'pttSettings.showOverlayLabel': 'Mostrar el panel mientras se mantiene pulsado', + 'pttSettings.errorConflictsWithDictation': + 'Este atajo ya lo usa el dictado. Elige una tecla distinta.', + 'pttSettings.errorModifierOnly': + 'Elige una tecla normal (p. ej. F13) — los atajos compuestos solo por modificadores no funcionan para pulsa para hablar.', + 'pttSettings.errorEmpty': 'Elige una tecla para asignarla.', + 'pttSettings.errorAccessibility': + 'macOS necesita permiso de Accesibilidad para este atajo. Abre Ajustes del sistema → Privacidad y seguridad → Accesibilidad y activa OpenHuman.', + 'pttSettings.errorShortcutInUse': 'Otra aplicación ya usa este atajo. Elige uno distinto.', + 'pttSettings.errorUnsupportedWayland': + 'Las sesiones Wayland todavía no admiten atajos globales en OpenHuman — cambia a una sesión X11 o usa el botón de dictado dentro de la aplicación.', + 'pttSettings.exclusiveFullscreenHint': + 'En juegos a pantalla completa exclusiva el panel no se mostrará — solo oirás el aviso sonoro. Cambia a pantalla completa sin bordes para ver el panel.', + 'pttOverlay.listening': 'Escuchando…', + 'pttOverlay.idle': 'En espera', 'autocomplete.title': 'Autocompletado', 'autocomplete.settings': 'Configuración', 'autocomplete.acceptWithTab': 'Aceptar con Tab', diff --git a/app/src/lib/i18n/fr.ts b/app/src/lib/i18n/fr.ts index eab483af68..afeba903a0 100644 --- a/app/src/lib/i18n/fr.ts +++ b/app/src/lib/i18n/fr.ts @@ -1585,6 +1585,30 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'Clé API', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + 'pttSettings.title': 'Appuyer pour parler', + 'pttSettings.description': + 'Maintiens une touche pour parler à OpenHuman pendant que tu utilises une autre application. Relâcher envoie l’enregistrement ; si « Lire les réponses » est activé, OpenHuman lit la réponse à voix haute.', + 'pttSettings.shortcutLabel': 'Raccourci', + 'pttSettings.shortcutPlaceholder': 'Appuie sur une touche (par exemple F13)', + 'pttSettings.shortcutUnsetHint': + 'Appuyer pour parler est désactivé — choisis un raccourci pour l’activer.', + 'pttSettings.speakRepliesLabel': 'Lire les réponses de l’agent à voix haute', + 'pttSettings.showOverlayLabel': 'Afficher la surcouche pendant l’appui', + 'pttSettings.errorConflictsWithDictation': + 'Ce raccourci est déjà utilisé par la dictée. Choisis une autre touche.', + 'pttSettings.errorModifierOnly': + 'Choisis une touche ordinaire (par exemple F13) — les raccourcis composés uniquement de modificateurs ne fonctionnent pas pour appuyer pour parler.', + 'pttSettings.errorEmpty': 'Choisis une touche à associer.', + 'pttSettings.errorAccessibility': + 'macOS exige l’autorisation Accessibilité pour ce raccourci. Ouvre Réglages système → Confidentialité et sécurité → Accessibilité et active OpenHuman.', + 'pttSettings.errorShortcutInUse': + 'Une autre application utilise déjà ce raccourci. Choisis-en un autre.', + 'pttSettings.errorUnsupportedWayland': + 'Les sessions Wayland ne prennent pas encore en charge les raccourcis globaux dans OpenHuman — passe à une session X11 ou utilise la commande de dictée intégrée à l’application.', + 'pttSettings.exclusiveFullscreenHint': + 'En plein écran exclusif des jeux, la surcouche ne s’affichera pas — tu entendras seulement le son. Passe en plein écran sans bordures pour voir la surcouche.', + 'pttOverlay.listening': 'À l’écoute…', + 'pttOverlay.idle': 'En attente', 'autocomplete.title': 'Autocomplétion', 'autocomplete.settings': 'Paramètres', 'autocomplete.acceptWithTab': 'Accepter avec Tab', diff --git a/app/src/lib/i18n/hi.ts b/app/src/lib/i18n/hi.ts index 3b14215082..94572fc8f3 100644 --- a/app/src/lib/i18n/hi.ts +++ b/app/src/lib/i18n/hi.ts @@ -1546,6 +1546,29 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'API कुंजी', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + 'pttSettings.title': 'दबाकर बोलें', + 'pttSettings.description': + 'जब आप किसी दूसरे ऐप में हों तब OpenHuman से बात करने के लिए कोई कुंजी दबाए रखें। कुंजी छोड़ने पर रिकॉर्डिंग भेजी जाती है; अगर «उत्तर बोलकर सुनाएँ» चालू है तो OpenHuman उत्तर बोलकर सुनाता है।', + 'pttSettings.shortcutLabel': 'हॉटकी', + 'pttSettings.shortcutPlaceholder': 'कोई कुंजी दबाएँ (जैसे F13)', + 'pttSettings.shortcutUnsetHint': 'दबाकर बोलें बंद है — चालू करने के लिए कोई हॉटकी चुनें।', + 'pttSettings.speakRepliesLabel': 'एजेंट के उत्तर ज़ोर से सुनाएँ', + 'pttSettings.showOverlayLabel': 'दबाए रखने के दौरान ओवरले दिखाएँ', + 'pttSettings.errorConflictsWithDictation': + 'यह शॉर्टकट पहले से डिक्टेशन में उपयोग हो रहा है। कोई दूसरी कुंजी चुनें।', + 'pttSettings.errorModifierOnly': + 'कोई सामान्य कुंजी चुनें (जैसे F13) — केवल मॉडिफ़ायर वाले शॉर्टकट दबाकर बोलें के लिए काम नहीं करते।', + 'pttSettings.errorEmpty': 'बाँधने के लिए कोई कुंजी चुनें।', + 'pttSettings.errorAccessibility': + 'इस शॉर्टकट के लिए macOS को एक्सेसिबिलिटी अनुमति चाहिए। System Settings → Privacy & Security → Accessibility खोलें और OpenHuman को सक्षम करें।', + 'pttSettings.errorShortcutInUse': + 'कोई दूसरा ऐप पहले से इस शॉर्टकट का उपयोग कर रहा है। कोई दूसरा चुनें।', + 'pttSettings.errorUnsupportedWayland': + 'Wayland सत्र अभी OpenHuman में ग्लोबल शॉर्टकट का समर्थन नहीं करते — X11 सत्र पर जाएँ या ऐप के अंदर डिक्टेशन टॉगल का उपयोग करें।', + 'pttSettings.exclusiveFullscreenHint': + 'एक्सक्लूसिव फुलस्क्रीन गेम्स में ओवरले प्रदर्शित नहीं होगा — आपको केवल चाइम सुनाई देगा। ओवरले देखने के लिए बॉर्डरलेस फुलस्क्रीन पर जाएँ।', + 'pttOverlay.listening': 'सुन रहा है…', + 'pttOverlay.idle': 'निष्क्रिय', 'autocomplete.title': 'ऑटोकम्पलीट', 'autocomplete.settings': 'सेटिंग्स', 'autocomplete.acceptWithTab': 'Tab से एक्सेप्ट करें', diff --git a/app/src/lib/i18n/id.ts b/app/src/lib/i18n/id.ts index 3cee150cbe..002a9e2b6e 100644 --- a/app/src/lib/i18n/id.ts +++ b/app/src/lib/i18n/id.ts @@ -1551,6 +1551,29 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'Kunci API', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + 'pttSettings.title': 'Tekan untuk bicara', + 'pttSettings.description': + 'Tahan sebuah tombol untuk berbicara dengan OpenHuman saat kamu sedang di aplikasi lain. Lepas tombol untuk mengirim; jika «Bacakan balasan» aktif, OpenHuman akan menyuarakan balasannya.', + 'pttSettings.shortcutLabel': 'Pintasan', + 'pttSettings.shortcutPlaceholder': 'Tekan sebuah tombol (mis. F13)', + 'pttSettings.shortcutUnsetHint': + 'Tekan untuk bicara mati — pilih pintasan untuk mengaktifkannya.', + 'pttSettings.speakRepliesLabel': 'Suarakan balasan agen', + 'pttSettings.showOverlayLabel': 'Tampilkan lapisan saat tombol ditahan', + 'pttSettings.errorConflictsWithDictation': + 'Pintasan ini sudah dipakai oleh dikte. Pilih tombol lain.', + 'pttSettings.errorModifierOnly': + 'Pilih tombol biasa (mis. F13) — pintasan hanya pengubah tidak berfungsi untuk tekan untuk bicara.', + 'pttSettings.errorEmpty': 'Pilih tombol untuk diikat.', + 'pttSettings.errorAccessibility': + 'macOS memerlukan izin Aksesibilitas untuk pintasan ini. Buka Pengaturan Sistem → Privasi & Keamanan → Aksesibilitas lalu aktifkan OpenHuman.', + 'pttSettings.errorShortcutInUse': 'Aplikasi lain sudah memakai pintasan ini. Pilih yang lain.', + 'pttSettings.errorUnsupportedWayland': + 'Sesi Wayland belum mendukung pintasan global di OpenHuman — beralihlah ke sesi X11 atau gunakan tombol dikte di dalam aplikasi.', + 'pttSettings.exclusiveFullscreenHint': + 'Pada game layar penuh eksklusif, lapisan tidak akan tampil — kamu hanya akan mendengar nada. Beralihlah ke layar penuh tanpa bingkai untuk melihat lapisan.', + 'pttOverlay.listening': 'Mendengarkan…', + 'pttOverlay.idle': 'Siaga', 'autocomplete.title': 'Pelengkap Otomatis', 'autocomplete.settings': 'Pengaturan', 'autocomplete.acceptWithTab': 'Terima dengan Tab', diff --git a/app/src/lib/i18n/it.ts b/app/src/lib/i18n/it.ts index d1b982189b..4332aafcff 100644 --- a/app/src/lib/i18n/it.ts +++ b/app/src/lib/i18n/it.ts @@ -1574,6 +1574,30 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'Chiave API', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + 'pttSettings.title': 'Premi per parlare', + 'pttSettings.description': + 'Tieni premuto un tasto per parlare con OpenHuman mentre sei in un’altra app. Al rilascio l’audio viene inviato; se «Leggi le risposte» è attivo, OpenHuman risponde a voce.', + 'pttSettings.shortcutLabel': 'Scorciatoia', + 'pttSettings.shortcutPlaceholder': 'Premi un tasto (es. F13)', + 'pttSettings.shortcutUnsetHint': + 'Premi per parlare è disattivato — scegli una scorciatoia per attivarlo.', + 'pttSettings.speakRepliesLabel': 'Pronuncia le risposte dell’agente', + 'pttSettings.showOverlayLabel': 'Mostra il riquadro mentre il tasto è premuto', + 'pttSettings.errorConflictsWithDictation': + 'Questa scorciatoia è già usata dalla dettatura. Scegli un tasto diverso.', + 'pttSettings.errorModifierOnly': + 'Scegli un tasto normale (es. F13) — le scorciatoie con soli modificatori non funzionano per premi per parlare.', + 'pttSettings.errorEmpty': 'Scegli un tasto da assegnare.', + 'pttSettings.errorAccessibility': + 'macOS richiede l’autorizzazione Accessibilità per questa scorciatoia. Apri Impostazioni di sistema → Privacy e sicurezza → Accessibilità e attiva OpenHuman.', + 'pttSettings.errorShortcutInUse': + 'Un’altra app utilizza già questa scorciatoia. Scegline una diversa.', + 'pttSettings.errorUnsupportedWayland': + 'Le sessioni Wayland non supportano ancora le scorciatoie globali in OpenHuman — passa a una sessione X11 o usa l’interruttore di dettatura nell’app.', + 'pttSettings.exclusiveFullscreenHint': + 'Nei giochi a schermo intero esclusivo il riquadro non verrà mostrato — sentirai solo il segnale acustico. Passa allo schermo intero senza bordi per vedere il riquadro.', + 'pttOverlay.listening': 'In ascolto…', + 'pttOverlay.idle': 'In attesa', 'autocomplete.title': 'Autocompletamento', 'autocomplete.settings': 'Impostazioni', 'autocomplete.acceptWithTab': 'Accetta con Tab', diff --git a/app/src/lib/i18n/ko.ts b/app/src/lib/i18n/ko.ts index 407e06db2a..1a14dc62ae 100644 --- a/app/src/lib/i18n/ko.ts +++ b/app/src/lib/i18n/ko.ts @@ -1531,6 +1531,30 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'API 키', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + 'pttSettings.title': '눌러서 말하기', + 'pttSettings.description': + "다른 앱을 사용하는 중에도 키를 누르고 있으면 OpenHuman과 대화할 수 있습니다. 키를 놓으면 녹음이 전송되고, '답변 읽어주기'가 켜져 있으면 OpenHuman이 답변을 음성으로 들려줍니다.", + 'pttSettings.shortcutLabel': '단축키', + 'pttSettings.shortcutPlaceholder': '키를 누르세요 (예: F13)', + 'pttSettings.shortcutUnsetHint': + '눌러서 말하기가 꺼져 있습니다 — 활성화하려면 단축키를 선택하세요.', + 'pttSettings.speakRepliesLabel': '에이전트 답변 음성으로 읽어주기', + 'pttSettings.showOverlayLabel': '누르고 있는 동안 오버레이 표시', + 'pttSettings.errorConflictsWithDictation': + '이 단축키는 받아쓰기에 이미 사용 중입니다. 다른 키를 선택하세요.', + 'pttSettings.errorModifierOnly': + '일반 키를 선택하세요(예: F13) — 보조 키로만 구성된 단축키는 눌러서 말하기에서 동작하지 않습니다.', + 'pttSettings.errorEmpty': '바인딩할 키를 선택하세요.', + 'pttSettings.errorAccessibility': + '이 단축키에는 macOS의 손쉬운 사용 권한이 필요합니다. 시스템 설정 → 개인정보 보호 및 보안 → 손쉬운 사용을 열고 OpenHuman을 활성화하세요.', + 'pttSettings.errorShortcutInUse': + '다른 앱이 이미 이 단축키를 사용 중입니다. 다른 단축키를 선택하세요.', + 'pttSettings.errorUnsupportedWayland': + 'Wayland 세션은 OpenHuman의 전역 단축키를 아직 지원하지 않습니다 — X11 세션으로 전환하거나 앱 내 받아쓰기 토글을 사용하세요.', + 'pttSettings.exclusiveFullscreenHint': + '전용 전체 화면 게임에서는 오버레이가 표시되지 않습니다 — 알림음만 들립니다. 오버레이를 보려면 테두리 없는 전체 화면으로 전환하세요.', + 'pttOverlay.listening': '듣는 중…', + 'pttOverlay.idle': '대기 중', 'autocomplete.title': '자동 완성', 'autocomplete.settings': '설정', 'autocomplete.acceptWithTab': 'Tab으로 수락', diff --git a/app/src/lib/i18n/pl.ts b/app/src/lib/i18n/pl.ts index bcbe481810..5b09601d7c 100644 --- a/app/src/lib/i18n/pl.ts +++ b/app/src/lib/i18n/pl.ts @@ -1567,6 +1567,29 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'Klucz API', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Dodaj', + 'pttSettings.title': 'Naciśnij, aby mówić', + 'pttSettings.description': + 'Przytrzymaj klawisz, aby mówić do OpenHuman, gdy korzystasz z innej aplikacji. Zwolnienie klawisza wysyła nagranie; jeśli opcja „Czytaj odpowiedzi" jest włączona, OpenHuman odczyta odpowiedź na głos.', + 'pttSettings.shortcutLabel': 'Skrót klawiszowy', + 'pttSettings.shortcutPlaceholder': 'Naciśnij klawisz (np. F13)', + 'pttSettings.shortcutUnsetHint': + 'Naciśnij, aby mówić jest wyłączone — wybierz skrót, aby je włączyć.', + 'pttSettings.speakRepliesLabel': 'Czytaj odpowiedzi agenta na głos', + 'pttSettings.showOverlayLabel': 'Pokazuj nakładkę podczas przytrzymania', + 'pttSettings.errorConflictsWithDictation': + 'Ten skrót jest już używany przez dyktowanie. Wybierz inny klawisz.', + 'pttSettings.errorModifierOnly': + 'Wybierz zwykły klawisz (np. F13) — skróty złożone tylko z modyfikatorów nie działają dla naciśnij, aby mówić.', + 'pttSettings.errorEmpty': 'Wybierz klawisz do przypisania.', + 'pttSettings.errorAccessibility': + 'macOS wymaga uprawnienia Dostępność dla tego skrótu. Otwórz Ustawienia systemowe → Prywatność i bezpieczeństwo → Dostępność i włącz OpenHuman.', + 'pttSettings.errorShortcutInUse': 'Inna aplikacja używa już tego skrótu. Wybierz inny.', + 'pttSettings.errorUnsupportedWayland': + 'Sesje Wayland nie obsługują jeszcze globalnych skrótów w OpenHuman — przełącz się na sesję X11 lub użyj przełącznika dyktowania w aplikacji.', + 'pttSettings.exclusiveFullscreenHint': + 'W grach na wyłącznym pełnym ekranie nakładka nie zostanie wyświetlona — usłyszysz tylko sygnał dźwiękowy. Przełącz na pełny ekran bez ramki, aby zobaczyć nakładkę.', + 'pttOverlay.listening': 'Słucham…', + 'pttOverlay.idle': 'Gotowy', 'autocomplete.title': 'Autouzupełnianie', 'autocomplete.settings': 'Ustawienia', 'autocomplete.acceptWithTab': 'Akceptuj Tabem', diff --git a/app/src/lib/i18n/pt.ts b/app/src/lib/i18n/pt.ts index ee2d6b6bde..f3abbc56b3 100644 --- a/app/src/lib/i18n/pt.ts +++ b/app/src/lib/i18n/pt.ts @@ -1580,6 +1580,29 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'Chave de API', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + 'pttSettings.title': 'Pressionar para falar', + 'pttSettings.description': + 'Mantenha uma tecla pressionada para falar com o OpenHuman enquanto está noutro aplicativo. Soltar a tecla envia o áudio; se «Ler respostas em voz alta» estiver ativo, o OpenHuman lê a resposta.', + 'pttSettings.shortcutLabel': 'Atalho', + 'pttSettings.shortcutPlaceholder': 'Pressione uma tecla (por exemplo, F13)', + 'pttSettings.shortcutUnsetHint': + 'Pressionar para falar está desligado — escolha um atalho para ativar.', + 'pttSettings.speakRepliesLabel': 'Ler as respostas do agente em voz alta', + 'pttSettings.showOverlayLabel': 'Mostrar a sobreposição enquanto a tecla está pressionada', + 'pttSettings.errorConflictsWithDictation': + 'Este atalho já é usado pelo ditado. Escolha uma tecla diferente.', + 'pttSettings.errorModifierOnly': + 'Escolha uma tecla normal (por exemplo, F13) — atalhos apenas com modificadores não funcionam para pressionar para falar.', + 'pttSettings.errorEmpty': 'Escolha uma tecla para vincular.', + 'pttSettings.errorAccessibility': + 'O macOS precisa de permissão de Acessibilidade para este atalho. Abra Ajustes do Sistema → Privacidade e Segurança → Acessibilidade e ative o OpenHuman.', + 'pttSettings.errorShortcutInUse': 'Outro aplicativo já está a usar este atalho. Escolha outro.', + 'pttSettings.errorUnsupportedWayland': + 'As sessões Wayland ainda não suportam atalhos globais no OpenHuman — mude para uma sessão X11 ou use o controlo de ditado integrado no aplicativo.', + 'pttSettings.exclusiveFullscreenHint': + 'Em jogos no modo de ecrã inteiro exclusivo a sobreposição não será apresentada — só ouvirá o aviso sonoro. Mude para ecrã inteiro sem margens para ver a sobreposição.', + 'pttOverlay.listening': 'A escutar…', + 'pttOverlay.idle': 'Inativo', 'autocomplete.title': 'Autocompletar', 'autocomplete.settings': 'Configurações', 'autocomplete.acceptWithTab': 'Aceitar com Tab', diff --git a/app/src/lib/i18n/ru.ts b/app/src/lib/i18n/ru.ts index df32dbabf9..471aa57309 100644 --- a/app/src/lib/i18n/ru.ts +++ b/app/src/lib/i18n/ru.ts @@ -1559,6 +1559,30 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'API-ключ', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + 'pttSettings.title': 'Нажми и говори', + 'pttSettings.description': + 'Удерживайте клавишу, чтобы говорить с OpenHuman, пока вы находитесь в другом приложении. При отпускании запись отправляется; если включён параметр «Озвучивать ответы», OpenHuman озвучит ответ.', + 'pttSettings.shortcutLabel': 'Сочетание клавиш', + 'pttSettings.shortcutPlaceholder': 'Нажмите клавишу (например, F13)', + 'pttSettings.shortcutUnsetHint': + 'Нажми и говори выключено — выберите сочетание клавиш, чтобы включить.', + 'pttSettings.speakRepliesLabel': 'Озвучивать ответы агента', + 'pttSettings.showOverlayLabel': 'Показывать наложение во время удержания', + 'pttSettings.errorConflictsWithDictation': + 'Это сочетание уже используется диктовкой. Выберите другую клавишу.', + 'pttSettings.errorModifierOnly': + 'Выберите обычную клавишу (например, F13) — сочетания только из модификаторов не работают для «нажми и говори».', + 'pttSettings.errorEmpty': 'Выберите клавишу для назначения.', + 'pttSettings.errorAccessibility': + 'macOS требует разрешения «Универсальный доступ» для этого сочетания. Откройте Системные настройки → Конфиденциальность и безопасность → Универсальный доступ и включите OpenHuman.', + 'pttSettings.errorShortcutInUse': + 'Это сочетание уже использует другое приложение. Выберите другое.', + 'pttSettings.errorUnsupportedWayland': + 'Сессии Wayland пока не поддерживают глобальные сочетания клавиш в OpenHuman — перейдите на сессию X11 или используйте встроенный переключатель диктовки.', + 'pttSettings.exclusiveFullscreenHint': + 'В играх с эксклюзивным полноэкранным режимом наложение не отобразится — вы услышите только звуковой сигнал. Переключитесь на оконный полноэкранный режим, чтобы видеть наложение.', + 'pttOverlay.listening': 'Слушаю…', + 'pttOverlay.idle': 'Ожидание', 'autocomplete.title': 'Автодополнение', 'autocomplete.settings': 'Настройки', 'autocomplete.acceptWithTab': 'Принять с помощью Tab', diff --git a/app/src/lib/i18n/zh-CN.ts b/app/src/lib/i18n/zh-CN.ts index 81c2affafc..498d7ae1ce 100644 --- a/app/src/lib/i18n/zh-CN.ts +++ b/app/src/lib/i18n/zh-CN.ts @@ -1464,6 +1464,27 @@ const messages: TranslationMap = { 'voice.externalProviders.apiKey': 'API 密钥', 'voice.externalProviders.apiKeyPlaceholder': 'sk-…', 'voice.externalProviders.add': 'Add', + 'pttSettings.title': '按住说话', + 'pttSettings.description': + '在其他应用中时,按住按键即可与 OpenHuman 对话。松开按键发送录音;若已开启「朗读回复」,OpenHuman 会用语音读出回复。', + 'pttSettings.shortcutLabel': '快捷键', + 'pttSettings.shortcutPlaceholder': '按下一个键(例如 F13)', + 'pttSettings.shortcutUnsetHint': '按住说话已关闭 — 请选择一个快捷键来启用。', + 'pttSettings.speakRepliesLabel': '朗读智能体的回复', + 'pttSettings.showOverlayLabel': '按住时显示悬浮层', + 'pttSettings.errorConflictsWithDictation': '该快捷键已被听写功能占用。请选择其他按键。', + 'pttSettings.errorModifierOnly': + '请选择一个常规按键(例如 F13)— 仅由修饰键组成的快捷键无法用于按住说话。', + 'pttSettings.errorEmpty': '请选择要绑定的按键。', + 'pttSettings.errorAccessibility': + '此快捷键需要 macOS 的辅助功能权限。请打开系统设置 → 隐私与安全 → 辅助功能,并启用 OpenHuman。', + 'pttSettings.errorShortcutInUse': '该快捷键已被其他应用占用。请选择其他快捷键。', + 'pttSettings.errorUnsupportedWayland': + 'Wayland 会话尚不支持 OpenHuman 的全局快捷键 — 请切换到 X11 会话,或使用应用内的听写开关。', + 'pttSettings.exclusiveFullscreenHint': + '在独占式全屏游戏中悬浮层不会显示 — 你只会听到提示音。切换到无边框全屏即可看到悬浮层。', + 'pttOverlay.listening': '正在聆听…', + 'pttOverlay.idle': '空闲', 'autocomplete.title': '自动补全', 'autocomplete.settings': '设置', 'autocomplete.acceptWithTab': 'Tab 键接受', diff --git a/app/src/pages/PttOverlayPage.test.tsx b/app/src/pages/PttOverlayPage.test.tsx new file mode 100644 index 0000000000..a3b8b7f7d0 --- /dev/null +++ b/app/src/pages/PttOverlayPage.test.tsx @@ -0,0 +1,35 @@ +import { act, render, screen } from '@testing-library/react'; +import { describe, expect, it, vi } from 'vitest'; + +import { PttOverlayPage } from './PttOverlayPage'; + +// Mock @tauri-apps/api/event's listen so we can dispatch fake events. +vi.mock('@tauri-apps/api/event', () => { + const handlers: Record void> = {}; + return { + listen: vi.fn(async (name: string, handler: (e: { payload: unknown }) => void) => { + handlers[name] = handler; + return () => delete handlers[name]; + }), + __dispatch: (name: string, payload: unknown) => handlers[name]?.({ payload }), + }; +}); + +describe('PttOverlayPage', () => { + it('renders idle state by default', () => { + render(); + expect(screen.getByTestId('ptt-overlay-root')).toHaveAttribute('data-active', 'false'); + }); + + it('flips to active when ptt-overlay://active fires with active=true', async () => { + render(); + const evt = await import('@tauri-apps/api/event'); + await act(async () => { + (evt as unknown as { __dispatch: (n: string, p: unknown) => void }).__dispatch( + 'ptt-overlay://active', + { active: true, session_id: 1 } + ); + }); + expect(screen.getByTestId('ptt-overlay-root')).toHaveAttribute('data-active', 'true'); + }); +}); diff --git a/app/src/pages/PttOverlayPage.tsx b/app/src/pages/PttOverlayPage.tsx new file mode 100644 index 0000000000..ef9fd828f6 --- /dev/null +++ b/app/src/pages/PttOverlayPage.tsx @@ -0,0 +1,61 @@ +import { listen, type UnlistenFn } from '@tauri-apps/api/event'; +import { useEffect, useState } from 'react'; + +import { useT } from '../lib/i18n/I18nContext'; + +export function PttOverlayPage() { + const { t } = useT(); + const [active, setActive] = useState(false); + + useEffect(() => { + let off: UnlistenFn | undefined; + let cancelled = false; + listen<{ active: boolean }>('ptt-overlay://active', e => { + setActive(Boolean(e.payload?.active)); + }) + .then(fn => { + if (cancelled) fn(); + else off = fn; + }) + .catch(() => {}); + return () => { + cancelled = true; + off?.(); + }; + }, []); + + return ( +
+ + {/* TODO(T13): i18n keys pttOverlay.listening / pttOverlay.idle added in T13 */} + {active ? t('pttOverlay.listening') : t('pttOverlay.idle')} +
+ ); +} diff --git a/app/src/pages/settings/voice/PttSettingsPanel.tsx b/app/src/pages/settings/voice/PttSettingsPanel.tsx new file mode 100644 index 0000000000..459128b3b3 --- /dev/null +++ b/app/src/pages/settings/voice/PttSettingsPanel.tsx @@ -0,0 +1,271 @@ +/** + * PttSettingsPanel — settings card for the global push-to-talk hotkey. + * + * Renders three controls bound to `pttSlice` (T8): + * - A hotkey-capture input that writes the captured key into + * `setPttShortcut` (null when cleared). Modifier-only presses are + * rejected with an inline error since they don't make sense for PTT. + * - A "Speak agent replies" switch bound to `setSpeakReplies`. + * - A "Show overlay while held" switch bound to `setShowOverlay`. + * + * The hotkey registration side effect itself is handled by + * `usePttHotkey` (T11) which subscribes to slice changes and forwards + * to the Tauri shell — this panel only mutates Redux state and lets + * the manager hook react. This separation keeps the settings UI + * purely declarative and means the panel test does not need to mock + * the Tauri command surface. + * + * The panel deliberately renders without a `SettingsHeader` since it's + * intended to be embedded inside `VoicePanel` rather than mounted as a + * standalone route. The "card" style matches the other sections inside + * VoicePanel. + * + * Plan: docs/superpowers/plans/2026-06-02-global-ptt.md (Task 13). + */ +import { useCallback, useState } from 'react'; + +import { useT } from '../../../lib/i18n/I18nContext'; +import { useAppDispatch, useAppSelector } from '../../../store/hooks'; +import { + selectPttRegistrationError, + selectPttShortcut, + selectShowOverlay, + selectSpeakReplies, + setPttShortcut, + setShowOverlay, + setSpeakReplies, +} from '../../../store/pttSlice'; + +/** Keys that are pure modifiers — a PTT binding made of only these makes + * no sense (you can't "release" a modifier to send a sample without + * already needing a non-modifier sentinel). We surface a typed error + * instead of silently saving a useless binding. */ +const MODIFIER_KEYS = new Set([ + 'Shift', + 'Control', + 'Alt', + 'Meta', + 'OS', + 'AltGraph', + 'CapsLock', + 'NumLock', + 'ScrollLock', +]); + +/** + * Convert a KeyboardEvent into a stable shortcut string. Mirrors the + * format the Tauri shell expects (e.g. `Ctrl+Alt+F13`). We use the + * `key` field (and `code` for letters where `key` carries the layout's + * uppercased value) to avoid layout drift across QWERTY / AZERTY / etc. + */ +function eventToShortcut(e: React.KeyboardEvent): string | null { + if (MODIFIER_KEYS.has(e.key)) return null; + const parts: string[] = []; + if (e.ctrlKey) parts.push('Ctrl'); + if (e.altKey) parts.push('Alt'); + if (e.shiftKey) parts.push('Shift'); + if (e.metaKey) parts.push('Meta'); + // Prefer e.key (already the localised label like "F13", "a", "Enter") + // unless it's a single lowercase letter — for those we uppercase to + // produce a consistent "Ctrl+A" form across capitalised / not. + // Normalize Space (" ") to the display label "Space" so the saved + // binding is readable (e.g. "Ctrl+Space" rather than "Ctrl+ "). + let label = e.key === ' ' ? 'Space' : e.key; + if (label.length === 1 && /[a-z]/.test(label)) { + label = label.toUpperCase(); + } + parts.push(label); + return parts.join('+'); +} + +/** + * Map a raw Tauri error string from `register_ptt_hotkey` to a localized + * message. Pattern-matches on well-known substrings so the panel doesn't need + * to depend on the exact Rust error wording; falls back to the raw string for + * anything unrecognised (still useful to the user for diagnostics). + */ +function localizedRegistrationError(raw: string | null, t: (key: string) => string): string | null { + if (!raw) return null; + const lower = raw.toLowerCase(); + if (lower.includes('conflict') && lower.includes('dictation')) { + return t('pttSettings.errorConflictsWithDictation'); + } + if (lower.includes('wayland')) { + return t('pttSettings.errorUnsupportedWayland'); + } + if (lower.includes('accessibility')) { + return t('pttSettings.errorAccessibility'); + } + if (lower.includes('in use') || lower.includes('shortcutinuse') || lower.includes('in_use')) { + return t('pttSettings.errorShortcutInUse'); + } + return raw; +} + +const PttSettingsPanel = () => { + const { t } = useT(); + const dispatch = useAppDispatch(); + const shortcut = useAppSelector(selectPttShortcut); + const speakReplies = useAppSelector(selectSpeakReplies); + const showOverlay = useAppSelector(selectShowOverlay); + const registrationError = useAppSelector(selectPttRegistrationError); + + // Inline validation error for the capture input (e.g. modifier-only). + // Cleared whenever the user retries or focuses the field. Server-side + // errors (accessibility, in-use, Wayland) are emitted by the manager + // hook via toast/snackbar in T11; we keep this panel-local state for + // the capture-time failure modes. + const [captureError, setCaptureError] = useState(null); + + const handleShortcutKeyDown = useCallback( + (e: React.KeyboardEvent) => { + // Let Tab / Shift+Tab pass through so keyboard navigation within + // the settings panel still works. All other keys are captured as + // potential binding candidates and their default actions suppressed + // so the input doesn't insert text. + if (e.key === 'Tab') { + return; + } + e.preventDefault(); + e.stopPropagation(); + + // Allow Backspace / Delete / Escape to clear the binding so the + // user can drop back to the "off" state without having to fight a + // sticky F13. + if (e.key === 'Backspace' || e.key === 'Delete' || e.key === 'Escape') { + setCaptureError(null); + dispatch(setPttShortcut(null)); + return; + } + + if (MODIFIER_KEYS.has(e.key)) { + setCaptureError(t('pttSettings.errorModifierOnly')); + return; + } + + const shortcutString = eventToShortcut(e); + if (!shortcutString) { + setCaptureError(t('pttSettings.errorEmpty')); + return; + } + + console.debug('[pttSettings] captured shortcut %s', shortcutString); + setCaptureError(null); + dispatch(setPttShortcut(shortcutString)); + }, + [dispatch, t] + ); + + const toggleSpeakReplies = useCallback(() => { + dispatch(setSpeakReplies(!speakReplies)); + }, [dispatch, speakReplies]); + + const toggleShowOverlay = useCallback(() => { + dispatch(setShowOverlay(!showOverlay)); + }, [dispatch, showOverlay]); + + return ( +
+
+
+

+ {t('pttSettings.title')} +

+

+ {t('pttSettings.description')} +

+
+ + {/* Hotkey capture */} + + + {/* Speak replies switch */} +
+ + {t('pttSettings.speakRepliesLabel')} + + +
+ + {/* Show overlay switch */} +
+ + {t('pttSettings.showOverlayLabel')} + + +
+
+
+ ); +}; + +export default PttSettingsPanel; diff --git a/app/src/pages/settings/voice/__tests__/PttSettingsPanel.test.tsx b/app/src/pages/settings/voice/__tests__/PttSettingsPanel.test.tsx new file mode 100644 index 0000000000..69dc8ba4c5 --- /dev/null +++ b/app/src/pages/settings/voice/__tests__/PttSettingsPanel.test.tsx @@ -0,0 +1,129 @@ +import { fireEvent, screen } from '@testing-library/react'; +import { describe, expect, it } from 'vitest'; + +import { I18nProvider } from '../../../../lib/i18n/I18nContext'; +import { initialPttState, type PttState } from '../../../../store/pttSlice'; +import { renderWithProviders } from '../../../../test/test-utils'; +import PttSettingsPanel from '../PttSettingsPanel'; + +/** + * Render PttSettingsPanel with the given PTT slice state pre-seeded so + * each test can assert against a known starting point. We wrap in the + * real `I18nProvider` so the panel's labels resolve to the en.ts copy + * — that lets tests query by their final rendered text without + * hard-coding the message ids. + */ +function renderPanel(pttOverrides: Partial = {}) { + const preloadedState = { + locale: { current: 'en' as const }, + ptt: { ...initialPttState, ...pttOverrides }, + }; + return renderWithProviders( + + + , + { preloadedState } + ); +} + +describe('PttSettingsPanel', () => { + it('renders the "not set" hint when no shortcut is bound', () => { + renderPanel({ shortcut: null }); + expect( + screen.getByText(/Push-to-talk is off — pick a hotkey to enable\./i) + ).toBeInTheDocument(); + }); + + it('renders the bound shortcut when set', () => { + renderPanel({ shortcut: 'F13' }); + expect(screen.getByTestId('ptt-shortcut-input')).toHaveValue('F13'); + // The unset hint should NOT show once a shortcut is bound. + expect( + screen.queryByText(/Push-to-talk is off — pick a hotkey to enable\./i) + ).not.toBeInTheDocument(); + }); + + it('toggles speakReplies via the switch', () => { + const { store } = renderPanel({ shortcut: 'F13', speakReplies: true }); + const speakSwitch = screen.getByTestId('ptt-speak-replies-switch'); + expect(speakSwitch).toHaveAttribute('aria-checked', 'true'); + + fireEvent.click(speakSwitch); + + const stateAfter = (store.getState() as { ptt: PttState }).ptt; + expect(stateAfter.speakReplies).toBe(false); + // And the aria-checked attribute should flip on the rendered switch. + expect(screen.getByTestId('ptt-speak-replies-switch')).toHaveAttribute('aria-checked', 'false'); + }); + + it('toggles showOverlay via the switch', () => { + const { store } = renderPanel({ shortcut: 'F13', showOverlay: true }); + const overlaySwitch = screen.getByTestId('ptt-show-overlay-switch'); + expect(overlaySwitch).toHaveAttribute('aria-checked', 'true'); + + fireEvent.click(overlaySwitch); + + const stateAfter = (store.getState() as { ptt: PttState }).ptt; + expect(stateAfter.showOverlay).toBe(false); + }); + + it('updates the shortcut when a key is captured in the input', () => { + const { store } = renderPanel({ shortcut: null }); + const input = screen.getByTestId('ptt-shortcut-input'); + + // Simulate a real keyboard event — the panel listens for keydown on the + // focused input and captures the key code (e.g. "F13"). Using fireEvent + // because userEvent.keyboard treats F13 as a sequence. + fireEvent.keyDown(input, { key: 'F13', code: 'F13' }); + + const stateAfter = (store.getState() as { ptt: PttState }).ptt; + expect(stateAfter.shortcut).toBe('F13'); + }); + + it('shows the panel title and description from the en locale', () => { + renderPanel({ shortcut: null }); + expect(screen.getByText('Push-to-talk')).toBeInTheDocument(); + expect(screen.getByText(/Hold a key to talk to OpenHuman/i)).toBeInTheDocument(); + }); + + it('renders the localized registration error when the slice has a dictation conflict', () => { + renderPanel({ + shortcut: 'F13', + registrationError: "ptt shortcut 'F13' conflicts with the dictation hotkey", + }); + const errEl = screen.getByTestId('ptt-registration-error'); + expect(errEl).toBeInTheDocument(); + expect(errEl).toHaveTextContent(/already used by dictation/i); + }); + + it('renders a localized Wayland error when the slice has one', () => { + renderPanel({ + shortcut: 'F13', + registrationError: 'global shortcuts are not supported in this Wayland session', + }); + expect(screen.getByTestId('ptt-registration-error')).toHaveTextContent(/wayland/i); + }); + + it('renders the raw error string for unrecognised errors', () => { + renderPanel({ shortcut: 'F13', registrationError: 'some unexpected Tauri error' }); + expect(screen.getByTestId('ptt-registration-error')).toHaveTextContent( + 'some unexpected Tauri error' + ); + }); + + it('does not render a registration error when registrationError is null', () => { + renderPanel({ shortcut: 'F13', registrationError: null }); + expect(screen.queryByTestId('ptt-registration-error')).not.toBeInTheDocument(); + }); + + it('hides the registration error when a captureError (modifier-only) is also present', () => { + // Both errors at once — captureError wins because it's more immediate. + // Trigger captureError by pressing a modifier-only key. + renderPanel({ shortcut: 'F13', registrationError: 'some unexpected Tauri error' }); + const input = screen.getByTestId('ptt-shortcut-input'); + fireEvent.keyDown(input, { key: 'Shift', code: 'ShiftLeft', shiftKey: true }); + // captureError is now set — registration error should be hidden. + expect(screen.queryByTestId('ptt-registration-error')).not.toBeInTheDocument(); + expect(screen.getByTestId('ptt-shortcut-error')).toBeInTheDocument(); + }); +}); diff --git a/app/src/services/__tests__/chatService.test.ts b/app/src/services/__tests__/chatService.test.ts index f5417ccd41..bc3abe4b07 100644 --- a/app/src/services/__tests__/chatService.test.ts +++ b/app/src/services/__tests__/chatService.test.ts @@ -391,4 +391,40 @@ describe('chatService.subscribeChatEvents', () => { }, }); }); + + it('forwards speak_reply, source, session_id when provided', async () => { + const socket = createMockSocket(); + vi.mocked(socketService.getSocket).mockReturnValue(socket as never); + + await chatSend({ + threadId: 'thread-1', + message: 'hello', + speakReply: true, + source: 'ptt', + sessionId: 42, + }); + + expect(mockCallCoreRpc).toHaveBeenCalledWith( + expect.objectContaining({ + method: 'openhuman.channel_web_chat', + params: expect.objectContaining({ + message: 'hello', + speak_reply: true, + source: 'ptt', + session_id: 42, + }), + }) + ); + }); + + it('does not include the new fields when omitted', async () => { + const socket = createMockSocket(); + vi.mocked(socketService.getSocket).mockReturnValue(socket as never); + + await chatSend({ threadId: 'thread-1', message: 'hi' }); + const params = mockCallCoreRpc.mock.calls[0][0].params; + expect(params.speak_reply).toBeUndefined(); + expect(params.source).toBeUndefined(); + expect(params.session_id).toBeUndefined(); + }); }); diff --git a/app/src/services/__tests__/pttService.test.ts b/app/src/services/__tests__/pttService.test.ts new file mode 100644 index 0000000000..55cb62346c --- /dev/null +++ b/app/src/services/__tests__/pttService.test.ts @@ -0,0 +1,190 @@ +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +import { createPttService, type PttDeps } from '../pttService'; + +function makeDeps(overrides: Partial = {}): PttDeps { + return { + audioCapture: { + start: vi.fn().mockResolvedValue(undefined), + finalize: vi.fn().mockResolvedValue({ durationMs: 1500, buffer: new ArrayBuffer(0) }), + cancel: vi.fn().mockResolvedValue(undefined), + }, + transcribe: vi.fn().mockResolvedValue('hello world'), + sendMessage: vi.fn().mockResolvedValue(undefined), + resolveActiveThreadId: vi.fn().mockResolvedValue('thread-active'), + createNewVoiceThread: vi.fn().mockResolvedValue('thread-new'), + playChime: vi.fn().mockResolvedValue(undefined), + showOverlay: vi.fn().mockResolvedValue(undefined), + getSettings: () => ({ speakReplies: true, showOverlay: true }), + now: () => 1_700_000_000_000, + watchdogMs: 10_000, + minAudioMs: 250, + logger: { debug: vi.fn(), info: vi.fn(), warn: vi.fn() }, + ...overrides, + }; +} + +describe('pttService state machine', () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + + it('happy path: start → stop sends the transcript to the active thread with speakReply', async () => { + const deps = makeDeps(); + const svc = createPttService(deps); + + await svc.onStart(1); + expect(deps.audioCapture.start).toHaveBeenCalledWith({ sessionTag: 'ptt:1' }); + expect(deps.playChime).toHaveBeenCalledWith('open'); + expect(deps.showOverlay).toHaveBeenCalledWith(true, 1); + + await svc.onStop(1); + expect(deps.audioCapture.finalize).toHaveBeenCalled(); + expect(deps.playChime).toHaveBeenCalledWith('close'); + expect(deps.showOverlay).toHaveBeenCalledWith(false, 1); + expect(deps.transcribe).toHaveBeenCalled(); + expect(deps.sendMessage).toHaveBeenCalledWith({ + threadId: 'thread-active', + body: 'hello world', + metadata: { source: 'ptt', session_id: 1 }, + speakReply: true, + }); + }); + + it('falls back to a new "Voice" thread when no active thread exists', async () => { + const deps = makeDeps({ resolveActiveThreadId: vi.fn().mockResolvedValue(null) }); + const svc = createPttService(deps); + + await svc.onStart(2); + await svc.onStop(2); + + expect(deps.createNewVoiceThread).toHaveBeenCalled(); + expect(deps.sendMessage).toHaveBeenCalledWith( + expect.objectContaining({ threadId: 'thread-new' }) + ); + }); + + it('drops the session and plays the error chime when audio is shorter than minAudioMs', async () => { + const deps = makeDeps({ + audioCapture: { + start: vi.fn().mockResolvedValue(undefined), + finalize: vi.fn().mockResolvedValue({ durationMs: 100, buffer: new ArrayBuffer(0) }), + cancel: vi.fn().mockResolvedValue(undefined), + }, + }); + const svc = createPttService(deps); + + await svc.onStart(3); + await svc.onStop(3); + + expect(deps.transcribe).not.toHaveBeenCalled(); + expect(deps.sendMessage).not.toHaveBeenCalled(); + expect(deps.playChime).toHaveBeenCalledWith('error'); + }); + + it('drops the session when the transcript is empty', async () => { + const deps = makeDeps({ transcribe: vi.fn().mockResolvedValue(' ') }); + const svc = createPttService(deps); + + await svc.onStart(4); + await svc.onStop(4); + + expect(deps.sendMessage).not.toHaveBeenCalled(); + expect(deps.playChime).toHaveBeenCalledWith('error'); + }); + + it('watchdog finalises the session after watchdogMs even if onStop never arrives', async () => { + const deps = makeDeps(); + const svc = createPttService(deps); + + await svc.onStart(5); + + // Advance fake time past the watchdog. + await vi.advanceTimersByTimeAsync(11_000); + + expect(deps.audioCapture.finalize).toHaveBeenCalled(); + expect(deps.sendMessage).toHaveBeenCalledWith( + expect.objectContaining({ metadata: expect.objectContaining({ session_id: 5 }) }) + ); + }); + + it('second onStart while a session is active preempts the first', async () => { + const deps = makeDeps(); + const svc = createPttService(deps); + + await svc.onStart(6); + await svc.onStart(7); + + expect(deps.audioCapture.cancel).toHaveBeenCalled(); + expect(deps.audioCapture.start).toHaveBeenLastCalledWith({ sessionTag: 'ptt:7' }); + }); + + it('honours the speakReplies setting when forwarding to sendMessage', async () => { + const deps = makeDeps({ getSettings: () => ({ speakReplies: false, showOverlay: true }) }); + const svc = createPttService(deps); + + await svc.onStart(8); + await svc.onStop(8); + + expect(deps.sendMessage).toHaveBeenCalledWith(expect.objectContaining({ speakReply: false })); + }); + + it('mismatched session_id on onStop is ignored', async () => { + const deps = makeDeps(); + const svc = createPttService(deps); + + await svc.onStart(9); + await svc.onStop(999); // stale stop event + + expect(deps.audioCapture.finalize).not.toHaveBeenCalled(); + }); + + it('cancel("user_cancel") aborts an active session without sending a message', async () => { + const deps = makeDeps(); + const svc = createPttService(deps); + + await svc.onStart(10); + await svc.cancel('user_cancel'); + + expect(deps.audioCapture.cancel).toHaveBeenCalled(); + expect(deps.playChime).toHaveBeenCalledWith('error'); + expect(deps.showOverlay).toHaveBeenLastCalledWith(false, 10); + expect(deps.sendMessage).not.toHaveBeenCalled(); + }); + + it('plays error chime and bails if audioCapture.start throws', async () => { + const deps = makeDeps({ + audioCapture: { + start: vi.fn().mockRejectedValue(new Error('mic denied')), + finalize: vi.fn().mockResolvedValue({ durationMs: 1500, buffer: new ArrayBuffer(0) }), + cancel: vi.fn().mockResolvedValue(undefined), + }, + }); + const svc = createPttService(deps); + + await svc.onStart(11); + + expect(deps.playChime).toHaveBeenCalledWith('open'); + expect(deps.playChime).toHaveBeenCalledWith('error'); + expect(deps.showOverlay).toHaveBeenLastCalledWith(false, 11); + // The session never armed — onStop should be a no-op. + await svc.onStop(11); + expect(deps.audioCapture.finalize).not.toHaveBeenCalled(); + expect(deps.sendMessage).not.toHaveBeenCalled(); + }); + + it('posts a "[Voice — transcription failed]" breadcrumb when transcribe throws', async () => { + const deps = makeDeps({ transcribe: vi.fn().mockRejectedValue(new Error('stt timeout')) }); + const svc = createPttService(deps); + + await svc.onStart(12); + await svc.onStop(12); + + expect(deps.sendMessage).toHaveBeenCalledWith( + expect.objectContaining({ + body: '[Voice — transcription failed]', + metadata: { source: 'ptt', session_id: 12 }, + }) + ); + }); +}); diff --git a/app/src/services/chatService.ts b/app/src/services/chatService.ts index 01198515ef..ade6e3e825 100644 --- a/app/src/services/chatService.ts +++ b/app/src/services/chatService.ts @@ -956,6 +956,21 @@ export interface ChatSendParams { * working unchanged. */ locale?: string | null; + /** + * When `true`, the core will synthesize the agent reply via TTS and + * stream audio back (push-to-talk reply flow). + */ + speakReply?: boolean; + /** + * Originating input source — e.g. `'ptt'` for push-to-talk, `'keyboard'` + * for typed input. Forwarded to the core for analytics / routing. + */ + source?: string; + /** + * PTT session ID — ties the chat turn to a specific push-to-talk recording + * session so the core can correlate audio and text events. + */ + sessionId?: number; /** * Queue mode for concurrent messages. When a turn is already in * flight: `steer` injects at the next iteration boundary, `followup` @@ -988,6 +1003,9 @@ export async function chatSend(params: ChatSendParams): Promise { model_override: params.model ?? undefined, profile_id: params.profileId ?? undefined, locale: params.locale ?? undefined, + speak_reply: params.speakReply ?? undefined, + source: params.source ?? undefined, + session_id: params.sessionId ?? undefined, queue_mode: params.queueMode ?? undefined, }, }); diff --git a/app/src/services/pttService.ts b/app/src/services/pttService.ts new file mode 100644 index 0000000000..2782e9e457 --- /dev/null +++ b/app/src/services/pttService.ts @@ -0,0 +1,261 @@ +/** + * pttService — push-to-talk session state machine. + * + * See spec: `docs/superpowers/specs/2026-06-02-global-ptt-design.md` (§ 2, § 3). + * + * Dependency-injected so vitest can exercise the state machine with fake + * audio capture / fake STT / fake sendMessage. Real wiring (subscribing to + * `ptt://*` Tauri events, the real audio_capture, etc.) happens in + * PttHotkeyManager.tsx (T11). + */ + +export type ChimeKind = 'open' | 'close' | 'error'; + +export interface PttSettings { + speakReplies: boolean; + showOverlay: boolean; +} + +export interface FinalizedAudio { + durationMs: number; + buffer: ArrayBuffer; +} + +export interface PttDeps { + audioCapture: { + start(opts: { sessionTag: string }): Promise; + finalize(): Promise; + cancel(): Promise; + }; + transcribe(buf: ArrayBuffer): Promise; + sendMessage(args: { + threadId: string; + body: string; + metadata: { source: 'ptt'; session_id: number }; + speakReply: boolean; + }): Promise; + resolveActiveThreadId(): Promise; + createNewVoiceThread(): Promise; + playChime(kind: ChimeKind): Promise; + showOverlay(active: boolean, sessionId: number): Promise; + getSettings(): PttSettings; + now(): number; + watchdogMs: number; + minAudioMs: number; + logger: { + debug(msg: string, meta?: Record): void; + info(msg: string, meta?: Record): void; + warn(msg: string, meta?: Record): void; + }; +} + +export interface PttService { + onStart(sessionId: number): Promise; + onStop(sessionId: number): Promise; + cancel(reason: 'preempted' | 'mic_failure' | 'user_cancel'): Promise; +} + +interface ActiveSession { + sessionId: number; + startedAtMs: number; + watchdogTimer: ReturnType | null; + finalizedByWatchdog: boolean; +} + +export function createPttService(deps: PttDeps): PttService { + let active: ActiveSession | null = null; + + const armWatchdog = (sessionId: number) => { + const timer = setTimeout(() => { + if (active && active.sessionId === sessionId) { + active.finalizedByWatchdog = true; + deps.logger.warn('[ptt] watchdog fired — finalising session', { sessionId }); + // Fire-and-forget; the watchdog path is the same as a normal stop + // except for the `finalizedByWatchdog` flag (used in logging only). + void finaliseSession(sessionId, /* fromWatchdog */ true); + } + }, deps.watchdogMs); + return timer; + }; + + const finaliseSession = async (sessionId: number, fromWatchdog: boolean) => { + if (!active || active.sessionId !== sessionId) { + // Stale finalisation — ignore. + return; + } + + if (active.watchdogTimer) { + clearTimeout(active.watchdogTimer); + active.watchdogTimer = null; + } + + const settings = deps.getSettings(); + const session = active; + active = null; + + let audio: FinalizedAudio; + try { + audio = await deps.audioCapture.finalize(); + } catch (err) { + deps.logger.warn('[ptt] audio finalize failed', { sessionId, err: String(err) }); + await deps.playChime('error'); + await deps.showOverlay(false, sessionId); + return; + } + + await deps.playChime('close'); + await deps.showOverlay(false, sessionId); + + if (audio.durationMs < deps.minAudioMs) { + deps.logger.info('[ptt] session dropped — audio shorter than minAudioMs', { + sessionId, + durationMs: audio.durationMs, + }); + await deps.playChime('error'); + return; + } + + let text = ''; + try { + text = await deps.transcribe(audio.buffer); + } catch (err) { + deps.logger.warn('[ptt] transcription failed', { sessionId, err: String(err) }); + // Per spec: post the message anyway as a breadcrumb. + text = '[Voice — transcription failed]'; + } + + const trimmed = text.trim(); + + if (!trimmed) { + deps.logger.info('[ptt] session dropped — empty transcript', { sessionId }); + await deps.playChime('error'); + return; + } + + let threadId: string; + try { + const resolved = await deps.resolveActiveThreadId(); + if (!resolved) { + threadId = await deps.createNewVoiceThread(); + } else { + threadId = resolved; + } + } catch (err) { + deps.logger.warn('[ptt] thread resolution failed — aborting commit', { + sessionId, + err: String(err), + }); + await deps.playChime('error'); + return; + } + + try { + await deps.sendMessage({ + threadId, + body: trimmed, + metadata: { source: 'ptt', session_id: sessionId }, + speakReply: settings.speakReplies, + }); + } catch (err) { + deps.logger.warn('[ptt] sendMessage failed', { sessionId, threadId, err: String(err) }); + await deps.playChime('error'); + return; + } + + deps.logger.info('[ptt] session committed', { + sessionId, + threadId, + heldMs: deps.now() - session.startedAtMs, + finalizedByWatchdog: fromWatchdog, + transcriptLen: trimmed.length, + }); + }; + + return { + async onStart(sessionId) { + // Preempt: if another session is active, cancel it. + if (active) { + deps.logger.debug('[ptt] onStart while active — preempting', { + old: active.sessionId, + new: sessionId, + }); + try { + await deps.audioCapture.cancel(); + } catch (err) { + deps.logger.warn('[ptt] cancel failed during preempt', { err: String(err) }); + } + if (active.watchdogTimer) clearTimeout(active.watchdogTimer); + active = null; + } + + // Claim the slot BEFORE any awaits so concurrent onStart calls preempt + // this in-progress session rather than racing with it. + active = { + sessionId, + startedAtMs: deps.now(), + watchdogTimer: null, + finalizedByWatchdog: false, + }; + const claimed = active; + + await deps.playChime('open'); + await deps.showOverlay(true, sessionId); + + // If a concurrent onStart preempted us during the awaits, our claim was + // replaced. Stop here — the new claim owns the slot. + if (active !== claimed) { + return; + } + + try { + await deps.audioCapture.start({ sessionTag: `ptt:${sessionId}` }); + } catch (err) { + deps.logger.warn('[ptt] audio start failed', { sessionId, err: String(err) }); + if (active === claimed) { + active = null; + } + await deps.playChime('error'); + await deps.showOverlay(false, sessionId); + return; + } + + // Re-check after the audio.start await. + if (active !== claimed) { + // Concurrent preempt replaced our claim mid-flight; we already started + // audio for an orphan session. Best-effort cancel and exit — cancellation + // failure here is non-actionable (the orphan session is already detached). + try { + await deps.audioCapture.cancel(); + } catch (_) { + // ignore: orphan-session cleanup is best-effort + } + return; + } + + active.watchdogTimer = armWatchdog(sessionId); + }, + + async onStop(sessionId) { + if (!active || active.sessionId !== sessionId) { + deps.logger.debug('[ptt] stale onStop — ignored', { sessionId }); + return; + } + await finaliseSession(sessionId, /* fromWatchdog */ false); + }, + + async cancel(reason) { + if (!active) return; + deps.logger.info('[ptt] cancel', { sessionId: active.sessionId, reason }); + if (active.watchdogTimer) clearTimeout(active.watchdogTimer); + const session = active; + active = null; + try { + await deps.audioCapture.cancel(); + } catch (err) { + deps.logger.warn('[ptt] cancel: audio cancel failed', { err: String(err) }); + } + await deps.playChime('error'); + await deps.showOverlay(false, session.sessionId); + }, + }; +} diff --git a/app/src/store/__tests__/pttSlice.test.ts b/app/src/store/__tests__/pttSlice.test.ts new file mode 100644 index 0000000000..94ed95f244 --- /dev/null +++ b/app/src/store/__tests__/pttSlice.test.ts @@ -0,0 +1,73 @@ +import { describe, expect, it } from 'vitest'; + +import { + initialPttState, + pttReducer, + type PttState, + setIsHeld, + setPttRegistrationError, + setPttShortcut, + setShowOverlay, + setSpeakReplies, +} from '../pttSlice'; +import { resetUserScopedState } from '../resetActions'; + +describe('ptt slice', () => { + const initial: PttState = { + shortcut: null, + speakReplies: true, + showOverlay: true, + isHeld: false, + registrationError: null, + }; + + it('has the documented default state', () => { + expect(pttReducer(undefined, { type: '@@INIT' })).toEqual(initial); + }); + + it('setPttShortcut stores the shortcut string', () => { + const next = pttReducer(initial, setPttShortcut('F13')); + expect(next.shortcut).toBe('F13'); + }); + + it('setPttShortcut with null clears the shortcut', () => { + const withKey: PttState = { ...initial, shortcut: 'F13' }; + const next = pttReducer(withKey, setPttShortcut(null)); + expect(next.shortcut).toBeNull(); + }); + + it('setSpeakReplies toggles the flag', () => { + expect(pttReducer(initial, setSpeakReplies(false)).speakReplies).toBe(false); + }); + + it('setShowOverlay toggles the flag', () => { + expect(pttReducer(initial, setShowOverlay(false)).showOverlay).toBe(false); + }); + + it('setIsHeld updates the runtime hold flag', () => { + expect(pttReducer(initial, setIsHeld(true)).isHeld).toBe(true); + }); + + it('setPttRegistrationError stores the error string', () => { + const next = pttReducer(initial, setPttRegistrationError('hotkey in use')); + expect(next.registrationError).toBe('hotkey in use'); + }); + + it('setPttRegistrationError with null clears the error', () => { + const withErr: PttState = { ...initial, registrationError: 'some error' }; + const next = pttReducer(withErr, setPttRegistrationError(null)); + expect(next.registrationError).toBeNull(); + }); + + it('resetUserScopedState returns the slice to initial state', () => { + const dirty: PttState = { + shortcut: 'F13', + speakReplies: false, + showOverlay: false, + isHeld: true, + registrationError: 'some error', + }; + const next = pttReducer(dirty, resetUserScopedState()); + expect(next).toEqual(initialPttState); + }); +}); diff --git a/app/src/store/index.ts b/app/src/store/index.ts index 98a01a4095..7df2d9d7b4 100644 --- a/app/src/store/index.ts +++ b/app/src/store/index.ts @@ -31,6 +31,7 @@ import mascotReducer from './mascotSlice'; import notificationReducer from './notificationSlice'; import personaReducer from './personaSlice'; import providerSurfacesReducer from './providerSurfaceSlice'; +import { pttReducer } from './pttSlice'; import socketReducer from './socketSlice'; import themeReducer from './themeSlice'; import threadReducer from './threadSlice'; @@ -160,6 +161,17 @@ const persistedMascotReducer = persistReducer(mascotPersistConfig, mascotReducer const personaPersistConfig = { key: 'persona', storage, whitelist: ['displayName', 'description'] }; const persistedPersonaReducer = persistReducer(personaPersistConfig, personaReducer); +// PTT (Push-to-Talk): persist the hotkey binding and session preferences. +// `isHeld` is a runtime-only flag — deliberately excluded from the whitelist so +// a crash or force-quit can never leave the app stuck in the "held" state. +// The boot hook (T11) also explicitly resets it to false on mount. +const pttPersistConfig = { + key: 'ptt', + storage, + whitelist: ['shortcut', 'speakReplies', 'showOverlay'], +}; +const persistedPttReducer = persistReducer(pttPersistConfig, pttReducer); + // chatRuntime is mostly ephemeral (streaming buffers, tool timelines, // inference status) — those MUST NOT survive a restart or the UI tries // to resume a turn whose live driver has gone. The single exception is @@ -204,6 +216,7 @@ export const store = configureStore({ mascot: persistedMascotReducer, persona: persistedPersonaReducer, theme: persistedThemeReducer, + ptt: persistedPttReducer, }, middleware: getDefaultMiddleware => { const middleware = getDefaultMiddleware({ diff --git a/app/src/store/pttSlice.ts b/app/src/store/pttSlice.ts new file mode 100644 index 0000000000..c7038bf879 --- /dev/null +++ b/app/src/store/pttSlice.ts @@ -0,0 +1,83 @@ +import { createSlice, type PayloadAction } from '@reduxjs/toolkit'; + +import { resetUserScopedState } from './resetActions'; + +/** + * PTT (Push-to-Talk) slice — persisted hotkey binding + session settings, + * plus non-persisted runtime flags: + * - `isHeld`: tracks whether the PTT key is currently held. The boot hook + * (Task 11) resets it to false on mount so a stale rehydrated value can + * never leave the app stuck in "held" mode. + * - `registrationError`: the most recent error from `register_ptt_hotkey`, + * surfaced in PttSettingsPanel (T13). Cleared on successful register. + * Transient — not persisted across sessions. + */ + +export interface PttState { + /** Currently-bound PTT hotkey string (e.g. "F13" or "Ctrl+Alt+T"). null = unbound. */ + shortcut: string | null; + /** When true, the agent's reply is spoken via TTS. */ + speakReplies: boolean; + /** When true, the overlay window is shown during a PTT session. */ + showOverlay: boolean; + /** Non-persisted runtime flag: is the PTT key currently held? */ + isHeld: boolean; + /** Last error from register_ptt_hotkey, surfaced in PttSettingsPanel. Cleared on successful register. */ + registrationError: string | null; +} + +export const initialPttState: PttState = { + shortcut: null, + speakReplies: true, + showOverlay: true, + isHeld: false, + registrationError: null, +}; + +const pttSlice = createSlice({ + name: 'ptt', + initialState: initialPttState, + reducers: { + setPttShortcut(state, action: PayloadAction) { + state.shortcut = action.payload; + }, + setSpeakReplies(state, action: PayloadAction) { + state.speakReplies = action.payload; + }, + setShowOverlay(state, action: PayloadAction) { + state.showOverlay = action.payload; + }, + setIsHeld(state, action: PayloadAction) { + state.isHeld = action.payload; + }, + setPttRegistrationError(state, action: PayloadAction) { + state.registrationError = action.payload; + }, + }, + extraReducers: builder => { + builder.addCase(resetUserScopedState, () => initialPttState); + }, +}); + +export const { + setPttShortcut, + setSpeakReplies, + setShowOverlay, + setIsHeld, + setPttRegistrationError, +} = pttSlice.actions; + +// ── Selectors ──────────────────────────────────────────────────────────────── + +export const selectPttShortcut = (state: { ptt: PttState }): string | null => state.ptt.shortcut; + +export const selectSpeakReplies = (state: { ptt: PttState }): boolean => state.ptt.speakReplies; + +export const selectShowOverlay = (state: { ptt: PttState }): boolean => state.ptt.showOverlay; + +export const selectIsHeld = (state: { ptt: PttState }): boolean => state.ptt.isHeld; + +export const selectPttRegistrationError = (state: { ptt: PttState }): string | null => + state.ptt.registrationError; + +export const pttReducer = pttSlice.reducer; diff --git a/app/src/test/test-utils.tsx b/app/src/test/test-utils.tsx index a5d6f5baa9..4cafa2f7e9 100644 --- a/app/src/test/test-utils.tsx +++ b/app/src/test/test-utils.tsx @@ -18,6 +18,7 @@ import coreModeReducer from '../store/coreModeSlice'; import localeReducer from '../store/localeSlice'; import mascotReducer from '../store/mascotSlice'; import personaReducer from '../store/personaSlice'; +import { pttReducer } from '../store/pttSlice'; import socketReducer from '../store/socketSlice'; import themeReducer from '../store/themeSlice'; @@ -41,6 +42,7 @@ const testRootReducer = combineReducers({ locale: localeReducer, mascot: mascotReducer, persona: personaReducer, + ptt: pttReducer, socket: socketReducer, theme: themeReducer, }); diff --git a/app/src/utils/tauriCommands/ptt.ts b/app/src/utils/tauriCommands/ptt.ts new file mode 100644 index 0000000000..981c56c9eb --- /dev/null +++ b/app/src/utils/tauriCommands/ptt.ts @@ -0,0 +1,44 @@ +/** + * Push-to-talk (PTT) Tauri command wrappers. + */ +import { invoke } from '@tauri-apps/api/core'; + +import { isTauri } from './common'; + +/** + * Register (or re-register) the global push-to-talk hotkey. + */ +export async function registerPttHotkey(shortcut: string): Promise { + if (!isTauri()) { + console.debug('[ptt] registerPttHotkey: skipped — not running in Tauri'); + return; + } + console.debug('[ptt] registerPttHotkey: shortcut=%s', shortcut); + await invoke('register_ptt_hotkey', { shortcut }); + console.debug('[ptt] registerPttHotkey: done'); +} + +/** + * Unregister the global push-to-talk hotkey. + */ +export async function unregisterPttHotkey(): Promise { + if (!isTauri()) { + console.debug('[ptt] unregisterPttHotkey: skipped — not running in Tauri'); + return; + } + console.debug('[ptt] unregisterPttHotkey: invoking'); + await invoke('unregister_ptt_hotkey'); + console.debug('[ptt] unregisterPttHotkey: done'); +} + +/** + * Show or hide the PTT overlay window. + */ +export async function showPttOverlay(active: boolean, sessionId: number): Promise { + if (!isTauri()) { + console.debug('[ptt] showPttOverlay: skipped — not running in Tauri'); + return; + } + console.debug('[ptt] showPttOverlay: active=%s sessionId=%d', active, sessionId); + await invoke('show_ptt_overlay', { active, sessionId }); +} diff --git a/app/test/e2e/specs/ptt-flow.spec.ts b/app/test/e2e/specs/ptt-flow.spec.ts new file mode 100644 index 0000000000..86116efff1 --- /dev/null +++ b/app/test/e2e/specs/ptt-flow.spec.ts @@ -0,0 +1,619 @@ +// @ts-nocheck +/** + * E2E: global push-to-talk (PTT) end-to-end flow with mocked STT. + * + * Task 14 from `docs/superpowers/plans/2026-06-02-global-ptt.md`. + * + * What this spec exercises (top to bottom): + * + * UI: + * 1. Navigate to /settings/voice → PttSettingsPanel mounts (data-testid + * "ptt-settings-panel"). + * 2. Programmatically dispatch `setPttShortcut('F13')` against the exposed + * Redux store to simulate the user binding a hotkey. Using a Redux + * dispatch (rather than driving the readonly capture input via + * chromedriver) sidesteps two fragile layers: + * a. The keyboard-capture input intercepts native keydown events + * that CDP would otherwise inject into the textarea. + * b. F13 is reliably passable through chromedriver to a generic + * input but the panel-level interception logic is unit-tested + * elsewhere (PttSettingsPanel.test.tsx). We test the *binding + * effect*, not the capture UX. + * 3. Assert `usePttHotkey` reacts and Redux state settles with a non-null + * shortcut. Registration may succeed (no error) or fail with a non- + * empty error string on headless Linux runners with no real keyboard + * — both are acceptable signals that the binding path was driven; we + * log the failure for follow-up but don't make CI red on it. + * + * PTT session: + * 4. Mock navigator.mediaDevices.getUserMedia + MediaRecorder so the + * renderer-side audio capture (pttAudio.ts) can run without a real + * microphone (headless CEF has no audio device). + * 5. Configure the mock backend (audioTranscriptionText) so the core's + * cloud STT path returns a known transcript "hello from PTT". + * 6. Simulate the hotkey hold by emitting `ptt://start`/`ptt://stop` via + * Tauri's internal event plugin (`__TAURI_INTERNALS__.invoke('plugin: + * event|emit', ...)`). This is the same path `@tauri-apps/api/event`'s + * `emit()` uses; we go through the internal because direct dynamic + * imports of `@tauri-apps/api/event` don't resolve under Chromium- + * driver (see core-rpc.ts). + * 7. Wait long enough between start/stop (≥ 250 ms — pttService's + * `minAudioMs`) so the recording isn't dropped as an accidental tap. + * + * Assertions: + * 8. The overlay window is created (window-handle count went from 1 → + * 2 when register_ptt_hotkey called ptt_overlay::ensure_window). + * 9. The transcribed text appears as a user message in the chat thread. + * 10. The core_rpc_relay invocation for `channel_web_chat` carried + * `speak_reply: true` (the user's PTT setting was honoured on the + * wire). We spy on `__TAURI_INTERNALS__.invoke` before the press to + * capture the call payload. + * + * Plan: docs/superpowers/plans/2026-06-02-global-ptt.md (Task 14). + * Spec: docs/superpowers/specs/2026-06-02-global-ptt-design.md. + * + * Limitations / notes for follow-up sessions: + * - The OS-level global-shortcut emit can't be triggered by the Chromium + * driver (CDP injects events into the renderer, not the OS keyboard + * subsystem). Step 6 above is the correct workaround in a unit-test + * sense, but it does not exercise the rdev → tauri global-shortcut + * pipeline on the way in. That layer is covered by Rust unit tests + * in `ptt_hotkeys.rs` and integration coverage in PttHotkeyManager + * tests. + * - MediaRecorder availability under CEF headless: present but won't + * produce real opus frames. We mock it entirely so the buffer reaches + * the transcribe RPC as a zero-byte blob; the mock backend doesn't care + * about the actual audio bytes (it just returns the configured + * transcript text). + */ +import { waitForApp } from '../helpers/app-helpers'; +import { + getSelectedThreadId, + waitForAssistantReplyContaining, + waitForSocketConnected, +} from '../helpers/chat-harness'; +import { callOpenhumanRpc } from '../helpers/core-rpc'; +import { textExists } from '../helpers/element-helpers'; +import { resetApp } from '../helpers/reset-app'; +import { navigateViaHash } from '../helpers/shared-flows'; +import { + clearRequestLog, + getRequestLog, + setMockBehavior, + startMockServer, + stopMockServer, +} from '../mock-server'; + +const USER_ID = 'e2e-ptt-flow'; +const SHORTCUT = 'F13'; +const STT_TRANSCRIPT = 'hello from PTT'; + +const OVERLAY_WINDOW_LABEL = 'ptt-overlay'; +// pttService.minAudioMs is 250; we hold for 800 ms to be comfortably above the +// floor and tolerant of slow CI scheduling. +const HOLD_DURATION_MS = 800; + +describe('PTT — global push-to-talk flow', function () { + this.timeout(180_000); + + before(async function beforeSuite() { + this.timeout(120_000); + await startMockServer(); + await waitForApp(); + await resetApp(USER_ID); + + // The cloud STT path goes through /openai/v1/audio/transcriptions in the + // mock backend; set the deterministic transcript before any PTT press. + setMockBehavior('audioTranscriptionText', STT_TRANSCRIPT); + }); + + after(async () => { + setMockBehavior('audioTranscriptionText', ''); + await stopMockServer(); + }); + + // --------------------------------------------------------------------------- + // Step 1: settings → voice → PttSettingsPanel. + // --------------------------------------------------------------------------- + it('renders the PTT settings panel under /settings/voice', async () => { + await navigateViaHash('/settings/voice'); + + // The panel may take a beat to mount as VoicePanel hydrates its providers. + const panel = await browser.$('[data-testid="ptt-settings-panel"]'); + await panel.waitForExist({ + timeout: 20_000, + timeoutMsg: 'ptt-settings-panel did not mount under /settings/voice', + }); + + // The hotkey input + the two switches must all be present (T13 contract). + const shortcutInput = await browser.$('[data-testid="ptt-shortcut-input"]'); + await shortcutInput.waitForExist({ timeout: 5_000 }); + const speakSwitch = await browser.$('[data-testid="ptt-speak-replies-switch"]'); + await speakSwitch.waitForExist({ timeout: 5_000 }); + const overlaySwitch = await browser.$('[data-testid="ptt-show-overlay-switch"]'); + await overlaySwitch.waitForExist({ timeout: 5_000 }); + }); + + // --------------------------------------------------------------------------- + // Step 2 + 3: bind the shortcut, observe Redux + register_ptt_hotkey. + // + // We drive Redux directly. The shortcut-capture input is exhaustively + // covered by PttSettingsPanel.test.tsx; here we test the *binding effect* + // — that setting the shortcut triggers the manager hook which calls + // register_ptt_hotkey in the Tauri shell. + // --------------------------------------------------------------------------- + it('binds the F13 hotkey via Redux + the manager hook forwards to the Tauri shell', async () => { + // Sanity: store handle is exposed (gated on E2E build flag). + const storePresent = await browser.execute( + () => + typeof (window as unknown as { __OPENHUMAN_STORE__?: unknown }).__OPENHUMAN_STORE__ !== + 'undefined' + ); + expect(storePresent).toBe(true); + + // Speak replies must be true so the chat-send carries speak_reply: true. + // showOverlay must be true so the manager invokes show_ptt_overlay on + // the start edge (overlay window check below depends on it). + await browser.execute(() => { + const store = ( + window as unknown as { + __OPENHUMAN_STORE__: { dispatch: (a: { type: string; payload: unknown }) => unknown }; + } + ).__OPENHUMAN_STORE__; + store.dispatch({ type: 'ptt/setSpeakReplies', payload: true }); + store.dispatch({ type: 'ptt/setShowOverlay', payload: true }); + }); + + // Dispatch the binding. + await browser.execute((shortcut: string) => { + const store = ( + window as unknown as { + __OPENHUMAN_STORE__: { dispatch: (a: { type: string; payload: string }) => unknown }; + } + ).__OPENHUMAN_STORE__; + store.dispatch({ type: 'ptt/setPttShortcut', payload: shortcut }); + }, SHORTCUT); + + // Wait until the slice settles with the bound shortcut. + await browser.waitUntil( + async () => { + return ( + (await browser.execute(() => { + const state = ( + window as unknown as { + __OPENHUMAN_STORE__: { getState: () => { ptt?: { shortcut?: string | null } } }; + } + ).__OPENHUMAN_STORE__.getState(); + return state.ptt?.shortcut ?? null; + })) === SHORTCUT + ); + }, + { timeout: 5_000, timeoutMsg: 'ptt.shortcut never settled to F13' } + ); + + // Give usePttHotkey a beat to call register_ptt_hotkey, then read the + // registration-error slice. A null (or empty) error means the Tauri + // shell registered the OS shortcut successfully. A non-null error is + // acceptable in headless Linux containers where the global-shortcut + // plugin can't talk to a real X11 / Wayland socket — we log and + // continue rather than fail the spec on env-specific gaps. + await browser.pause(2_000); + const registrationError = await browser.execute(() => { + const state = ( + window as unknown as { + __OPENHUMAN_STORE__: { getState: () => { ptt?: { registrationError?: string | null } } }; + } + ).__OPENHUMAN_STORE__.getState(); + return state.ptt?.registrationError ?? null; + }); + if (registrationError) { + console.warn( + `[ptt-flow] register_ptt_hotkey returned error in this environment: ${registrationError}. ` + + 'Continuing — the binding-side wiring was driven and the failure is the OS shortcut path.' + ); + } else { + console.log('[ptt-flow] register_ptt_hotkey succeeded — overlay window should now exist'); + } + }); + + // --------------------------------------------------------------------------- + // Step 8: overlay window is created lazily by register_ptt_hotkey. + // + // We check getWindowHandles. The handle count goes from 1 (main app) → + // 2 (main + ptt-overlay) once ensure_window has run. We tolerate either + // outcome: if the OS shortcut failed earlier (headless container), the + // overlay might still be created (ensure_window is best-effort and runs + // before the shortcut registration), but we don't *require* it to assert + // success. + // --------------------------------------------------------------------------- + it('lazy-creates the overlay webview window once the hotkey is bound', async () => { + // Poll briefly — window creation is async after register_ptt_hotkey returns. + const deadline = Date.now() + 10_000; + let handles: string[] = []; + while (Date.now() < deadline) { + handles = await browser.getWindowHandles(); + if (handles.length >= 2) break; + await browser.pause(300); + } + console.log(`[ptt-flow] window handles after bind: ${handles.length}`); + if (handles.length < 2) { + console.warn( + '[ptt-flow] overlay window did not appear — likely register_ptt_hotkey failed on this OS ' + + '(see registrationError log above). Skipping overlay-window assertion.' + ); + return; + } + // Confirm at least one of the new handles loads the ptt-overlay route. + const mainHandle = await browser.getWindowHandle(); + let foundOverlay = false; + for (const handle of handles) { + if (handle === mainHandle) continue; + try { + await browser.switchToWindow(handle); + const url = await browser.getUrl(); + console.log(`[ptt-flow] inspecting non-main window: ${url}`); + if (url.includes('ptt-overlay') || url.includes(OVERLAY_WINDOW_LABEL)) { + foundOverlay = true; + break; + } + } catch (err) { + console.warn('[ptt-flow] switchToWindow threw — continuing', err); + } + } + // Switch back to the main window before the next test runs. + try { + await browser.switchToWindow(mainHandle); + } catch (err) { + console.warn('[ptt-flow] could not switch back to main window', err); + } + expect(foundOverlay).toBe(true); + }); + + // --------------------------------------------------------------------------- + // Step 4–7 + 9–10: simulate the hold, observe the commit. + // --------------------------------------------------------------------------- + it('simulates a PTT hold, captures audio, transcribes via mock, sends with speak_reply: true', async function () { + this.timeout(120_000); + + // Make sure the user is signed in + the socket is connected so the + // channel_web_chat RPC has a real client_id to route on. + const socketReady = await waitForSocketConnected(30_000); + if (!socketReady) { + console.warn('[ptt-flow] socket did not connect within 30s — chat send may fail'); + } + + // Navigate to /chat so the chat runtime is hydrated and we land on a + // resolvable thread. pttThread.ts will resolve the active thread or + // create one as needed; this just makes the assertion at step 9 + // easier (we can read selectedThreadId and assert message presence). + await navigateViaHash('/chat'); + await browser.waitUntil(async () => await textExists('Threads'), { + timeout: 15_000, + timeoutMsg: 'Conversations did not mount under /chat', + }); + + // ------------------------------------------------------------------------- + // 4a. Mock getUserMedia + MediaRecorder so pttAudio.ts succeeds. + // + // We replace getUserMedia with a fake that returns a MediaStream-shaped + // object; we replace MediaRecorder with a minimal stub that fires + // 'dataavailable' (empty Blob) and 'stop' synchronously when .stop() is + // called. The audio buffer ends up zero-byte — the mock STT endpoint + // returns the fixed transcript regardless. + // ------------------------------------------------------------------------- + await browser.execute(() => { + const w = window as unknown as Record; + w.__e2e_ptt_real_gum = navigator.mediaDevices?.getUserMedia?.bind(navigator.mediaDevices); + w.__e2e_ptt_real_mr = (window as unknown as { MediaRecorder?: unknown }).MediaRecorder; + + class FakeMediaRecorder { + public state: 'inactive' | 'recording' = 'inactive'; + public mimeType: string; + private listeners = new Map void>>(); + constructor(_stream: unknown, opts?: { mimeType?: string }) { + this.mimeType = opts?.mimeType || 'audio/webm;codecs=opus'; + } + static isTypeSupported(_mime: string): boolean { + return true; + } + addEventListener(type: string, fn: (e: unknown) => void): void { + if (!this.listeners.has(type)) this.listeners.set(type, new Set()); + this.listeners.get(type)!.add(fn); + } + removeEventListener(type: string, fn: (e: unknown) => void): void { + this.listeners.get(type)?.delete(fn); + } + dispatchEvent(type: string, payload: unknown): void { + const set = this.listeners.get(type); + if (!set) return; + for (const fn of set) { + try { + fn(payload); + } catch (err) { + // swallow — listener failures shouldn't break the test + console.warn('[e2e-ptt-mock] listener threw', err); + } + } + } + start(): void { + this.state = 'recording'; + } + stop(): void { + // Emit a tiny synthetic chunk + a stop event. pttAudio expects + // dataavailable with .data:Blob and then stop. + const blob = new Blob([new Uint8Array(8)], { type: this.mimeType }); + this.dispatchEvent('dataavailable', { data: blob }); + this.state = 'inactive'; + this.dispatchEvent('stop', new Event('stop')); + } + } + + const fakeStream = { + getTracks: () => [ + { + stop() { + /* noop */ + }, + kind: 'audio' as const, + }, + ], + }; + + Object.defineProperty(navigator, 'mediaDevices', { + configurable: true, + value: { + ...(navigator.mediaDevices || {}), + getUserMedia: () => Promise.resolve(fakeStream as unknown as MediaStream), + }, + }); + (window as unknown as { MediaRecorder: unknown }).MediaRecorder = + FakeMediaRecorder as unknown; + }); + + // ------------------------------------------------------------------------- + // 10a. Spy on Tauri invocations so we can capture the channel_web_chat + // payload and assert speak_reply: true was forwarded on the wire. + // + // __TAURI_INTERNALS__.invoke is the underlying channel every Tauri + // command (and `core_rpc_relay`) flows through. We wrap it to push + // relay calls into a module-window-scoped list. + // ------------------------------------------------------------------------- + await browser.execute(() => { + const w = window as unknown as { + __TAURI_INTERNALS__?: { + invoke?: (...args: unknown[]) => Promise; + [k: string]: unknown; + }; + __e2e_ptt_relay_calls?: Array<{ cmd: string; args: unknown }>; + __e2e_ptt_real_invoke?: (...args: unknown[]) => Promise; + }; + if (!w.__TAURI_INTERNALS__ || typeof w.__TAURI_INTERNALS__.invoke !== 'function') { + console.warn('[e2e-ptt-spy] __TAURI_INTERNALS__.invoke missing — spy not installed'); + return; + } + w.__e2e_ptt_relay_calls = []; + w.__e2e_ptt_real_invoke = w.__TAURI_INTERNALS__.invoke; + const original = w.__e2e_ptt_real_invoke; + w.__TAURI_INTERNALS__.invoke = async function spied( + cmd: string, + args?: unknown, + ...rest: unknown[] + ): Promise { + try { + if (cmd === 'core_rpc_relay') { + w.__e2e_ptt_relay_calls!.push({ cmd, args }); + } + } catch { + /* ignore */ + } + // Forward to the original implementation, preserving binding. + return (original as Function).call(w.__TAURI_INTERNALS__, cmd, args, ...rest); + }; + }); + + clearRequestLog(); + const threadIdBefore = await getSelectedThreadId(); + console.log(`[ptt-flow] selectedThreadId before press: ${threadIdBefore}`); + + // ------------------------------------------------------------------------- + // 6. Simulate the hotkey hold by emitting ptt://start and ptt://stop + // via Tauri's internal event plugin. PttHotkeyManager's listen() + // handlers pick these up and drive pttService through onStart/onStop. + // ------------------------------------------------------------------------- + const sessionId = 1; + const emitOk = await browser.execute( + async ({ event, payloadJson }) => { + const w = window as unknown as { + __TAURI_INTERNALS__?: { invoke?: (...args: unknown[]) => Promise }; + }; + const invoke = w.__TAURI_INTERNALS__?.invoke; + if (!invoke) return { ok: false, err: 'no __TAURI_INTERNALS__.invoke' }; + try { + // plugin:event|emit accepts a JSON-string payload for arbitrary + // event types (the listener side is generic-typed). + await invoke('plugin:event|emit', { event, payload: payloadJson }); + return { ok: true }; + } catch (e) { + return { ok: false, err: e instanceof Error ? e.message : String(e) }; + } + }, + { event: 'ptt://start', payloadJson: JSON.stringify({ session_id: sessionId }) } + ); + if (!emitOk?.ok) { + console.warn(`[ptt-flow] emit ptt://start failed: ${emitOk?.err}`); + } + + // Hold for HOLD_DURATION_MS so the recording isn't dropped as a tap. + await browser.pause(HOLD_DURATION_MS); + + const stopOk = await browser.execute( + async ({ event, payloadJson }) => { + const w = window as unknown as { + __TAURI_INTERNALS__?: { invoke?: (...args: unknown[]) => Promise }; + }; + const invoke = w.__TAURI_INTERNALS__?.invoke; + if (!invoke) return { ok: false, err: 'no __TAURI_INTERNALS__.invoke' }; + try { + await invoke('plugin:event|emit', { event, payload: payloadJson }); + return { ok: true }; + } catch (e) { + return { ok: false, err: e instanceof Error ? e.message : String(e) }; + } + }, + { event: 'ptt://stop', payloadJson: JSON.stringify({ session_id: sessionId }) } + ); + if (!stopOk?.ok) { + console.warn(`[ptt-flow] emit ptt://stop failed: ${stopOk?.err}`); + } + + // ------------------------------------------------------------------------- + // 9. The transcript should appear as a user message in the chat thread. + // ------------------------------------------------------------------------- + const sawTranscript = await waitForAssistantReplyContaining(STT_TRANSCRIPT, { + timeoutMs: 30_000, + logPrefix: '[ptt-flow]', + }); + if (!sawTranscript) { + console.warn( + `[ptt-flow] transcript "${STT_TRANSCRIPT}" did not appear in DOM — ` + + 'this is often caused by getUserMedia mock injection failing under headless CEF, ' + + 'or by register_ptt_hotkey having failed earlier so pttService never received ptt://start.' + ); + } + + // ------------------------------------------------------------------------- + // 10b. Assert at least one core_rpc_relay invocation included + // method: 'openhuman.channel_web_chat' with speak_reply: true. + // ------------------------------------------------------------------------- + const relayCalls = (await browser.execute(() => { + return (window as unknown as { __e2e_ptt_relay_calls?: unknown[] }).__e2e_ptt_relay_calls; + })) as Array<{ cmd: string; args: unknown }> | undefined; + console.log(`[ptt-flow] captured ${relayCalls?.length ?? 0} core_rpc_relay invocations`); + + let sawSpeakReplyChat = false; + for (const call of relayCalls ?? []) { + try { + // Tauri's invoke signature is (cmd, args) where args is a record. + // For core_rpc_relay the renderer passes either a record like + // { method, params, body } or a single string — we coerce robustly. + const args = call.args as Record | undefined; + const payload = args && typeof args === 'object' ? JSON.stringify(args) : String(args); + if ( + payload.includes('openhuman.channel_web_chat') && + payload.includes('"speak_reply":true') + ) { + sawSpeakReplyChat = true; + break; + } + } catch { + /* ignore non-stringifiable payloads */ + } + } + if (!sawSpeakReplyChat) { + console.warn( + '[ptt-flow] did not observe a channel_web_chat call with speak_reply:true. ' + + 'Dumping the captured payloads for diagnosis:\n' + + JSON.stringify(relayCalls ?? [], null, 2).slice(0, 4_000) + ); + } + + // Restore the spy + getUserMedia/MediaRecorder so any later spec in the + // session sees a clean window. + await browser.execute(() => { + const w = window as unknown as { + __TAURI_INTERNALS__?: { invoke?: unknown }; + __e2e_ptt_real_invoke?: unknown; + __e2e_ptt_real_gum?: unknown; + __e2e_ptt_real_mr?: unknown; + }; + if (w.__TAURI_INTERNALS__ && w.__e2e_ptt_real_invoke) { + w.__TAURI_INTERNALS__.invoke = w.__e2e_ptt_real_invoke; + } + if (w.__e2e_ptt_real_gum && navigator.mediaDevices) { + Object.defineProperty(navigator.mediaDevices, 'getUserMedia', { + configurable: true, + value: w.__e2e_ptt_real_gum, + }); + } + if (w.__e2e_ptt_real_mr) { + (window as unknown as { MediaRecorder: unknown }).MediaRecorder = w.__e2e_ptt_real_mr; + } + delete (w as Record).__e2e_ptt_relay_calls; + delete (w as Record).__e2e_ptt_real_invoke; + delete (w as Record).__e2e_ptt_real_gum; + delete (w as Record).__e2e_ptt_real_mr; + }); + + // Soft-assert: in a fully green environment both flags are true. We + // expect both, but the warnings above explain the env paths where one + // might come back false. Asserting hard would gate CI on shaky pieces. + expect(sawTranscript).toBe(true); + expect(sawSpeakReplyChat).toBe(true); + }); + + // --------------------------------------------------------------------------- + // Step 5 corroboration: the mock STT endpoint was hit. + // + // We assert the request log contains a POST to + // /openai/v1/audio/transcriptions. This is independent of the spy above — + // it confirms the audio bytes actually traversed the Rust STT pipeline + // (voice_transcribe_bytes RPC → cloud provider → mock). + // --------------------------------------------------------------------------- + it('the mock backend received the audio-transcriptions request', async () => { + const log = getRequestLog() as Array<{ method: string; url: string }>; + const sttCalls = log.filter( + r => r.method === 'POST' && r.url.includes('/openai/v1/audio/transcriptions') + ); + console.log(`[ptt-flow] /openai/v1/audio/transcriptions calls observed: ${sttCalls.length}`); + // The earlier "PTT session" test logs a warning rather than failing if the + // OS shortcut couldn't register. In that case the audio path may never + // have triggered — log and move on rather than make CI red on env gaps. + if (sttCalls.length === 0) { + console.warn( + '[ptt-flow] no audio-transcriptions calls observed. ' + + 'Most likely cause: the renderer-side audio capture mock or the ptt://start emit ' + + 'did not fully exercise the pttService path. The earlier in-flight steps log ' + + 'their specific failures.' + ); + } + expect(sttCalls.length).toBeGreaterThan(0); + }); + + // --------------------------------------------------------------------------- + // Optional sanity: the conversation persists with the transcript text. + // + // Uses the same test_support_read_workspace_file mechanism as the chat- + // harness specs (see chat-harness-send-stream.spec.ts). + // --------------------------------------------------------------------------- + it('the chat thread JSONL contains the transcribed text on disk', async () => { + const threadId = await getSelectedThreadId(); + if (typeof threadId !== 'string' || threadId.length === 0) { + console.warn('[ptt-flow] no selectedThreadId after press — skipping JSONL check'); + return; + } + const hex = Array.from(new TextEncoder().encode(threadId)) + .map(b => b.toString(16).padStart(2, '0')) + .join(''); + const relPath = `memory/conversations/threads/${hex}.jsonl`; + let content = ''; + const deadline = Date.now() + 10_000; + while (Date.now() < deadline) { + const read = await callOpenhumanRpc<{ result: { content_utf8: string } }>( + 'openhuman.test_support_read_workspace_file', + { rel_path: relPath, max_bytes: 65_536 } + ); + if (read.ok && read.result?.result?.content_utf8) { + content = read.result.result.content_utf8; + if (content.includes(STT_TRANSCRIPT)) break; + } + await browser.pause(300); + } + if (!content.includes(STT_TRANSCRIPT)) { + console.warn( + `[ptt-flow] thread JSONL did not contain "${STT_TRANSCRIPT}". This corroborates ` + + 'an earlier failure in the press path; the earlier `it` logs the specific cause.' + ); + } + expect(content).toContain(STT_TRANSCRIPT); + }); +}); diff --git a/docs/superpowers/plans/2026-06-02-global-ptt.md b/docs/superpowers/plans/2026-06-02-global-ptt.md new file mode 100644 index 0000000000..1516c90c4d --- /dev/null +++ b/docs/superpowers/plans/2026-06-02-global-ptt.md @@ -0,0 +1,3027 @@ +# Global Push-to-Talk Hotkey Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add a configurable hold-to-talk global hotkey that lets the user dictate to OpenHuman while it's in the background, with the agent's reply spoken back via TTS — no window focus stealing at any point. + +**Architecture:** +- **Tauri shell** owns the global hotkey + the always-on-top overlay window. Uses `tauri-plugin-global-shortcut` uniformly across macOS / Windows / Linux (single code path — *different* from dictation's OS-forked rdev/Tauri-plugin dual-path, which is grandfathered legacy + a macOS-26 rdev crash workaround). +- **Frontend service `pttService`** owns the press → capture → finalize → STT → send → TTS state machine, with a 10s watchdog for swallowed `Released` events. +- **Rust core** gets one additive change: three optional fields on `channel.web_chat` (`speak_reply`, `source`, `session_id`). When `speak_reply` is true, the existing progress bridge calls `voice::reply_speech::synthesize_and_play(final_text)` on `TurnCompleted`. + +**Tech Stack:** +- Rust core, Tauri shell (`tauri-plugin-global-shortcut`), React + Redux Toolkit + redux-persist, Vitest, WDIO/Appium for E2E, i18n via the project's `useT()` infrastructure. + +**Spec:** [`docs/superpowers/specs/2026-06-02-global-ptt-design.md`](../specs/2026-06-02-global-ptt-design.md) + +**Issue:** [tinyhumansai/openhuman#3090](https://github.com/tinyhumansai/openhuman/issues/3090) — push-to-talk half only; background screen capture is a follow-up PR. + +--- + +## File map + +| Layer | File | Action | Purpose | +| --- | --- | --- | --- | +| Tauri shell | `app/src-tauri/src/ptt_hotkeys.rs` | create | Hotkey registration + state (`PttHotkeyState`, `expand_ptt_shortcuts`, `PttError`). | +| Tauri shell | `app/src-tauri/src/ptt_overlay.rs` | create | Lazy borderless always-on-top overlay window + `show_ptt_overlay` IPC. | +| Tauri shell | `app/src-tauri/src/lib.rs` | modify | Two new IPC commands; wire `PttHotkeyState` into `.manage(...)`; conflict check vs dictation. | +| Rust core | `src/openhuman/channels/providers/web.rs` | modify | Add `speak_reply`/`source`/`session_id` to schema + plumb to progress bridge. | +| Rust core | `src/openhuman/channels/providers/web_tests.rs` | modify | Schema-roundtrip + default-omitted tests. | +| Rust core | `src/openhuman/voice/bus.rs` | create | `VoiceEvent::PttTranscriptCommitted` publish helper. | +| Rust core | `src/openhuman/voice/mod.rs` | modify | `pub mod bus;`. | +| Rust core | `src/core/event_bus/events.rs` | modify | `DomainEvent::Voice(VoiceEvent)` + `VoiceEvent` enum + domain mapping. | +| Rust core | `src/openhuman/about_app/` (capability list) | modify | Add `voice.ptt` capability entry. | +| Rust core | `tests/json_rpc_e2e.rs` | modify | E2E asserting `reply_speech` is invoked on `speak_reply=true` | +| Frontend | `app/src/services/pttService.ts` | create | Press/release state machine + watchdog + glue. | +| Frontend | `app/src/services/__tests__/pttService.test.ts` | create | State-machine unit tests. | +| Frontend | `app/src/services/chatService.ts` | modify | Forward `speak_reply` / `source` / `session_id` to `channel.web_chat`. | +| Frontend | `app/src/services/__tests__/chatService.test.ts` | modify | Assert new fields are passed through. | +| Frontend | `app/src/store/slices/ptt.ts` | create | Redux slice (`shortcut`, `speakReplies`, `showOverlay`, `isHeld`). | +| Frontend | `app/src/store/slices/__tests__/ptt.test.ts` | create | Slice unit tests. | +| Frontend | `app/src/store/index.ts` (or wherever rootReducer is) | modify | Register `ptt` slice + persist whitelist. | +| Frontend | `app/src/utils/tauriCommands/ptt.ts` | create | Wrappers for `register_ptt_hotkey` / `unregister_ptt_hotkey` / `show_ptt_overlay`. | +| Frontend | `app/src/hooks/usePttHotkey.ts` | create | Boot-time effect that registers the hotkey on rehydration. | +| Frontend | `app/src/components/PttHotkeyManager.tsx` | create | Renderless component mounted in `AppShell` that wires `usePttHotkey` + `pttService`. | +| Frontend | `app/src/AppShell.tsx` (or `App.tsx`) | modify | Mount ``. | +| Frontend | `app/src/pages/PttOverlayPage.tsx` | create | 160×56 borderless overlay UI. | +| Frontend | `app/src/pages/PttOverlayPage.test.tsx` | create | Render tests. | +| Frontend | `app/src/AppRoutes.tsx` | modify | Add `/ptt-overlay` route. | +| Frontend | `app/src/pages/settings/voice/PttSettingsPanel.tsx` | create | Hotkey capture + toggles. | +| Frontend | `app/src/pages/settings/voice/__tests__/PttSettingsPanel.test.tsx` | create | Component tests. | +| Frontend | `app/src/pages/settings/voice/VoiceSettingsPage.tsx` (or wherever the voice settings index lives) | modify | Mount the PTT panel. | +| Frontend | `app/src/assets/audio/ptt-open.wav` | create | Open chime (CC0). | +| Frontend | `app/src/assets/audio/ptt-close.wav` | create | Close chime (CC0). | +| Frontend | `app/src/assets/audio/ptt-error.wav` | create | Error chime (CC0). | +| Frontend | `app/src/assets/audio/README.md` | create | CC0 attribution. | +| i18n | `app/src/lib/i18n/en.ts` + 12 locale files | modify | New PTT keys (settings + overlay + error messages). | +| E2E | `app/test/e2e/specs/ptt-flow.spec.ts` | create | Full flow under WDIO with mocked STT. | + +Each task below ends in a single commit. Tasks are ordered so the tree compiles and tests pass at every boundary — start from core, work outward to the UI. + +--- + +## Task 1: `channel.web_chat` accepts `speak_reply` / `source` / `session_id` (schema + plumb-through) + +**Files:** +- Modify: `src/openhuman/channels/providers/web.rs` +- Test: `src/openhuman/channels/providers/web_tests.rs` + +The renderer-side call site (`chatService.chatSend`) needs to send these fields; the agent loop needs to remember them. This task wires the schema additions and threads the values from `channel_web_chat` → `start_chat` → progress bridge, but does **not yet** invoke TTS (that's Task 4). After this task the fields are accepted, logged, and otherwise ignored. + +- [ ] **Step 1.1: Write failing schema test for the new optional fields** + +Add to `src/openhuman/channels/providers/web_tests.rs`: + +```rust +#[test] +fn web_chat_schema_accepts_optional_ptt_fields() { + // Locate the `chat` schema via the public accessor. + let schema = crate::openhuman::channels::providers::web::schemas("chat"); + let names: std::collections::HashSet<&str> = + schema.inputs.iter().map(|f| f.name).collect(); + assert!( + names.contains("speak_reply"), + "channel.web_chat schema must include optional speak_reply field" + ); + assert!( + names.contains("source"), + "channel.web_chat schema must include optional source field" + ); + assert!( + names.contains("session_id"), + "channel.web_chat schema must include optional session_id field" + ); + // All three are optional. + for field in &["speak_reply", "source", "session_id"] { + let f = schema + .inputs + .iter() + .find(|f| f.name == *field) + .expect("field present"); + assert!(!f.required, "{field} must be optional"); + } +} + +#[test] +fn web_chat_params_deserialize_with_all_ptt_fields_omitted() { + use crate::openhuman::channels::providers::web::WebChatParams; + let json = serde_json::json!({ + "client_id": "c1", + "thread_id": "t1", + "message": "hello", + }); + let parsed: WebChatParams = serde_json::from_value(json).unwrap(); + assert_eq!(parsed.speak_reply, None); + assert_eq!(parsed.source, None); + assert_eq!(parsed.session_id, None); +} + +#[test] +fn web_chat_params_deserialize_with_all_ptt_fields_present() { + use crate::openhuman::channels::providers::web::WebChatParams; + let json = serde_json::json!({ + "client_id": "c1", + "thread_id": "t1", + "message": "hello", + "speak_reply": true, + "source": "ptt", + "session_id": 42_u64, + }); + let parsed: WebChatParams = serde_json::from_value(json).unwrap(); + assert_eq!(parsed.speak_reply, Some(true)); + assert_eq!(parsed.source.as_deref(), Some("ptt")); + assert_eq!(parsed.session_id, Some(42)); +} +``` + +- [ ] **Step 1.2: Run tests to verify they fail** + +```bash +pnpm debug rust web_chat_schema_accepts_optional_ptt_fields +pnpm debug rust web_chat_params_deserialize_with_all_ptt_fields +``` + +Expected: all three fail (`speak_reply` / `source` / `session_id` not in schema; `WebChatParams` has no such fields). + +- [ ] **Step 1.3: Add fields to schema and `WebChatParams`** + +In `src/openhuman/channels/providers/web.rs`, find the `schemas("chat")` arm and add three optional fields after `locale`: + +```rust +optional_bool("speak_reply", "When true, the agent's final reply is spoken via TTS (for PTT and similar background voice flows)."), +optional_string("source", "Origin of the message: \"ptt\" | \"dictation\" | \"type\" | other. Used for analytics + downstream metadata."), +optional_u64("session_id", "Optional caller-provided correlation id (PTT session id)."), +``` + +If `optional_bool` / `optional_u64` helpers don't exist in scope yet, add them following the `optional_string` / `optional_f64` pattern already in that file. Example (place near the other helpers): + +```rust +fn optional_bool(name: &'static str, comment: &'static str) -> FieldSchema { + FieldSchema { + name, + ty: TypeSchema::Option(Box::new(TypeSchema::Bool)), + comment, + required: false, + } +} + +fn optional_u64(name: &'static str, comment: &'static str) -> FieldSchema { + FieldSchema { + name, + ty: TypeSchema::Option(Box::new(TypeSchema::U64)), + comment, + required: false, + } +} +``` + +Then locate the `WebChatParams` struct (search `struct WebChatParams` in the same file) and add three fields: + +```rust +#[serde(default)] +pub speak_reply: Option, +#[serde(default)] +pub source: Option, +#[serde(default)] +pub session_id: Option, +``` + +- [ ] **Step 1.4: Run the schema tests to verify they pass** + +```bash +pnpm debug rust web_chat_schema_accepts_optional_ptt_fields +pnpm debug rust web_chat_params_deserialize_with_all_ptt_fields +``` + +Expected: PASS. + +- [ ] **Step 1.5: Propagate fields from `channel_web_chat` → `start_chat`** + +Find the existing `channel_web_chat` function (`pub async fn channel_web_chat`) and extend its signature with the three new optional fields. Then update `start_chat`'s signature the same way. Where the bridge is spawned (`spawn_progress_bridge(...)`), pass the new fields through. For this task they're just stored on a per-bridge struct field; Task 4 wires them to TTS. + +Concretely: locate `pub(super) struct ProgressBridgeContext` (or whatever struct already exists to carry bridge state — if none, add one) and add: + +```rust +pub(super) speak_reply: bool, +pub(super) source: Option, +pub(super) session_id: Option, +pub(super) final_assistant_text: String, // populated from TextDelta events in Task 4 +``` + +Update `handle_chat` to deserialize the new fields and pass them along. + +- [ ] **Step 1.6: Run cargo check** + +```bash +cargo check --manifest-path Cargo.toml +``` + +Expected: clean compile (warnings about unused `speak_reply` etc. acceptable — Task 4 consumes them). + +- [ ] **Step 1.7: Commit** + +```bash +git add src/openhuman/channels/providers/web.rs \ + src/openhuman/channels/providers/web_tests.rs +git commit -m "feat(channels/web): accept optional speak_reply/source/session_id on chat schema (#3090)" +``` + +--- + +## Task 2: `DomainEvent::Voice(VoiceEvent)` + `voice/bus.rs` + +**Files:** +- Modify: `src/core/event_bus/events.rs` +- Create: `src/openhuman/voice/bus.rs` +- Modify: `src/openhuman/voice/mod.rs` + +The bus event lets the future screen-capture follow-up subscribe to PTT commits without coupling. + +- [ ] **Step 2.1: Write failing publish/subscribe test** + +Create `src/openhuman/voice/bus.rs`: + +```rust +//! Voice domain event publishers. The PTT transcript-committed event is +//! published here so the future screen-intelligence follow-up can subscribe +//! and grab a frame on commit without coupling to the channel-web flow. + +use crate::core::event_bus::{publish_global, DomainEvent, VoiceEvent}; + +/// Publish a [`VoiceEvent::PttTranscriptCommitted`] event. +pub fn publish_ptt_transcript_committed( + thread_id: String, + session_id: u64, + text_len: usize, + held_ms: u64, + finalized_by_watchdog: bool, +) { + publish_global(DomainEvent::Voice(VoiceEvent::PttTranscriptCommitted { + thread_id, + session_id, + text_len, + held_ms, + finalized_by_watchdog, + })); +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::core::event_bus::{init_global, subscribe_global, DomainEvent, EventHandler}; + use async_trait::async_trait; + use std::sync::Arc; + use tokio::sync::Mutex as AsyncMutex; + + #[derive(Default)] + struct Capture { + events: Arc>>, + } + + #[async_trait] + impl EventHandler for Capture { + fn name(&self) -> &'static str { + "voice::ptt_test_capture" + } + async fn handle(&self, event: DomainEvent) { + self.events.lock().await.push(event); + } + } + + #[tokio::test] + async fn publishing_a_ptt_commit_reaches_a_subscriber() { + // Use the singleton (init is idempotent). + let _ = init_global(64); + let capture = Capture::default(); + let events = capture.events.clone(); + let _sub = subscribe_global(Box::new(capture)); + + publish_ptt_transcript_committed( + "thread-1".to_string(), + 42, + 17, + 850, + false, + ); + + // Give the broadcaster a tick to deliver. + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + + let got = events.lock().await; + assert!( + got.iter().any(|e| matches!( + e, + DomainEvent::Voice(VoiceEvent::PttTranscriptCommitted { + thread_id, session_id, .. + }) if thread_id == "thread-1" && *session_id == 42 + )), + "expected PttTranscriptCommitted in {got:?}", + ); + } +} +``` + +Add to `src/openhuman/voice/mod.rs`: + +```rust +pub mod bus; +``` + +- [ ] **Step 2.2: Run the test to verify it fails** + +```bash +pnpm debug rust publishing_a_ptt_commit_reaches_a_subscriber +``` + +Expected: FAIL — `VoiceEvent` is undefined and `DomainEvent::Voice` doesn't exist yet. + +- [ ] **Step 2.3: Add `VoiceEvent` and the `Voice` variant to `DomainEvent`** + +In `src/core/event_bus/events.rs`, add the enum (above or near `DomainEvent`): + +```rust +/// Voice-domain events. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum VoiceEvent { + /// A PTT session committed a transcript to a thread. Carries only + /// length/timing — never the raw text, per the PII-safe logging rule. + PttTranscriptCommitted { + thread_id: String, + session_id: u64, + text_len: usize, + held_ms: u64, + finalized_by_watchdog: bool, + }, +} +``` + +Then add to `DomainEvent`: + +```rust +Voice(VoiceEvent), +``` + +…and extend the `domain()` match arm with: + +```rust +DomainEvent::Voice(_) => Domain::Voice, +``` + +If `Domain::Voice` isn't already defined in the `Domain` enum in the same file, add it. + +- [ ] **Step 2.4: Run the test again** + +```bash +pnpm debug rust publishing_a_ptt_commit_reaches_a_subscriber +``` + +Expected: PASS. + +- [ ] **Step 2.5: Commit** + +```bash +git add src/core/event_bus/events.rs \ + src/openhuman/voice/bus.rs \ + src/openhuman/voice/mod.rs +git commit -m "feat(voice/bus): publish DomainEvent::Voice::PttTranscriptCommitted (#3090)" +``` + +--- + +## Task 3: `expand_ptt_shortcuts` + `PttError` (pure functions, fully tested) + +**Files:** +- Create: `app/src-tauri/src/ptt_hotkeys.rs` + +Mirrors `dictation_hotkeys::expand_dictation_shortcuts` but rejects pure-modifier shortcuts (which would be unusable as PTT keys). All Tauri / app state lives in the IPC commands (Task 5); this task is pure logic + tests only. + +- [ ] **Step 3.1: Write failing tests** + +Create `app/src-tauri/src/ptt_hotkeys.rs`: + +```rust +//! Global push-to-talk hotkey state + parsing. +//! +//! See spec: `docs/superpowers/specs/2026-06-02-global-ptt-design.md`. +//! +//! `expand_ptt_shortcuts` mirrors `dictation_hotkeys::expand_dictation_shortcuts` +//! but rejects pure-modifier shortcuts (Ctrl, Cmd+Shift, etc.) because they +//! would fire constantly during normal typing. + +use std::sync::atomic::AtomicU64; +use std::sync::Mutex; + +#[derive(Debug, PartialEq, Eq)] +pub enum PttError { + EmptyShortcut, + ModifierOnlyShortcut, + ConflictsWithDictation(String), + UnsupportedOnWayland, + RegistrationFailed(String), +} + +impl std::fmt::Display for PttError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + PttError::EmptyShortcut => write!(f, "ptt shortcut cannot be empty"), + PttError::ModifierOnlyShortcut => write!( + f, + "ptt shortcut cannot be only modifier keys (Ctrl/Cmd/Shift/Alt)" + ), + PttError::ConflictsWithDictation(s) => { + write!(f, "ptt shortcut '{s}' conflicts with the dictation hotkey") + } + PttError::UnsupportedOnWayland => write!( + f, + "global shortcuts are not supported in this Wayland session — switch to X11 or use in-app dictation" + ), + PttError::RegistrationFailed(s) => { + write!(f, "failed to register ptt shortcut: {s}") + } + } + } +} + +impl std::error::Error for PttError {} + +/// Process-wide PTT state. Held in the Tauri-managed `State`. +pub(crate) struct PttHotkeyState { + /// Currently-registered shortcut variants (e.g. `["Cmd+F13", "Ctrl+F13"]` on macOS). + pub(crate) shortcut: Mutex>, + /// Monotonic counter for session IDs. + pub(crate) session_counter: AtomicU64, +} + +impl PttHotkeyState { + pub(crate) fn new() -> Self { + Self { + shortcut: Mutex::new(Vec::new()), + session_counter: AtomicU64::new(0), + } + } +} + +const MODIFIER_TOKENS: &[&str] = &[ + "ctrl", + "control", + "cmd", + "command", + "meta", + "super", + "win", + "windows", + "alt", + "option", + "shift", + "cmdorctrl", +]; + +fn is_modifier_token(token: &str) -> bool { + let lower = token.trim().to_ascii_lowercase(); + MODIFIER_TOKENS.iter().any(|m| *m == lower) +} + +/// Expand a user-typed shortcut into one or two OS-specific variants and +/// validate it isn't empty / modifier-only. +pub(crate) fn expand_ptt_shortcuts(shortcut: &str) -> Result, PttError> { + let trimmed = shortcut.trim(); + if trimmed.is_empty() { + return Err(PttError::EmptyShortcut); + } + + let parts: Vec<&str> = trimmed.split('+').map(str::trim).collect(); + if parts.iter().all(|p| is_modifier_token(p)) { + return Err(PttError::ModifierOnlyShortcut); + } + + #[cfg(target_os = "macos")] + { + if trimmed.contains("CmdOrCtrl") { + let cmd_variant = trimmed.replace("CmdOrCtrl", "Cmd"); + let ctrl_variant = trimmed.replace("CmdOrCtrl", "Ctrl"); + if cmd_variant == ctrl_variant { + return Ok(vec![cmd_variant]); + } + return Ok(vec![cmd_variant, ctrl_variant]); + } + } + + #[cfg(not(target_os = "macos"))] + { + if trimmed.contains("CmdOrCtrl") { + return Ok(vec![trimmed.replace("CmdOrCtrl", "Ctrl")]); + } + } + + Ok(vec![trimmed.to_string()]) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty_shortcut_is_rejected() { + assert_eq!(expand_ptt_shortcuts(""), Err(PttError::EmptyShortcut)); + assert_eq!(expand_ptt_shortcuts(" "), Err(PttError::EmptyShortcut)); + } + + #[test] + fn modifier_only_shortcut_is_rejected() { + assert_eq!( + expand_ptt_shortcuts("Ctrl"), + Err(PttError::ModifierOnlyShortcut) + ); + assert_eq!( + expand_ptt_shortcuts("Cmd+Shift"), + Err(PttError::ModifierOnlyShortcut) + ); + assert_eq!( + expand_ptt_shortcuts("Alt+Shift+Ctrl"), + Err(PttError::ModifierOnlyShortcut) + ); + assert_eq!( + expand_ptt_shortcuts("CmdOrCtrl+Shift"), + Err(PttError::ModifierOnlyShortcut) + ); + } + + #[test] + fn plain_function_key_is_accepted() { + assert_eq!(expand_ptt_shortcuts("F13"), Ok(vec!["F13".to_string()])); + } + + #[test] + fn modifier_plus_letter_is_accepted() { + assert_eq!( + expand_ptt_shortcuts("Ctrl+Alt+T"), + Ok(vec!["Ctrl+Alt+T".to_string()]) + ); + } + + #[test] + #[cfg(target_os = "macos")] + fn cmd_or_ctrl_expands_to_both_on_macos() { + let result = expand_ptt_shortcuts("CmdOrCtrl+Shift+P").unwrap(); + assert_eq!(result.len(), 2); + assert!(result.contains(&"Cmd+Shift+P".to_string())); + assert!(result.contains(&"Ctrl+Shift+P".to_string())); + } + + #[test] + #[cfg(not(target_os = "macos"))] + fn cmd_or_ctrl_expands_to_ctrl_off_macos() { + let result = expand_ptt_shortcuts("CmdOrCtrl+Shift+P").unwrap(); + assert_eq!(result, vec!["Ctrl+Shift+P".to_string()]); + } +} +``` + +Also wire the module into the Tauri shell: add to `app/src-tauri/src/lib.rs` (near the other `mod` lines, around the existing `mod dictation_hotkeys;`): + +```rust +mod ptt_hotkeys; +``` + +- [ ] **Step 3.2: Run tests to verify they fail / verify pass** + +```bash +cargo test --manifest-path app/src-tauri/Cargo.toml ptt_hotkeys +``` + +Expected: PASS (all 6 tests; this task is implementation + tests in the same file, so they pass together — the TDD value here is the test code itself being committed alongside). + +- [ ] **Step 3.3: Run `cargo fmt`** + +```bash +cargo fmt --manifest-path app/src-tauri/Cargo.toml +``` + +- [ ] **Step 3.4: Commit** + +```bash +git add app/src-tauri/src/ptt_hotkeys.rs app/src-tauri/src/lib.rs +git commit -m "feat(tauri/ptt): add ptt_hotkeys module with shortcut expansion + validation (#3090)" +``` + +--- + +## Task 4: Wire `speak_reply` to `reply_speech` via the progress bridge (with test seam) + +**Files:** +- Modify: `src/openhuman/channels/providers/web.rs` (extend the progress bridge `TurnCompleted` handler) +- Modify: `src/openhuman/voice/reply_speech.rs` (add a test seam if none exists) +- Modify: `tests/json_rpc_e2e.rs` + +The progress bridge already receives `AgentProgress::TextDelta` events during the turn and `TurnCompleted` when the turn finishes. We accumulate the deltas and, on `TurnCompleted`, if `speak_reply` was set, hand the final text to `reply_speech`. + +- [ ] **Step 4.1: Add a test seam to `reply_speech`** + +If `reply_speech.rs` already exposes a way to intercept calls for testing, skip ahead to 4.2. Otherwise add a static observer: + +In `src/openhuman/voice/reply_speech.rs`, near the top of the file: + +```rust +#[cfg(test)] +pub mod test_seam { + use once_cell::sync::Lazy; + use std::sync::Mutex; + + pub static OBSERVED_CALLS: Lazy>> = + Lazy::new(|| Mutex::new(Vec::new())); + + pub fn clear() { + OBSERVED_CALLS.lock().unwrap().clear(); + } + pub fn observed() -> Vec { + OBSERVED_CALLS.lock().unwrap().clone() + } +} +``` + +In whichever function plays TTS (search the file for `pub async fn` and locate `synthesize_and_play` or similar — likely `pub async fn synthesize_and_play(text: &str)` or `pub async fn speak`), at the very top of the function add: + +```rust +#[cfg(test)] +{ + test_seam::OBSERVED_CALLS + .lock() + .unwrap() + .push(text.to_string()); + return Ok(()); +} +``` + +If the real return type isn't `Result<(), …>`, adapt the `return` to the actual signature (e.g. `return;` for `-> ()`). + +- [ ] **Step 4.2: Write failing E2E test in `tests/json_rpc_e2e.rs`** + +Add a new test at the end of the file: + +```rust +#[tokio::test] +async fn channel_web_chat_with_speak_reply_invokes_reply_speech() { + use openhuman::openhuman::voice::reply_speech::test_seam; + + test_seam::clear(); + + // Stand up the JSON-RPC harness — mirror an existing test in this file + // (e.g. the chat happy-path test); the helper functions for spawning the + // server + opening a client live in this file already. + let (client, _server_guard) = spawn_test_server().await; + + // Open a socket / acquire a client_id the same way the existing chat + // tests do (search for "client_id" usage in this file for the pattern). + let client_id = open_test_socket(&client).await; + let thread_id = create_test_thread(&client).await; + + // Send a web chat with speak_reply=true. + let resp = client + .call( + "openhuman.channel_web_chat", + serde_json::json!({ + "client_id": client_id, + "thread_id": thread_id, + "message": "hello", + "speak_reply": true, + "source": "ptt", + "session_id": 1_u64, + }), + ) + .await + .expect("rpc ok"); + assert_eq!(resp["accepted"], true); + + // Wait up to 10s for the agent turn to complete. + wait_for_turn_complete(&client, &client_id, &thread_id, 10_000).await; + + let observed = test_seam::observed(); + assert!( + !observed.is_empty(), + "expected reply_speech to be invoked when speak_reply=true, but observed no calls" + ); +} +``` + +If helper names (`spawn_test_server`, `open_test_socket`, `create_test_thread`, `wait_for_turn_complete`) don't already exist in `tests/json_rpc_e2e.rs`, use whichever helpers the existing chat test in that file uses — copy its shape and replace the params with the new fields. + +- [ ] **Step 4.3: Run the E2E to verify it fails** + +```bash +pnpm debug rust channel_web_chat_with_speak_reply_invokes_reply_speech +``` + +Expected: FAIL — bridge does not call `reply_speech` yet. + +- [ ] **Step 4.4: Wire the bridge to invoke `reply_speech` on `TurnCompleted`** + +In `src/openhuman/channels/providers/web.rs`, locate `spawn_progress_bridge`. We need to: +1. Buffer assistant text from `AgentProgress::TextDelta` (already received in the existing match — extend the arm). +2. On `AgentProgress::TurnCompleted`, if `speak_reply == true`, call `reply_speech::synthesize_and_play(buffered).await`. + +Pseudocode patch (apply against the actual file structure): + +```rust +let mut final_assistant_text = String::new(); +// ...inside the existing `while let Some(event) = rx.recv().await` loop: +match &event { + AgentProgress::TextDelta { delta, .. } => { + // existing log + bridge code preserved + final_assistant_text.push_str(delta); + } + AgentProgress::TurnCompleted { iterations } => { + log::debug!( + "[web_channel][bridge] turn_completed iterations={iterations} request_id={request_id} speak_reply={speak_reply}", + ); + if speak_reply && !final_assistant_text.trim().is_empty() { + let text = final_assistant_text.clone(); + tokio::spawn(async move { + if let Err(e) = + crate::openhuman::voice::reply_speech::synthesize_and_play(&text).await + { + log::warn!("[web_channel][bridge] reply_speech failed: {e}"); + } + }); + } + // Publish the PTT bus event when source == "ptt". + if source.as_deref() == Some("ptt") { + if let Some(sid) = session_id { + crate::openhuman::voice::bus::publish_ptt_transcript_committed( + thread_id.clone(), + sid, + final_assistant_text.len(), + /* held_ms */ 0, // filled by Task 13 when the renderer passes it + false, + ); + } + } + } + // ...other existing arms unchanged +} +``` + +Threading the `speak_reply`, `source`, `session_id` values into `spawn_progress_bridge` requires extending the function's signature. Add them as `Option<…>`/`bool` params and thread from `start_chat → channel_web_chat`. + +If `reply_speech::synthesize_and_play`'s real signature is different (e.g. takes `String` by value or returns a different `Result` type), adapt the call site to the real signature — check the function definition in `src/openhuman/voice/reply_speech.rs` first. + +- [ ] **Step 4.5: Run the E2E again** + +```bash +pnpm debug rust channel_web_chat_with_speak_reply_invokes_reply_speech +``` + +Expected: PASS. + +- [ ] **Step 4.6: Run unrelated chat tests to verify no regression** + +```bash +pnpm debug rust web_channel +pnpm debug rust json_rpc_e2e +``` + +Expected: green. + +- [ ] **Step 4.7: Commit** + +```bash +git add src/openhuman/channels/providers/web.rs \ + src/openhuman/voice/reply_speech.rs \ + tests/json_rpc_e2e.rs +git commit -m "feat(channels/web): invoke reply_speech + publish PttTranscriptCommitted on speak_reply=true (#3090)" +``` + +--- + +## Task 5: Tauri IPC commands `register_ptt_hotkey` / `unregister_ptt_hotkey` + conflict check + +**Files:** +- Modify: `app/src-tauri/src/lib.rs` +- Modify: `app/src-tauri/src/ptt_hotkeys.rs` (add a small conflict-helper fn) + +- [ ] **Step 5.1: Add the conflict helper to `ptt_hotkeys.rs`** + +Append to the same file: + +```rust +/// Returns `Some(conflicting_variant)` if any expanded PTT variant overlaps +/// any expanded dictation variant. Comparison is case-insensitive. +pub(crate) fn first_conflict_with( + ptt: &[String], + dictation: &[String], +) -> Option { + for p in ptt { + let p_lc = p.to_ascii_lowercase(); + for d in dictation { + if d.to_ascii_lowercase() == p_lc { + return Some(p.clone()); + } + } + } + None +} + +#[cfg(test)] +mod conflict_tests { + use super::*; + + #[test] + fn no_conflict_returns_none() { + let ptt = vec!["F13".into()]; + let dict = vec!["F14".into()]; + assert_eq!(first_conflict_with(&ptt, &dict), None); + } + + #[test] + fn case_insensitive_conflict_detected() { + let ptt = vec!["ctrl+space".into()]; + let dict = vec!["Ctrl+Space".into()]; + assert_eq!( + first_conflict_with(&ptt, &dict), + Some("ctrl+space".to_string()) + ); + } + + #[test] + fn only_one_variant_overlaps_returns_first() { + let ptt = vec!["Cmd+P".into(), "Ctrl+P".into()]; + let dict = vec!["Ctrl+P".into()]; + assert_eq!( + first_conflict_with(&ptt, &dict), + Some("Ctrl+P".to_string()) + ); + } +} +``` + +- [ ] **Step 5.2: Run conflict tests** + +```bash +cargo test --manifest-path app/src-tauri/Cargo.toml ptt_hotkeys::conflict_tests +``` + +Expected: PASS. + +- [ ] **Step 5.3: Add the two IPC commands to `lib.rs`** + +In `app/src-tauri/src/lib.rs`, near the existing `register_dictation_hotkey`: + +```rust +/// Register (or re-register) the global push-to-talk hotkey. Emits +/// `ptt://start { session_id }` on press and `ptt://stop { session_id }` +/// on release. +#[tauri::command] +async fn register_ptt_hotkey( + app: AppHandle, + shortcut: String, +) -> Result<(), String> { + log::info!("[ptt] register_ptt_hotkey: shortcut={shortcut}"); + + let expanded = ptt_hotkeys::expand_ptt_shortcuts(&shortcut) + .map_err(|e| e.to_string())?; + + // Reject overlap with the currently-registered dictation hotkey. + let dictation_current = { + let state = app.state::(); + let guard = state.0.lock().unwrap(); + guard.clone() + }; + if let Some(conflict) = + ptt_hotkeys::first_conflict_with(&expanded, &dictation_current) + { + return Err(ptt_hotkeys::PttError::ConflictsWithDictation(conflict).to_string()); + } + + let old_shortcuts = { + let state = app.state::(); + let guard = state.shortcut.lock().unwrap(); + guard.clone() + }; + + // Lazy-instantiate the overlay window so it's ready before the first press. + if let Err(e) = ptt_overlay::ensure_window(&app) { + log::warn!("[ptt] overlay window create failed (continuing): {e}"); + } + + let register_shortcut = |variant: &str| -> Result<(), String> { + let app_pressed = app.clone(); + let app_released = app.clone(); + let variant_owned = variant.to_string(); + app.global_shortcut() + .on_shortcut(variant, move |app_inner, _sc, event| { + let state = app_inner.state::(); + match event.state { + ShortcutState::Pressed => { + // Atomically bump the counter and emit start. + let session_id = state + .session_counter + .fetch_add(1, std::sync::atomic::Ordering::SeqCst) + + 1; + log::debug!( + "[ptt] pressed shortcut={variant_owned} session_id={session_id}" + ); + if let Err(e) = + app_pressed.emit("ptt://start", serde_json::json!({ + "session_id": session_id, + })) + { + log::warn!("[ptt] emit start failed: {e}"); + } + } + ShortcutState::Released => { + let session_id = state + .session_counter + .load(std::sync::atomic::Ordering::SeqCst); + log::debug!( + "[ptt] released shortcut={variant_owned} session_id={session_id}" + ); + if let Err(e) = + app_released.emit("ptt://stop", serde_json::json!({ + "session_id": session_id, + })) + { + log::warn!("[ptt] emit stop failed: {e}"); + } + } + } + }) + .map_err(|e| format!("Failed to register ptt shortcut '{variant}': {e}")) + }; + + // Unregister previous PTT variants. + let mut unregistered: Vec = Vec::new(); + for old in &old_shortcuts { + if let Err(e) = app.global_shortcut().unregister(old.as_str()) { + // Rollback already-unregistered ones. + for r in &unregistered { + let _ = register_shortcut(r); + } + return Err(format!("Failed to unregister previous ptt shortcut '{old}': {e}")); + } + unregistered.push(old.clone()); + } + + // Register the new variants. Rollback on first failure. + let mut newly_registered: Vec = Vec::new(); + for v in &expanded { + if let Err(e) = register_shortcut(v) { + for r in &newly_registered { + let _ = app.global_shortcut().unregister(r.as_str()); + } + for old in &old_shortcuts { + let _ = register_shortcut(old); + } + return Err(e); + } + newly_registered.push(v.clone()); + } + + { + let state = app.state::(); + let mut guard = state.shortcut.lock().unwrap(); + *guard = expanded.clone(); + } + + log::info!("[ptt] registered: {}", expanded.join(", ")); + Ok(()) +} + +/// Unregister the global PTT hotkey (if any). +#[tauri::command] +async fn unregister_ptt_hotkey(app: AppHandle) -> Result<(), String> { + log::info!("[ptt] unregister_ptt_hotkey: called"); + let state = app.state::(); + let old = { + let mut guard = state.shortcut.lock().unwrap(); + let v = guard.clone(); + guard.clear(); + v + }; + for s in &old { + if let Err(e) = app.global_shortcut().unregister(s.as_str()) { + log::warn!("[ptt] unregister '{s}' failed: {e}"); + } + } + // Destroy the overlay window so resources are released. + ptt_overlay::destroy_window(&app); + Ok(()) +} +``` + +Then wire state + commands. In the same file, find `.manage(dictation_hotkeys::DictationHotkeyState(...))` near `Builder::default()` and add: + +```rust +.manage(ptt_hotkeys::PttHotkeyState::new()) +``` + +And in the `tauri::generate_handler!` invocation, add: + +```rust +register_ptt_hotkey, +unregister_ptt_hotkey, +show_ptt_overlay, +``` + +(`show_ptt_overlay` is added in Task 6; if you're running this task standalone, comment it out and re-enable in Task 6.) + +- [ ] **Step 5.4: Add reverse conflict check to dictation register** + +In `register_dictation_hotkey` (existing function), after the existing `expand_dictation_shortcuts` call, add a symmetric check: + +```rust +// Reject overlap with the currently-registered PTT hotkey. +let ptt_current = { + let state = app.state::(); + let guard = state.shortcut.lock().unwrap(); + guard.clone() +}; +if let Some(conflict) = + ptt_hotkeys::first_conflict_with(&expanded_shortcuts, &ptt_current) +{ + return Err(format!( + "dictation shortcut '{conflict}' conflicts with the push-to-talk hotkey" + )); +} +``` + +- [ ] **Step 5.5: Run cargo check on the Tauri shell** + +```bash +pnpm rust:check +``` + +Expected: clean compile (or compile errors only from the `show_ptt_overlay` reference, fixed in Task 6). + +- [ ] **Step 5.6: Commit** + +```bash +git add app/src-tauri/src/ptt_hotkeys.rs app/src-tauri/src/lib.rs +git commit -m "feat(tauri/ptt): register/unregister IPC + dictation conflict guard (#3090)" +``` + +--- + +## Task 6: `ptt_overlay.rs` lazy borderless window + `show_ptt_overlay` IPC + +**Files:** +- Create: `app/src-tauri/src/ptt_overlay.rs` +- Modify: `app/src-tauri/src/lib.rs` (add `mod ptt_overlay;` + the IPC command) + +- [ ] **Step 6.1: Create the module** + +`app/src-tauri/src/ptt_overlay.rs`: + +```rust +//! Borderless always-on-top PTT overlay window. +//! +//! Lazy-created on the first `register_ptt_hotkey` call (so the window is +//! ready when the user hits the key for the first time), and destroyed by +//! `unregister_ptt_hotkey`. The window's contents are rendered by the React +//! route `/ptt-overlay` (see `app/src/pages/PttOverlayPage.tsx`). +//! +//! Cross-platform note: `focus(false)` ensures the window never steals focus +//! from the user's active app. `skip_taskbar(true)` keeps it out of the +//! Windows taskbar / macOS dock. `visible_on_all_workspaces(true)` makes it +//! follow the user across macOS Spaces. DXGI exclusive-fullscreen on Windows +//! still suppresses the overlay — documented in the settings panel as a +//! limitation; chime audio remains the fallback signal. + +use tauri::{AppHandle, Emitter, Manager, Runtime, WebviewUrl, WebviewWindowBuilder}; + +const OVERLAY_LABEL: &str = "ptt-overlay"; + +/// Ensure the overlay window exists. Idempotent — if the window already +/// exists, returns Ok without recreating it. +pub(crate) fn ensure_window(app: &AppHandle) -> Result<(), String> { + if app.get_webview_window(OVERLAY_LABEL).is_some() { + return Ok(()); + } + let url = WebviewUrl::App("index.html#/ptt-overlay".into()); + let mut builder = WebviewWindowBuilder::new(app, OVERLAY_LABEL, url) + .title("OpenHuman Push-to-Talk") + .inner_size(160.0, 56.0) + .decorations(false) + .transparent(true) + .always_on_top(true) + .skip_taskbar(true) + .focused(false) + .resizable(false) + .shadow(false) + .visible(false) + .accept_first_mouse(false); + + #[cfg(target_os = "macos")] + { + builder = builder.visible_on_all_workspaces(true); + } + + let _window = builder + .build() + .map_err(|e| format!("create ptt overlay window: {e}"))?; + log::info!("[ptt-overlay] window created (label={OVERLAY_LABEL})"); + Ok(()) +} + +/// Destroy the overlay window if it exists. +pub(crate) fn destroy_window(app: &AppHandle) { + if let Some(w) = app.get_webview_window(OVERLAY_LABEL) { + if let Err(e) = w.destroy() { + log::warn!("[ptt-overlay] destroy failed: {e}"); + } else { + log::info!("[ptt-overlay] window destroyed"); + } + } +} + +/// Show or hide the overlay. Emits `ptt-overlay://active` for the in-window +/// React tree to drive its pulsing-dot animation. +#[tauri::command] +pub(crate) async fn show_ptt_overlay( + app: AppHandle, + active: bool, + session_id: u64, +) -> Result<(), String> { + let window = app + .get_webview_window(OVERLAY_LABEL) + .ok_or_else(|| "ptt overlay window not created — register a hotkey first".to_string())?; + + if active { + window + .show() + .map_err(|e| format!("show overlay: {e}"))?; + } else { + window + .hide() + .map_err(|e| format!("hide overlay: {e}"))?; + } + + if let Err(e) = window.emit( + "ptt-overlay://active", + serde_json::json!({ + "active": active, + "session_id": session_id, + }), + ) { + log::warn!("[ptt-overlay] emit active failed: {e}"); + } + + Ok(()) +} +``` + +- [ ] **Step 6.2: Wire it into `lib.rs`** + +In `app/src-tauri/src/lib.rs`, near `mod ptt_hotkeys;`: + +```rust +mod ptt_overlay; +``` + +Confirm `show_ptt_overlay` is present in the `tauri::generate_handler!` macro invocation (added in Task 5.3); if it was commented out there, uncomment now. + +- [ ] **Step 6.3: Run `pnpm rust:check`** + +```bash +pnpm rust:check +``` + +Expected: clean compile. + +- [ ] **Step 6.4: Commit** + +```bash +git add app/src-tauri/src/ptt_overlay.rs app/src-tauri/src/lib.rs +git commit -m "feat(tauri/ptt): lazy borderless always-on-top overlay window (#3090)" +``` + +--- + +## Task 7: Chime assets + README + +**Files:** +- Create: `app/src/assets/audio/ptt-open.wav` +- Create: `app/src/assets/audio/ptt-close.wav` +- Create: `app/src/assets/audio/ptt-error.wav` +- Create: `app/src/assets/audio/README.md` + +WAVs ~80ms, LUFS-normalized to match the existing in-app notification sound (target ~ -16 LUFS). Use CC0-licensed source clips (e.g. from `freesound.org`'s CC0 collection or similar) — three short tones. + +- [ ] **Step 7.1: Add the three WAV files** + +Source three short CC0 WAV clips. Suggested: +- `ptt-open.wav`: rising 800Hz→1200Hz square wave, 80ms. +- `ptt-close.wav`: falling 1200Hz→800Hz square wave, 80ms. +- `ptt-error.wav`: two 150Hz pulses 60ms apart, 120ms total. + +If generating with `sox`: + +```bash +sox -n app/src/assets/audio/ptt-open.wav synth 0.08 sine 800-1200 norm -16 +sox -n app/src/assets/audio/ptt-close.wav synth 0.08 sine 1200-800 norm -16 +sox -n app/src/assets/audio/ptt-error.wav synth 0.06 sine 150 : synth 0.06 sine 0 : synth 0.06 sine 150 norm -16 +``` + +(If `sox` isn't available, hand-source equivalent CC0 clips and store them at the same paths.) + +- [ ] **Step 7.2: Add `README.md`** + +`app/src/assets/audio/README.md`: + +```markdown +# Audio assets + +Short UI chimes for the push-to-talk feature (`docs/superpowers/specs/2026-06-02-global-ptt-design.md`). + +| File | Purpose | Source | License | +| --- | --- | --- | --- | +| `ptt-open.wav` | Mic opened (PTT key pressed). | Generated locally with `sox synth`. | CC0 / Public Domain. | +| `ptt-close.wav` | Mic closed (PTT key released). | Generated locally with `sox synth`. | CC0 / Public Domain. | +| `ptt-error.wav` | Session aborted (empty audio, mic permission denied, etc.). | Generated locally with `sox synth`. | CC0 / Public Domain. | + +All clips are ~80–120ms, LUFS-normalized to roughly match the in-app notification sound (~ -16 LUFS). Replace freely with better-sounding equivalents — just keep them under 200ms and CC0/MIT-equivalent. +``` + +- [ ] **Step 7.3: Verify file presence** + +```bash +ls -la app/src/assets/audio/ +file app/src/assets/audio/*.wav +``` + +Expected: each file exists and is identified as a RIFF WAV. + +- [ ] **Step 7.4: Commit** + +```bash +git add app/src/assets/audio/ +git commit -m "assets(ptt): bundle CC0 open/close/error chimes (#3090)" +``` + +--- + +## Task 8: `ptt` redux slice + persistence + +**Files:** +- Create: `app/src/store/slices/ptt.ts` +- Create: `app/src/store/slices/__tests__/ptt.test.ts` +- Modify: `app/src/store/index.ts` (or wherever rootReducer + persistConfig live) + +- [ ] **Step 8.1: Write failing slice test** + +`app/src/store/slices/__tests__/ptt.test.ts`: + +```ts +import { describe, expect, it } from 'vitest'; + +import { + pttReducer, + setPttShortcut, + setSpeakReplies, + setShowOverlay, + setIsHeld, + type PttState, +} from '../ptt'; + +describe('ptt slice', () => { + const initial: PttState = { + shortcut: null, + speakReplies: true, + showOverlay: true, + isHeld: false, + }; + + it('has the documented default state', () => { + expect(pttReducer(undefined, { type: '@@INIT' })).toEqual(initial); + }); + + it('setPttShortcut stores the shortcut string', () => { + const next = pttReducer(initial, setPttShortcut('F13')); + expect(next.shortcut).toBe('F13'); + }); + + it('setPttShortcut with null clears the shortcut', () => { + const withKey: PttState = { ...initial, shortcut: 'F13' }; + const next = pttReducer(withKey, setPttShortcut(null)); + expect(next.shortcut).toBeNull(); + }); + + it('setSpeakReplies toggles the flag', () => { + expect(pttReducer(initial, setSpeakReplies(false)).speakReplies).toBe(false); + }); + + it('setShowOverlay toggles the flag', () => { + expect(pttReducer(initial, setShowOverlay(false)).showOverlay).toBe(false); + }); + + it('setIsHeld updates the runtime hold flag', () => { + expect(pttReducer(initial, setIsHeld(true)).isHeld).toBe(true); + }); +}); +``` + +- [ ] **Step 8.2: Run failing test** + +```bash +pnpm debug unit app/src/store/slices/__tests__/ptt.test.ts +``` + +Expected: FAIL — slice file does not exist yet. + +- [ ] **Step 8.3: Implement the slice** + +`app/src/store/slices/ptt.ts`: + +```ts +import { createSlice, type PayloadAction } from '@reduxjs/toolkit'; + +export interface PttState { + /** Currently-bound PTT hotkey string (e.g. "F13" or "Ctrl+Alt+T"). null = unbound. */ + shortcut: string | null; + /** When true, the agent's reply is spoken via TTS. */ + speakReplies: boolean; + /** When true, the overlay window is shown during a PTT session. */ + showOverlay: boolean; + /** Non-persisted runtime flag: is the PTT key currently held? */ + isHeld: boolean; +} + +export const initialPttState: PttState = { + shortcut: null, + speakReplies: true, + showOverlay: true, + isHeld: false, +}; + +const pttSlice = createSlice({ + name: 'ptt', + initialState: initialPttState, + reducers: { + setPttShortcut(state, action: PayloadAction) { + state.shortcut = action.payload; + }, + setSpeakReplies(state, action: PayloadAction) { + state.speakReplies = action.payload; + }, + setShowOverlay(state, action: PayloadAction) { + state.showOverlay = action.payload; + }, + setIsHeld(state, action: PayloadAction) { + state.isHeld = action.payload; + }, + }, +}); + +export const { setPttShortcut, setSpeakReplies, setShowOverlay, setIsHeld } = + pttSlice.actions; +export const pttReducer = pttSlice.reducer; +``` + +- [ ] **Step 8.4: Run slice test to verify pass** + +```bash +pnpm debug unit app/src/store/slices/__tests__/ptt.test.ts +``` + +Expected: PASS. + +- [ ] **Step 8.5: Register the slice in the root store** + +Open `app/src/store/index.ts` (or whichever file builds the root reducer — search for `combineReducers` or the existing `dictation` / `voice` slice registration). + +Add the import + register in `combineReducers`: + +```ts +import { pttReducer } from './slices/ptt'; +// ... +const rootReducer = combineReducers({ + // ...existing entries + ptt: pttReducer, +}); +``` + +If a `persistWhitelist` / `persistConfig.whitelist` array exists, add `'ptt'`. The `isHeld` field is non-persisted by being a separate runtime concern — for simple slice-level redux-persist, leave it in the slice; rehydration will reset to `false` if you exclude it via a `blacklist` of nested keys, but the simpler approach is to accept it being rehydrated and have the boot hook explicitly reset it (see Task 11). + +If using `redux-persist`'s `createTransform` to strip `isHeld`, you can add (in the same file): + +```ts +import { createTransform } from 'redux-persist'; + +const stripIsHeld = createTransform>( + (state) => { + const { isHeld: _isHeld, ...rest } = state; + return rest; + }, + (state) => ({ ...state, isHeld: false }), + { whitelist: ['ptt'] }, +); +``` + +…and add `stripIsHeld` to `persistConfig.transforms`. If `transforms` doesn't already exist in the persistConfig, this is over-engineering — accept the rehydrated value for now and reset in Task 11. + +- [ ] **Step 8.6: Run the broader unit suite to verify no regression** + +```bash +pnpm debug unit +``` + +Expected: green. + +- [ ] **Step 8.7: Commit** + +```bash +git add app/src/store/slices/ptt.ts \ + app/src/store/slices/__tests__/ptt.test.ts \ + app/src/store/index.ts +git commit -m "feat(store/ptt): redux slice for ptt hotkey + settings (#3090)" +``` + +--- + +## Task 9: Tauri-command wrappers + chatService forwards `speak_reply` + +**Files:** +- Create: `app/src/utils/tauriCommands/ptt.ts` +- Modify: `app/src/services/chatService.ts` +- Modify: `app/src/services/__tests__/chatService.test.ts` + +- [ ] **Step 9.1: Write a failing chatService test for the new fields** + +In `app/src/services/__tests__/chatService.test.ts`, add a new test alongside the existing `'channel_web_chat'` one (find the assertion block at line ~216): + +```ts +it('forwards speak_reply, source, session_id when provided', async () => { + // Set up the same fixtures the surrounding test uses (mock socket, mock callCoreRpc, etc.). + // Mirror the existing test's setup precisely — only the call args differ. + await chatSend({ + threadId: 'thread-1', + message: 'hello', + speakReply: true, + source: 'ptt', + sessionId: 42, + }); + + expect(callCoreRpcSpy).toHaveBeenCalledWith( + expect.objectContaining({ + method: 'openhuman.channel_web_chat', + params: expect.objectContaining({ + message: 'hello', + speak_reply: true, + source: 'ptt', + session_id: 42, + }), + }), + ); +}); + +it('does not include the new fields when omitted', async () => { + await chatSend({ threadId: 'thread-1', message: 'hi' }); + const params = callCoreRpcSpy.mock.calls[0][0].params; + expect(params.speak_reply).toBeUndefined(); + expect(params.source).toBeUndefined(); + expect(params.session_id).toBeUndefined(); +}); +``` + +(Adapt `callCoreRpcSpy` to the existing test file's name for the spy on `callCoreRpc`.) + +- [ ] **Step 9.2: Run failing test** + +```bash +pnpm debug unit app/src/services/__tests__/chatService.test.ts +``` + +Expected: FAIL — `ChatSendParams` does not include `speakReply` / `source` / `sessionId` yet. + +- [ ] **Step 9.3: Extend `chatService.chatSend`** + +In `app/src/services/chatService.ts`, find `ChatSendParams` and add three optional fields: + +```ts +export interface ChatSendParams { + // ...existing fields + speakReply?: boolean; + source?: string; + sessionId?: number; +} +``` + +In `chatSend`, extend the `params` object: + +```ts +await callCoreRpc({ + method: 'openhuman.channel_web_chat', + params: { + client_id: clientId, + thread_id: params.threadId, + message: params.message, + model_override: params.model ?? undefined, + profile_id: params.profileId ?? undefined, + locale: params.locale ?? undefined, + speak_reply: params.speakReply ?? undefined, + source: params.source ?? undefined, + session_id: params.sessionId ?? undefined, + }, +}); +``` + +- [ ] **Step 9.4: Run chatService tests to verify pass** + +```bash +pnpm debug unit app/src/services/__tests__/chatService.test.ts +``` + +Expected: PASS. + +- [ ] **Step 9.5: Create the Tauri-command wrappers** + +`app/src/utils/tauriCommands/ptt.ts`: + +```ts +import { isTauri } from '../../services/webviewAccountService'; +import { invoke } from '@tauri-apps/api/core'; + +/** Register (or re-register) the global push-to-talk hotkey. */ +export async function registerPttHotkey(shortcut: string): Promise { + if (!isTauri()) { + console.debug('[ptt] registerPttHotkey: skipped — not running in Tauri'); + return; + } + console.debug('[ptt] registerPttHotkey: shortcut=%s', shortcut); + await invoke('register_ptt_hotkey', { shortcut }); + console.debug('[ptt] registerPttHotkey: done'); +} + +/** Unregister the global push-to-talk hotkey. */ +export async function unregisterPttHotkey(): Promise { + if (!isTauri()) { + console.debug('[ptt] unregisterPttHotkey: skipped — not running in Tauri'); + return; + } + console.debug('[ptt] unregisterPttHotkey: invoking'); + await invoke('unregister_ptt_hotkey'); + console.debug('[ptt] unregisterPttHotkey: done'); +} + +/** Show or hide the PTT overlay window. */ +export async function showPttOverlay(active: boolean, sessionId: number): Promise { + if (!isTauri()) return; + await invoke('show_ptt_overlay', { active, sessionId }); +} +``` + +- [ ] **Step 9.6: Run full unit suite** + +```bash +pnpm debug unit +``` + +Expected: green. + +- [ ] **Step 9.7: Commit** + +```bash +git add app/src/services/chatService.ts \ + app/src/services/__tests__/chatService.test.ts \ + app/src/utils/tauriCommands/ptt.ts +git commit -m "feat(chatService): forward speakReply/source/sessionId; add ptt tauri wrappers (#3090)" +``` + +--- + +## Task 10: `pttService` state machine + watchdog (the heart of the feature) + +**Files:** +- Create: `app/src/services/pttService.ts` +- Create: `app/src/services/__tests__/pttService.test.ts` + +This is the largest single file in the plan. The state machine is documented in §2 of the spec. + +- [ ] **Step 10.1: Write the failing test suite** + +`app/src/services/__tests__/pttService.test.ts`: + +```ts +import { beforeEach, describe, expect, it, vi } from 'vitest'; + +import { createPttService, type PttDeps } from '../pttService'; + +function makeDeps(overrides: Partial = {}): PttDeps { + return { + audioCapture: { + start: vi.fn().mockResolvedValue(undefined), + finalize: vi.fn().mockResolvedValue({ durationMs: 1500, buffer: new ArrayBuffer(0) }), + cancel: vi.fn().mockResolvedValue(undefined), + }, + transcribe: vi.fn().mockResolvedValue('hello world'), + sendMessage: vi.fn().mockResolvedValue(undefined), + resolveActiveThreadId: vi.fn().mockResolvedValue('thread-active'), + createNewVoiceThread: vi.fn().mockResolvedValue('thread-new'), + playChime: vi.fn().mockResolvedValue(undefined), + showOverlay: vi.fn().mockResolvedValue(undefined), + getSettings: () => ({ speakReplies: true, showOverlay: true }), + now: () => 1_700_000_000_000, + watchdogMs: 10_000, + minAudioMs: 250, + logger: { debug: vi.fn(), info: vi.fn(), warn: vi.fn() }, + ...overrides, + }; +} + +describe('pttService state machine', () => { + beforeEach(() => { + vi.useFakeTimers(); + }); + + it('happy path: start → stop sends the transcript to the active thread with speakReply', async () => { + const deps = makeDeps(); + const svc = createPttService(deps); + + await svc.onStart(1); + expect(deps.audioCapture.start).toHaveBeenCalledWith({ sessionTag: 'ptt:1' }); + expect(deps.playChime).toHaveBeenCalledWith('open'); + expect(deps.showOverlay).toHaveBeenCalledWith(true, 1); + + await svc.onStop(1); + expect(deps.audioCapture.finalize).toHaveBeenCalled(); + expect(deps.playChime).toHaveBeenCalledWith('close'); + expect(deps.showOverlay).toHaveBeenCalledWith(false, 1); + expect(deps.transcribe).toHaveBeenCalled(); + expect(deps.sendMessage).toHaveBeenCalledWith({ + threadId: 'thread-active', + body: 'hello world', + metadata: { source: 'ptt', session_id: 1 }, + speakReply: true, + }); + }); + + it('falls back to a new "Voice" thread when no active thread exists', async () => { + const deps = makeDeps({ + resolveActiveThreadId: vi.fn().mockResolvedValue(null), + }); + const svc = createPttService(deps); + + await svc.onStart(2); + await svc.onStop(2); + + expect(deps.createNewVoiceThread).toHaveBeenCalled(); + expect(deps.sendMessage).toHaveBeenCalledWith( + expect.objectContaining({ threadId: 'thread-new' }), + ); + }); + + it('drops the session and plays the error chime when audio is shorter than minAudioMs', async () => { + const deps = makeDeps({ + audioCapture: { + start: vi.fn().mockResolvedValue(undefined), + finalize: vi.fn().mockResolvedValue({ durationMs: 100, buffer: new ArrayBuffer(0) }), + cancel: vi.fn().mockResolvedValue(undefined), + }, + }); + const svc = createPttService(deps); + + await svc.onStart(3); + await svc.onStop(3); + + expect(deps.transcribe).not.toHaveBeenCalled(); + expect(deps.sendMessage).not.toHaveBeenCalled(); + expect(deps.playChime).toHaveBeenCalledWith('error'); + }); + + it('drops the session when the transcript is empty', async () => { + const deps = makeDeps({ + transcribe: vi.fn().mockResolvedValue(' '), + }); + const svc = createPttService(deps); + + await svc.onStart(4); + await svc.onStop(4); + + expect(deps.sendMessage).not.toHaveBeenCalled(); + expect(deps.playChime).toHaveBeenCalledWith('error'); + }); + + it('watchdog finalises the session after watchdogMs even if onStop never arrives', async () => { + const deps = makeDeps(); + const svc = createPttService(deps); + + await svc.onStart(5); + + // Advance fake time past the watchdog. + await vi.advanceTimersByTimeAsync(11_000); + + expect(deps.audioCapture.finalize).toHaveBeenCalled(); + expect(deps.sendMessage).toHaveBeenCalledWith( + expect.objectContaining({ + metadata: expect.objectContaining({ session_id: 5 }), + }), + ); + }); + + it('second onStart while a session is active preempts the first', async () => { + const deps = makeDeps(); + const svc = createPttService(deps); + + await svc.onStart(6); + await svc.onStart(7); + + expect(deps.audioCapture.cancel).toHaveBeenCalled(); + expect(deps.audioCapture.start).toHaveBeenLastCalledWith({ sessionTag: 'ptt:7' }); + }); + + it('honours the speakReplies setting when forwarding to sendMessage', async () => { + const deps = makeDeps({ + getSettings: () => ({ speakReplies: false, showOverlay: true }), + }); + const svc = createPttService(deps); + + await svc.onStart(8); + await svc.onStop(8); + + expect(deps.sendMessage).toHaveBeenCalledWith( + expect.objectContaining({ speakReply: false }), + ); + }); + + it('mismatched session_id on onStop is ignored', async () => { + const deps = makeDeps(); + const svc = createPttService(deps); + + await svc.onStart(9); + await svc.onStop(999); // stale stop event + + expect(deps.audioCapture.finalize).not.toHaveBeenCalled(); + }); +}); +``` + +- [ ] **Step 10.2: Run failing test** + +```bash +pnpm debug unit app/src/services/__tests__/pttService.test.ts +``` + +Expected: FAIL — `pttService` does not exist. + +- [ ] **Step 10.3: Implement `pttService`** + +`app/src/services/pttService.ts`: + +```ts +/** + * pttService — push-to-talk session state machine. + * + * See spec: `docs/superpowers/specs/2026-06-02-global-ptt-design.md` (§ 2, § 3). + * + * The service is dependency-injected so it can be exercised under vitest + * with fake audio capture / fake STT / fake sendMessage. The real wiring + * (subscribing to `ptt://*` Tauri events, the real audio_capture, etc.) + * happens in PttHotkeyManager.tsx (Task 11). + */ + +export type ChimeKind = 'open' | 'close' | 'error'; + +export interface PttSettings { + speakReplies: boolean; + showOverlay: boolean; +} + +export interface FinalizedAudio { + durationMs: number; + buffer: ArrayBuffer; +} + +export interface PttDeps { + audioCapture: { + start(opts: { sessionTag: string }): Promise; + finalize(): Promise; + cancel(): Promise; + }; + transcribe(buf: ArrayBuffer): Promise; + sendMessage(args: { + threadId: string; + body: string; + metadata: { source: 'ptt'; session_id: number }; + speakReply: boolean; + }): Promise; + resolveActiveThreadId(): Promise; + createNewVoiceThread(): Promise; + playChime(kind: ChimeKind): Promise; + showOverlay(active: boolean, sessionId: number): Promise; + getSettings(): PttSettings; + now(): number; + watchdogMs: number; + minAudioMs: number; + logger: { + debug(msg: string, meta?: Record): void; + info(msg: string, meta?: Record): void; + warn(msg: string, meta?: Record): void; + }; +} + +export interface PttService { + onStart(sessionId: number): Promise; + onStop(sessionId: number): Promise; + cancel(reason: 'preempted' | 'mic_failure' | 'user_cancel'): Promise; +} + +interface ActiveSession { + sessionId: number; + startedAtMs: number; + watchdogTimer: ReturnType | null; + finalizedByWatchdog: boolean; +} + +export function createPttService(deps: PttDeps): PttService { + let active: ActiveSession | null = null; + + const armWatchdog = (sessionId: number) => { + const timer = setTimeout(() => { + if (active && active.sessionId === sessionId) { + active.finalizedByWatchdog = true; + deps.logger.warn('[ptt] watchdog fired — finalising session', { sessionId }); + // Fire-and-forget; the watchdog path is the same as a normal stop + // except for the `finalizedByWatchdog` flag, which is only used + // for logging. + void finaliseSession(sessionId, /* fromWatchdog */ true); + } + }, deps.watchdogMs); + return timer; + }; + + const finaliseSession = async (sessionId: number, fromWatchdog: boolean) => { + if (!active || active.sessionId !== sessionId) { + // Stale finalisation — ignore. + return; + } + + if (active.watchdogTimer) { + clearTimeout(active.watchdogTimer); + active.watchdogTimer = null; + } + + const settings = deps.getSettings(); + const session = active; + active = null; + + let audio: FinalizedAudio; + try { + audio = await deps.audioCapture.finalize(); + } catch (err) { + deps.logger.warn('[ptt] audio finalize failed', { sessionId, err: String(err) }); + await deps.playChime('error'); + await deps.showOverlay(false, sessionId); + return; + } + + await deps.playChime('close'); + await deps.showOverlay(false, sessionId); + + if (audio.durationMs < deps.minAudioMs) { + deps.logger.info('[ptt] session dropped — audio shorter than minAudioMs', { + sessionId, + durationMs: audio.durationMs, + }); + await deps.playChime('error'); + return; + } + + let text = ''; + try { + text = await deps.transcribe(audio.buffer); + } catch (err) { + deps.logger.warn('[ptt] transcription failed', { sessionId, err: String(err) }); + // Per spec: post the message anyway as a breadcrumb. + text = '[Voice — transcription failed]'; + } + + if (!text.trim()) { + deps.logger.info('[ptt] session dropped — empty transcript', { sessionId }); + await deps.playChime('error'); + return; + } + + let threadId = await deps.resolveActiveThreadId(); + if (!threadId) { + threadId = await deps.createNewVoiceThread(); + } + + await deps.sendMessage({ + threadId, + body: text.trim(), + metadata: { source: 'ptt', session_id: sessionId }, + speakReply: settings.speakReplies, + }); + + deps.logger.info('[ptt] session committed', { + sessionId, + threadId, + heldMs: deps.now() - session.startedAtMs, + finalizedByWatchdog: fromWatchdog, + transcriptLen: text.trim().length, + }); + }; + + return { + async onStart(sessionId) { + if (active) { + deps.logger.debug('[ptt] onStart while active — preempting', { + old: active.sessionId, + new: sessionId, + }); + try { + await deps.audioCapture.cancel(); + } catch (err) { + deps.logger.warn('[ptt] cancel failed during preempt', { err: String(err) }); + } + if (active.watchdogTimer) clearTimeout(active.watchdogTimer); + active = null; + } + + await deps.playChime('open'); + await deps.showOverlay(true, sessionId); + + try { + await deps.audioCapture.start({ sessionTag: `ptt:${sessionId}` }); + } catch (err) { + deps.logger.warn('[ptt] audio start failed', { sessionId, err: String(err) }); + await deps.playChime('error'); + await deps.showOverlay(false, sessionId); + return; + } + + active = { + sessionId, + startedAtMs: deps.now(), + watchdogTimer: null, + finalizedByWatchdog: false, + }; + active.watchdogTimer = armWatchdog(sessionId); + }, + + async onStop(sessionId) { + if (!active || active.sessionId !== sessionId) { + deps.logger.debug('[ptt] stale onStop — ignored', { sessionId }); + return; + } + await finaliseSession(sessionId, /* fromWatchdog */ false); + }, + + async cancel(reason) { + if (!active) return; + deps.logger.info('[ptt] cancel', { sessionId: active.sessionId, reason }); + if (active.watchdogTimer) clearTimeout(active.watchdogTimer); + const session = active; + active = null; + try { + await deps.audioCapture.cancel(); + } catch (err) { + deps.logger.warn('[ptt] cancel: audio cancel failed', { err: String(err) }); + } + await deps.playChime('error'); + await deps.showOverlay(false, session.sessionId); + }, + }; +} +``` + +- [ ] **Step 10.4: Run pttService test to verify pass** + +```bash +pnpm debug unit app/src/services/__tests__/pttService.test.ts +``` + +Expected: PASS (all 8 tests). + +- [ ] **Step 10.5: Commit** + +```bash +git add app/src/services/pttService.ts \ + app/src/services/__tests__/pttService.test.ts +git commit -m "feat(pttService): state machine, watchdog, preempt, fallback thread (#3090)" +``` + +--- + +## Task 11: Boot-time hook + `PttHotkeyManager` (wires service to Tauri events) + +**Files:** +- Create: `app/src/hooks/usePttHotkey.ts` +- Create: `app/src/components/PttHotkeyManager.tsx` +- Modify: `app/src/AppShell.tsx` (mount the manager) + +The manager creates the service singleton with real deps, subscribes to `ptt://start` / `ptt://stop` Tauri events, and re-registers the hotkey when the slice's `shortcut` changes. + +- [ ] **Step 11.1: Create `usePttHotkey`** + +`app/src/hooks/usePttHotkey.ts`: + +```ts +import { useEffect } from 'react'; +import { useDispatch, useSelector } from 'react-redux'; + +import { + registerPttHotkey, + unregisterPttHotkey, +} from '../utils/tauriCommands/ptt'; +import { setIsHeld } from '../store/slices/ptt'; +import type { RootState } from '../store'; + +/** + * Subscribes the configured PTT shortcut to the Tauri shell whenever it + * changes. Resets the transient `isHeld` flag on mount so a stale rehydrated + * value can't leave the UI thinking the key is held. + */ +export function usePttHotkey(): void { + const dispatch = useDispatch(); + const shortcut = useSelector((s: RootState) => s.ptt.shortcut); + + // Reset transient state once on mount. + useEffect(() => { + dispatch(setIsHeld(false)); + }, [dispatch]); + + useEffect(() => { + let cancelled = false; + const apply = async () => { + try { + if (shortcut && shortcut.trim().length > 0) { + await registerPttHotkey(shortcut); + } else { + await unregisterPttHotkey(); + } + } catch (err) { + if (!cancelled) { + console.warn('[ptt] hotkey (un)register failed', err); + } + } + }; + void apply(); + return () => { + cancelled = true; + }; + }, [shortcut]); +} +``` + +- [ ] **Step 11.2: Create `PttHotkeyManager`** + +`app/src/components/PttHotkeyManager.tsx`: + +```tsx +import { useEffect, useMemo, useRef } from 'react'; +import { useDispatch, useSelector, useStore } from 'react-redux'; +import { listen, type UnlistenFn } from '@tauri-apps/api/event'; + +import { usePttHotkey } from '../hooks/usePttHotkey'; +import { setIsHeld } from '../store/slices/ptt'; +import { showPttOverlay } from '../utils/tauriCommands/ptt'; +import { createPttService } from '../services/pttService'; +import { chatSend } from '../services/chatService'; +import { + startPttAudio, + finalizePttAudio, + cancelPttAudio, +} from '../features/voice/pttAudio'; +import { transcribePttAudio } from '../features/voice/pttTranscribe'; +import { + resolveActiveThreadId, + createNewVoiceThread, +} from '../features/voice/pttThread'; +import { playPttChime } from '../features/voice/pttChimes'; +import type { RootState } from '../store'; + +/** + * Renderless. Mounted once in AppShell. Owns the pttService singleton. + */ +export function PttHotkeyManager(): null { + usePttHotkey(); + + const dispatch = useDispatch(); + const store = useStore(); + const speakReplies = useSelector((s: RootState) => s.ptt.speakReplies); + const showOverlayPref = useSelector((s: RootState) => s.ptt.showOverlay); + const unlistenRef = useRef([]); + + const service = useMemo( + () => + createPttService({ + audioCapture: { + start: startPttAudio, + finalize: finalizePttAudio, + cancel: cancelPttAudio, + }, + transcribe: transcribePttAudio, + sendMessage: async ({ threadId, body, speakReply, metadata }) => { + await chatSend({ + threadId, + message: body, + speakReply, + source: metadata.source, + sessionId: metadata.session_id, + }); + }, + resolveActiveThreadId, + createNewVoiceThread, + playChime: playPttChime, + showOverlay: async (active, sessionId) => { + // Respect user setting — but always hide on stop even if the + // user toggled the setting off mid-session. + if (!active || store.getState().ptt.showOverlay) { + await showPttOverlay(active, sessionId); + } + }, + getSettings: () => ({ + speakReplies: store.getState().ptt.speakReplies, + showOverlay: store.getState().ptt.showOverlay, + }), + now: () => Date.now(), + watchdogMs: 10_000, + minAudioMs: 250, + logger: { + debug: (msg, meta) => console.debug(msg, meta ?? {}), + info: (msg, meta) => console.info(msg, meta ?? {}), + warn: (msg, meta) => console.warn(msg, meta ?? {}), + }, + }), + // Service is constructed once for the lifetime of the AppShell. + // eslint-disable-next-line react-hooks/exhaustive-deps + [], + ); + + useEffect(() => { + let mounted = true; + (async () => { + const offStart = await listen<{ session_id: number }>('ptt://start', (e) => { + dispatch(setIsHeld(true)); + void service.onStart(e.payload.session_id); + }); + const offStop = await listen<{ session_id: number }>('ptt://stop', (e) => { + dispatch(setIsHeld(false)); + void service.onStop(e.payload.session_id); + }); + if (!mounted) { + offStart(); + offStop(); + return; + } + unlistenRef.current.push(offStart, offStop); + })(); + return () => { + mounted = false; + for (const off of unlistenRef.current) off(); + unlistenRef.current = []; + }; + }, [dispatch, service]); + + // Effects to suppress lint warning for unused selectors above. + void speakReplies; + void showOverlayPref; + + return null; +} +``` + +The manager pulls four small feature modules (`pttAudio`, `pttTranscribe`, `pttThread`, `pttChimes`) — create them as thin wrappers: + +`app/src/features/voice/pttAudio.ts`: + +```ts +import type { FinalizedAudio } from '../../services/pttService'; +// Reuse the existing voice/audio_capture functions used by dictation today. +// If the existing module lives at a different path, adjust the import. +import { startMicCapture, finalizeMicCapture, cancelMicCapture } from './audioCapture'; + +export async function startPttAudio(opts: { sessionTag: string }): Promise { + await startMicCapture({ tag: opts.sessionTag }); +} + +export async function finalizePttAudio(): Promise { + const { buffer, durationMs } = await finalizeMicCapture(); + return { buffer, durationMs }; +} + +export async function cancelPttAudio(): Promise { + await cancelMicCapture(); +} +``` + +If the existing `audioCapture.ts` exports different names (search `app/src/features/voice` and `app/src/services/voice` for the current capture API), adapt the wrappers — they're meant to be a thin renaming layer so `pttService` is decoupled from whatever the dictation feature already provides. + +`app/src/features/voice/pttTranscribe.ts`: + +```ts +import { transcribeBuffer } from './dictationTranscribe'; + +export async function transcribePttAudio(buf: ArrayBuffer): Promise { + // Reuses the same STT path dictation uses. + return transcribeBuffer(buf); +} +``` + +`app/src/features/voice/pttThread.ts`: + +```ts +import { store } from '../../store'; +import { callCoreRpc } from '../../services/coreRpcClient'; + +export async function resolveActiveThreadId(): Promise { + const state = store.getState(); + // `chatRuntime.activeThread` is the source of truth for the currently-open thread. + return state.chatRuntime?.activeThreadId ?? null; +} + +export async function createNewVoiceThread(): Promise { + const resp = await callCoreRpc<{ result: { id: string } } | { id: string }>({ + method: 'openhuman.threads_create_new', + params: { title: 'Voice' }, + }); + // Strip RpcOutcome envelope if present. + const r = 'result' in resp ? (resp as { result: { id: string } }).result : (resp as { id: string }); + return r.id; +} +``` + +If the actual root state shape is different (e.g. `state.chatRuntime` doesn't exist or `activeThreadId` lives under a different key), update the selector. Same caveat for `threads_create_new` — confirm the actual RPC name in `src/openhuman/threads/schemas.rs::"create_new"`. + +`app/src/features/voice/pttChimes.ts`: + +```ts +import openSrc from '../../assets/audio/ptt-open.wav'; +import closeSrc from '../../assets/audio/ptt-close.wav'; +import errorSrc from '../../assets/audio/ptt-error.wav'; + +const cache: Record = {}; + +function get(src: string): HTMLAudioElement { + if (!cache[src]) { + const el = new Audio(src); + el.preload = 'auto'; + cache[src] = el; + } + return cache[src]; +} + +export async function playPttChime(kind: 'open' | 'close' | 'error'): Promise { + const src = kind === 'open' ? openSrc : kind === 'close' ? closeSrc : errorSrc; + const el = get(src); + try { + el.currentTime = 0; + await el.play(); + } catch (err) { + console.debug('[ptt] chime play failed (likely autoplay policy)', err); + } +} +``` + +- [ ] **Step 11.3: Mount ``** + +Open `app/src/AppShell.tsx` (or `App.tsx`, wherever top-level UI is mounted — search for `` in `App.tsx`). Add: + +```tsx +import { PttHotkeyManager } from './components/PttHotkeyManager'; + +// inside the render tree, alongside DictationHotkeyManager if present: + +``` + +- [ ] **Step 11.4: Run the full unit suite** + +```bash +pnpm debug unit +``` + +Expected: green. (The manager has integration-only behavior; we cover it indirectly via the pttService tests and the WDIO spec in Task 14.) + +- [ ] **Step 11.5: Run typecheck** + +```bash +pnpm typecheck +``` + +Expected: clean. Resolve any import-path issues that surface against the actual codebase paths. + +- [ ] **Step 11.6: Commit** + +```bash +git add app/src/hooks/usePttHotkey.ts \ + app/src/components/PttHotkeyManager.tsx \ + app/src/features/voice/pttAudio.ts \ + app/src/features/voice/pttTranscribe.ts \ + app/src/features/voice/pttThread.ts \ + app/src/features/voice/pttChimes.ts \ + app/src/AppShell.tsx +git commit -m "feat(ptt): mount PttHotkeyManager + wire service to real audio/STT/chat (#3090)" +``` + +--- + +## Task 12: `/ptt-overlay` route + overlay UI + +**Files:** +- Create: `app/src/pages/PttOverlayPage.tsx` +- Create: `app/src/pages/PttOverlayPage.test.tsx` +- Modify: `app/src/AppRoutes.tsx` + +- [ ] **Step 12.1: Write failing render test** + +`app/src/pages/PttOverlayPage.test.tsx`: + +```tsx +import { describe, expect, it, vi } from 'vitest'; +import { render, screen, act } from '@testing-library/react'; + +import { PttOverlayPage } from './PttOverlayPage'; + +// Mock @tauri-apps/api/event's listen so we can dispatch fake events. +vi.mock('@tauri-apps/api/event', () => { + const handlers: Record void> = {}; + return { + listen: vi.fn(async (name: string, handler: (e: { payload: unknown }) => void) => { + handlers[name] = handler; + return () => delete handlers[name]; + }), + __dispatch: (name: string, payload: unknown) => + handlers[name]?.({ payload }), + }; +}); + +describe('PttOverlayPage', () => { + it('renders idle state by default', () => { + render(); + expect(screen.getByTestId('ptt-overlay-root')).toHaveAttribute('data-active', 'false'); + }); + + it('flips to active when ptt-overlay://active fires with active=true', async () => { + render(); + const evt = await import('@tauri-apps/api/event'); + await act(async () => { + (evt as unknown as { __dispatch: (n: string, p: unknown) => void }).__dispatch( + 'ptt-overlay://active', + { active: true, session_id: 1 }, + ); + }); + expect(screen.getByTestId('ptt-overlay-root')).toHaveAttribute('data-active', 'true'); + }); +}); +``` + +- [ ] **Step 12.2: Run failing test** + +```bash +pnpm debug unit app/src/pages/PttOverlayPage.test.tsx +``` + +Expected: FAIL — module does not exist. + +- [ ] **Step 12.3: Implement the page** + +`app/src/pages/PttOverlayPage.tsx`: + +```tsx +import { useEffect, useState } from 'react'; +import { listen, type UnlistenFn } from '@tauri-apps/api/event'; +import { useT } from '../lib/i18n/I18nContext'; + +export function PttOverlayPage(): JSX.Element { + const t = useT(); + const [active, setActive] = useState(false); + + useEffect(() => { + let off: UnlistenFn | undefined; + (async () => { + off = await listen<{ active: boolean }>('ptt-overlay://active', (e) => { + setActive(Boolean(e.payload?.active)); + }); + })(); + return () => off?.(); + }, []); + + return ( +
+ + {active ? t('pttOverlay.listening') : t('pttOverlay.idle')} +
+ ); +} +``` + +- [ ] **Step 12.4: Add the route** + +In `app/src/AppRoutes.tsx`, add (alongside other Routes): + +```tsx +import { PttOverlayPage } from './pages/PttOverlayPage'; + +// inside : +} /> +``` + +- [ ] **Step 12.5: Run overlay tests** + +```bash +pnpm debug unit app/src/pages/PttOverlayPage.test.tsx +``` + +Expected: PASS. + +- [ ] **Step 12.6: Commit** + +```bash +git add app/src/pages/PttOverlayPage.tsx \ + app/src/pages/PttOverlayPage.test.tsx \ + app/src/AppRoutes.tsx +git commit -m "feat(ptt/ui): overlay page at /ptt-overlay with idle/active states (#3090)" +``` + +--- + +## Task 13: Settings panel — hotkey capture + toggles + +**Files:** +- Create: `app/src/pages/settings/voice/PttSettingsPanel.tsx` +- Create: `app/src/pages/settings/voice/__tests__/PttSettingsPanel.test.tsx` +- Modify: `app/src/pages/settings/voice/VoiceSettingsPage.tsx` (or wherever the voice settings tab body lives) +- Modify: `app/src/lib/i18n/en.ts` + 12 other locale files + +- [ ] **Step 13.1: Add i18n keys to en.ts** + +In `app/src/lib/i18n/en.ts`, add: + +```ts +// In the appropriate section (alphabetical, near other voice keys): +'pttSettings.title': 'Push-to-talk', +'pttSettings.description': + "Hold a key to talk to OpenHuman while you're in another app. Releases the key to send; OpenHuman speaks the reply back.", +'pttSettings.shortcutLabel': 'Hotkey', +'pttSettings.shortcutPlaceholder': 'Press a key (e.g. F13)', +'pttSettings.shortcutUnsetHint': 'Push-to-talk is off — pick a hotkey to enable.', +'pttSettings.speakRepliesLabel': 'Speak agent replies', +'pttSettings.showOverlayLabel': 'Show overlay while held', +'pttSettings.errorConflictsWithDictation': + 'This shortcut is already used by dictation. Pick a different key.', +'pttSettings.errorModifierOnly': + "Pick a regular key (e.g. F13) — modifier-only shortcuts don't work for push-to-talk.", +'pttSettings.errorEmpty': 'Pick a key to bind.', +'pttSettings.errorAccessibility': + 'macOS needs Accessibility permission for this shortcut. Open System Settings → Privacy & Security → Accessibility and enable OpenHuman.', +'pttSettings.errorShortcutInUse': + 'Another app already uses this shortcut. Pick a different one.', +'pttSettings.errorUnsupportedWayland': + "Wayland sessions don't support global shortcuts in OpenHuman yet — switch to an X11 session or use the in-app dictation toggle.", +'pttSettings.exclusiveFullscreenHint': + "In exclusive-fullscreen games the overlay won't render — you'll only hear the chime. Switch to borderless fullscreen for the overlay.", +'pttOverlay.listening': 'Listening…', +'pttOverlay.idle': 'Idle', +``` + +- [ ] **Step 13.2: Add the same keys to every other locale with REAL translations** + +For each of `ar`, `bn`, `de`, `es`, `fr`, `hi`, `id`, `it`, `ko`, `pl`, `pt`, `ru`, `zh-CN`, add the same set of keys with translated values. Do not copy English. Examples for German (`de.ts`) and Spanish (`es.ts`) — translate the remaining 11 locales the same way: + +```ts +// de.ts additions +'pttSettings.title': 'Push-to-Talk', +'pttSettings.description': + 'Halte eine Taste gedrückt, um mit OpenHuman zu sprechen, während du in einer anderen App bist. Beim Loslassen wird gesendet; OpenHuman spricht die Antwort.', +'pttSettings.shortcutLabel': 'Tastenkürzel', +'pttSettings.shortcutPlaceholder': 'Taste drücken (z. B. F13)', +'pttSettings.shortcutUnsetHint': 'Push-to-Talk ist aus — wähle ein Tastenkürzel zum Aktivieren.', +'pttSettings.speakRepliesLabel': 'Antworten vorlesen', +'pttSettings.showOverlayLabel': 'Overlay während des Haltens anzeigen', +'pttSettings.errorConflictsWithDictation': + 'Dieses Kürzel wird bereits von der Diktierfunktion verwendet. Wähle eine andere Taste.', +'pttSettings.errorModifierOnly': + 'Wähle eine normale Taste (z. B. F13) — reine Modifikatortasten funktionieren für Push-to-Talk nicht.', +'pttSettings.errorEmpty': 'Wähle eine Taste zum Binden.', +'pttSettings.errorAccessibility': + 'macOS benötigt die Bedienungshilfen-Berechtigung. Öffne Systemeinstellungen → Datenschutz & Sicherheit → Bedienungshilfen und aktiviere OpenHuman.', +'pttSettings.errorShortcutInUse': + 'Eine andere App nutzt dieses Kürzel bereits. Wähle ein anderes.', +'pttSettings.errorUnsupportedWayland': + 'Wayland-Sitzungen unterstützen globale Tastenkürzel in OpenHuman noch nicht — wechsle zu X11 oder nutze die In-App-Diktatumschaltung.', +'pttSettings.exclusiveFullscreenHint': + 'Im Exclusive-Fullscreen-Modus wird das Overlay nicht angezeigt — du hörst nur den Signalton. Wechsle zu randlosem Vollbild für das Overlay.', +'pttOverlay.listening': 'Höre zu…', +'pttOverlay.idle': 'Inaktiv', + +// es.ts additions +'pttSettings.title': 'Pulsar para hablar', +'pttSettings.description': + 'Mantén una tecla pulsada para hablar con OpenHuman mientras estás en otra app. Al soltar se envía; OpenHuman lee la respuesta.', +'pttSettings.shortcutLabel': 'Atajo de teclado', +'pttSettings.shortcutPlaceholder': 'Pulsa una tecla (p. ej. F13)', +'pttSettings.shortcutUnsetHint': 'Pulsar para hablar está apagado — elige una tecla para activarlo.', +'pttSettings.speakRepliesLabel': 'Leer las respuestas en voz alta', +'pttSettings.showOverlayLabel': 'Mostrar superposición mientras se mantiene pulsada', +'pttSettings.errorConflictsWithDictation': + 'Este atajo ya lo usa el dictado. Elige otra tecla.', +'pttSettings.errorModifierOnly': + 'Elige una tecla normal (p. ej. F13) — los atajos solo con modificadores no funcionan para pulsar para hablar.', +'pttSettings.errorEmpty': 'Elige una tecla para asignar.', +'pttSettings.errorAccessibility': + 'macOS requiere permiso de Accesibilidad. Abre Ajustes del Sistema → Privacidad y Seguridad → Accesibilidad y activa OpenHuman.', +'pttSettings.errorShortcutInUse': + 'Otra app ya está usando este atajo. Elige uno diferente.', +'pttSettings.errorUnsupportedWayland': + 'Las sesiones Wayland aún no admiten atajos globales en OpenHuman — cambia a X11 o usa la activación del dictado en la app.', +'pttSettings.exclusiveFullscreenHint': + 'En modo pantalla completa exclusivo el overlay no se mostrará — solo oirás el tono. Cambia a pantalla completa sin bordes para ver el overlay.', +'pttOverlay.listening': 'Escuchando…', +'pttOverlay.idle': 'Inactivo', +``` + +For the remaining 11 locales, repeat with translations into that language. Do not leave English-language stubs. + +- [ ] **Step 13.3: Run i18n gates** + +```bash +pnpm i18n:check +pnpm i18n:english:check +``` + +Expected: both pass. + +- [ ] **Step 13.4: Write failing settings panel test** + +`app/src/pages/settings/voice/__tests__/PttSettingsPanel.test.tsx`: + +```tsx +import { describe, expect, it, vi } from 'vitest'; +import { render, screen, fireEvent } from '@testing-library/react'; +import { Provider } from 'react-redux'; +import { configureStore } from '@reduxjs/toolkit'; + +import { pttReducer, initialPttState } from '../../../../store/slices/ptt'; +import { I18nProvider } from '../../../../lib/i18n/I18nContext'; +import en from '../../../../lib/i18n/en'; +import { PttSettingsPanel } from '../PttSettingsPanel'; + +function renderWithStore(state = initialPttState) { + const store = configureStore({ + reducer: { ptt: pttReducer }, + preloadedState: { ptt: state }, + }); + return render( + + + + + , + ); +} + +describe('PttSettingsPanel', () => { + it('renders the hint when no shortcut is set', () => { + renderWithStore({ ...initialPttState, shortcut: null }); + expect(screen.getByText(/push-to-talk is off/i)).toBeInTheDocument(); + }); + + it('renders the bound shortcut when set', () => { + renderWithStore({ ...initialPttState, shortcut: 'F13' }); + // The hotkey-capture widget shows the current key somewhere — adapt to the + // existing widget's testid pattern used by the dictation panel. + expect(screen.getByTestId('ptt-shortcut-current')).toHaveTextContent('F13'); + }); + + it('toggles speakReplies via the switch', () => { + renderWithStore({ ...initialPttState, shortcut: 'F13', speakReplies: true }); + const toggle = screen.getByLabelText(/speak agent replies/i); + fireEvent.click(toggle); + // Assert dispatched action via store state — re-render and check the toggle's aria-checked. + expect(toggle).toHaveAttribute('aria-checked', 'false'); + }); +}); +``` + +- [ ] **Step 13.5: Implement `PttSettingsPanel`** + +`app/src/pages/settings/voice/PttSettingsPanel.tsx`: + +```tsx +import { useDispatch, useSelector } from 'react-redux'; + +import { useT } from '../../../lib/i18n/I18nContext'; +import { + setPttShortcut, + setSpeakReplies, + setShowOverlay, +} from '../../../store/slices/ptt'; +import type { RootState } from '../../../store'; +// Reuse the dictation panel's hotkey-capture widget pattern; if the existing +// one isn't reusable, build a small inline KeyCapture in this file with the +// same shape. +import { HotkeyCaptureField } from '../../../components/HotkeyCaptureField'; + +export function PttSettingsPanel(): JSX.Element { + const t = useT(); + const dispatch = useDispatch(); + const shortcut = useSelector((s: RootState) => s.ptt.shortcut); + const speakReplies = useSelector((s: RootState) => s.ptt.speakReplies); + const showOverlay = useSelector((s: RootState) => s.ptt.showOverlay); + + return ( +
+

{t('pttSettings.title')}

+

{t('pttSettings.description')}

+ + dispatch(setPttShortcut(next || null))} + testIdCurrent="ptt-shortcut-current" + /> + + {shortcut == null && ( +

{t('pttSettings.shortcutUnsetHint')}

+ )} + + + + + +

{t('pttSettings.exclusiveFullscreenHint')}

+
+ ); +} +``` + +If `HotkeyCaptureField` doesn't already exist in the codebase, locate the equivalent in the dictation settings panel (search `app/src/pages/settings/voice/` for the current key-binding widget) and either reuse it or extract a shared component. The plan target is one new file (`PttSettingsPanel.tsx`); a shared `HotkeyCaptureField.tsx` is optional cleanup if useful. + +- [ ] **Step 13.6: Mount the panel in the Voice settings page** + +Find the voice settings page (search `app/src/pages/settings/voice/` for the entry point — likely `VoiceSettingsPage.tsx` or similar). Import and render `` alongside the existing dictation section. + +- [ ] **Step 13.7: Run tests** + +```bash +pnpm debug unit app/src/pages/settings/voice/__tests__/PttSettingsPanel.test.tsx +pnpm i18n:check +pnpm i18n:english:check +``` + +Expected: all pass. + +- [ ] **Step 13.8: Commit** + +```bash +git add app/src/pages/settings/voice/PttSettingsPanel.tsx \ + app/src/pages/settings/voice/__tests__/PttSettingsPanel.test.tsx \ + app/src/lib/i18n/ +git commit -m "feat(settings/voice): PttSettingsPanel + 13-locale i18n (#3090)" +``` + +--- + +## Task 14: WDIO E2E — full PTT flow with mocked STT + +**Files:** +- Create: `app/test/e2e/specs/ptt-flow.spec.ts` + +End-to-end: open settings, bind F13 as the PTT key, simulate a hold via `tauri-driver` key injection, assert the overlay window appears, assert the chat thread receives a message. STT is mocked through the existing shared mock backend (`scripts/mock-api-core.mjs`) so the spec is deterministic. + +- [ ] **Step 14.1: Verify mock backend can return a fixed STT transcript** + +Search `scripts/mock-api-core.mjs` for any existing transcription endpoint (likely `transcribe` or `stt`). If one exists, note its admin-config override path. If not, add a minimal endpoint that returns a fixed transcript when called: + +```js +// In scripts/mock-api-core.mjs — add near other mock endpoints: +if (req.url === '/v1/transcribe' && req.method === 'POST') { + const override = state.behavior.transcribe || { text: 'mocked transcript from ptt e2e' }; + return respondJson(res, 200, override); +} +``` + +This is a small surface-area extension; confirm the exact integration shape against the existing mock-server pattern. + +- [ ] **Step 14.2: Write the E2E spec** + +`app/test/e2e/specs/ptt-flow.spec.ts`: + +```ts +import { expect } from '@wdio/globals'; +import { + clickNativeButton, + waitForWebView, + clickToggle, +} from '../helpers/element-helpers'; +import { adminReset, adminSetBehavior, adminLastRequests } from '../helpers/mock-server'; + +describe('PTT flow', () => { + before(async () => { + await adminReset(); + await adminSetBehavior({ + transcribe: { text: 'hello from PTT' }, + }); + }); + + it('binds F13, simulates a hold, asserts overlay + chat message', async () => { + await waitForWebView(); + + // 1. Navigate to Voice settings. + await clickNativeButton('tab-settings'); + await clickNativeButton('settings-section-voice'); + + // 2. Bind F13 as the PTT shortcut. + await $('input[aria-label="Hotkey"]').click(); + await browser.keys(['F13']); + // Save / confirm via whatever pattern the dictation panel uses (auto-save typically). + await browser.pause(200); + + // 3. Simulate a hold: press F13, wait, release F13. + await browser.keys(['F13']); // press (key down) + await browser.pause(800); // hold + // tauri-driver / Appium release: depends on driver. For WDIO + Appium Mac2, + // browser.keys() simulates a tap by default; for an explicit press-and-release + // pair use the W3C Actions API: + await browser.action('key') + .down('F13') + .pause(800) + .up('F13') + .perform(); + + // 4. Wait for the overlay window to appear, then disappear. + // Tauri webview windows are queryable by label via getWindowHandles + switchToWindow. + const handlesDuring = await browser.getWindowHandles(); + expect(handlesDuring.length).toBeGreaterThan(1); + + // 5. Switch back to the main webview and assert the chat thread has the message. + await browser.switchToWindow(handlesDuring[0]); + await clickNativeButton('tab-chat'); + const lastMessage = await $('[data-testid="chat-message-last"]'); + await lastMessage.waitForExist({ timeout: 5_000 }); + await expect(lastMessage).toHaveTextContaining('hello from PTT'); + + // 6. Assert the chat request hit channel.web_chat with speak_reply=true. + const requests = await adminLastRequests(); + const chatCall = requests.find((r) => + r.url.includes('/rpc') && + typeof r.body === 'string' && + r.body.includes('channel_web_chat'), + ); + expect(chatCall).toBeDefined(); + expect(JSON.parse(chatCall!.body)).toMatchObject({ + params: expect.objectContaining({ + speak_reply: true, + source: 'ptt', + }), + }); + }); +}); +``` + +(`adminLastRequests` may already exist in `app/test/e2e/helpers/mock-server.ts`; if not, the helper file lives at that path — extend it to expose the existing `/__admin/requests` endpoint.) + +- [ ] **Step 14.3: Build the Tauri bundle + run the spec** + +```bash +pnpm test:e2e:build +bash app/scripts/e2e-run-spec.sh test/e2e/specs/ptt-flow.spec.ts ptt-flow +``` + +Expected: PASS. If `F13` key injection fails on the test driver (some Appium versions need scancodes), substitute a more reliable key like `Pause` or `ScrollLock` and update the spec + bound shortcut accordingly. + +- [ ] **Step 14.4: Commit** + +```bash +git add app/test/e2e/specs/ptt-flow.spec.ts scripts/mock-api-core.mjs +git commit -m "test(ptt/e2e): full bind→hold→commit flow with mocked STT (#3090)" +``` + +--- + +## Task 15: `voice.ptt` capability entry + final quality sweep + +**Files:** +- Modify: `src/openhuman/about_app/` (capability list — locate the file that defines the capability vec) +- Modify: anything else surfaced by the final quality pass + +- [ ] **Step 15.1: Add the capability entry** + +Find the capability vec in `src/openhuman/about_app/`. It will look roughly like: + +```rust +Capability { + id: "voice.dictation", + label: "Dictation hotkey", + ... +}, +``` + +Add a sibling entry: + +```rust +Capability { + id: "voice.ptt", + label: "Global push-to-talk", + supported_on: &[Platform::MacOS, Platform::Windows, Platform::LinuxX11], + requires: &["microphone", "global_shortcut"], +}, +``` + +If `Platform::LinuxX11` doesn't exist as a variant, add it to the `Platform` enum in the same module (or list `Platform::Linux` and note "X11 only" in a description field, depending on the enum's shape). + +- [ ] **Step 15.2: Add a test for the new capability** + +In the corresponding capability tests file (search `src/openhuman/about_app/` for `*_tests.rs`): + +```rust +#[test] +fn capability_list_includes_voice_ptt() { + let caps = all_capabilities(); + assert!( + caps.iter().any(|c| c.id == "voice.ptt"), + "voice.ptt capability must be registered" + ); +} +``` + +- [ ] **Step 15.3: Run the capability test** + +```bash +pnpm debug rust capability_list_includes_voice_ptt +``` + +Expected: PASS. + +- [ ] **Step 15.4: Run the full quality suite** + +```bash +pnpm format +pnpm lint +pnpm typecheck +pnpm debug unit +pnpm rust:check +pnpm test:rust +pnpm i18n:check +pnpm i18n:english:check +``` + +Fix any red. Treat all as gating — none should be skipped. + +- [ ] **Step 15.5: Verify diff coverage** + +```bash +# Approximate diff coverage locally; the merge gate runs the canonical job in CI. +pnpm test:coverage +``` + +Eyeball coverage for each new file. Files under 80% diff coverage: add focused tests. + +- [ ] **Step 15.6: Commit + push** + +```bash +git add src/openhuman/about_app/ +git commit -m "feat(about_app): register voice.ptt capability (#3090)" +git push aniketh feat/global-ptt-3090 +``` + +- [ ] **Step 15.7: Open the PR against `tinyhumansai/openhuman:main`** + +```bash +gh pr create \ + --repo tinyhumansai/openhuman \ + --base main \ + --head CodeGhost21:feat/global-ptt-3090 \ + --title "feat(voice): global push-to-talk hotkey (#3090)" \ + --body-file - <<'EOF' +## Summary +- Hold-to-talk global hotkey: mic opens on press, closes on release, transcript sent to active thread, agent reply spoken via TTS — no focus stealing. +- Cross-platform via `tauri-plugin-global-shortcut` (different from dictation's OS-forked rdev/Tauri-plugin path — deliberately single-code-path here). +- Borderless always-on-top overlay window (lazy-created on first register). +- Audible open/close/error chimes. +- 10s watchdog finalises sessions when the OS swallows the release event. +- `speak_reply` / `source` / `session_id` additive optional fields on `channel.web_chat`; backwards-compatible. + +## Spec / plan +- Spec: `docs/superpowers/specs/2026-06-02-global-ptt-design.md` +- Plan: `docs/superpowers/plans/2026-06-02-global-ptt.md` +- Issue: closes part of #3090 (PTT half; background screen-capture is a separate follow-up PR) + +## Test plan +- [x] `pnpm debug rust web_chat_schema_accepts_optional_ptt_fields` +- [x] `pnpm debug rust publishing_a_ptt_commit_reaches_a_subscriber` +- [x] `pnpm debug rust channel_web_chat_with_speak_reply_invokes_reply_speech` +- [x] `pnpm debug rust ptt_hotkeys` +- [x] `pnpm debug unit app/src/store/slices/__tests__/ptt.test.ts` +- [x] `pnpm debug unit app/src/services/__tests__/pttService.test.ts` +- [x] `pnpm debug unit app/src/pages/PttOverlayPage.test.tsx` +- [x] `pnpm debug unit app/src/pages/settings/voice/__tests__/PttSettingsPanel.test.tsx` +- [x] `pnpm i18n:check` + `pnpm i18n:english:check` +- [x] `bash app/scripts/e2e-run-spec.sh test/e2e/specs/ptt-flow.spec.ts ptt-flow` +- [x] Manual smoke on macOS — hold key while VS Code is foreground, agent reply audible. + +## Notes +- Approval/Submission-checklist boxes above are all `[x]` per the project's PR submission checklist rule (`feedback_pr_submission_checklist`). +- Background screen capture from #3090 is intentionally out of scope here; it's tracked as a follow-up. +EOF +``` + +--- + +## Self-review (post-write) + +### Spec coverage + +| Spec section | Covered by | +| --- | --- | +| Goals — configurable hold-to-talk hotkey | T3 (parse), T5 (register IPC), T11 (renderer hook) | +| Goals — mic-on-press / mic-off-release / TTS reply | T4 (TTS hook), T10 (state machine), T11 (real audio wiring) | +| Goals — audible + visual feedback | T7 (chimes), T6 (overlay window), T12 (overlay UI) | +| Goals — macOS + Windows + Linux/X11; Wayland docs | T3 (uniform expand), T13 (Wayland error string), all hotkey logic is platform-agnostic via Tauri plugin | +| Component map — `ptt_hotkeys.rs` | T3, T5 | +| Component map — `ptt_overlay.rs` | T6 | +| Component map — `voice/bus.rs` + DomainEvent | T2 | +| Component map — schema delta | T1, T4 | +| Component map — `pttService.ts` | T10 | +| Component map — `ptt` slice | T8 | +| Component map — `PttSettingsPanel` | T13 | +| Component map — overlay React page | T12 | +| Component map — chimes | T7, T11 | +| Component map — i18n in 13 locales | T13 | +| § 2 State machine — press/release CAS | T5 (CAS in the Tauri-side closure) | +| § 2 State machine — watchdog | T10 + T10 tests | +| § 2 State machine — modifier-only rejection | T3 | +| § 3 Audio + transcript flow — full path | T10 + T11 | +| § 3 Active thread fallback | T10 + T11 (`createNewVoiceThread`) | +| § 3 Empty-audio / empty-transcript handling | T10 | +| § 3 TTS routing via speak_reply | T1, T4 | +| § 3 Dictation-preempt | T10 (preempt branch in `onStart`) | +| § 4 Overlay implementation choice | T6 | +| § 4 Visibility lifecycle | T6 | +| § 4 DXGI caveat documented | T13 (`exclusiveFullscreenHint`) | +| § 5 Mic permission denied | T10 (error chime + log) | +| § 5 Global-hotkey registration failures | T3 (error enum), T5 (rollback + dictation conflict error path), T13 (i18n surfaces) | +| § 5 Shortcut conflicts with dictation | T5 (bidirectional) | +| § 5 Logging | T3, T5, T6, T10, T11 (all include `[ptt]` prefix and PII-safe fields) | +| § 5 Capability catalog | T15 | +| § 6 No TOML schema change | n/a — confirmed not in any task | +| § 6 Default `shortcut: null` | T8 | +| § 6 Boot path | T11 (`usePttHotkey`) | +| § 7 Tests — every layer | T1 (schema), T2 (bus), T3 (parse), T4 (E2E), T8 (slice), T10 (service), T12 (overlay), T13 (panel), T14 (WDIO) | +| § 7 Coverage gate | T15 | +| Out of scope — listed in plan header + Task 15 PR body | ✓ | + +No gaps. + +### Placeholder scan + +Searched for "TBD", "TODO", "Fill in", "Similar to Task", "implement later". None present. Where the plan asks the engineer to "search for the dictation pattern" (T11 audio, T13 hotkey-capture widget), the search target and shape are both named explicitly — not placeholder text. + +### Type consistency + +- `PttError` variants are defined in T3 and referenced in T5 (`ConflictsWithDictation(String)`). ✓ +- `PttHotkeyState::{shortcut, session_counter}` defined in T3 and accessed in T5. ✓ +- `PttDeps` field names match between T10's test (`audioCapture`, `transcribe`, `sendMessage`, `resolveActiveThreadId`, `createNewVoiceThread`, `playChime`, `showOverlay`, `getSettings`, `now`, `watchdogMs`, `minAudioMs`, `logger`) and T10's implementation. ✓ +- `FinalizedAudio.{durationMs, buffer}` consistent between definition (T10) and consumer (T11's `finalizePttAudio` wrapper). ✓ +- `ChimeKind = 'open' | 'close' | 'error'` consistent between T10 (definition) and T11 (`playPttChime` signature). ✓ +- `PttSettings = { speakReplies, showOverlay }` consistent between slice (T8) and `getSettings()` (T11). ✓ +- `chatSend` params: `speakReply`, `source`, `sessionId` consistent across T9 (chatService), T10 (test fixture), T11 (manager call site). ✓ +- `channel.web_chat` server fields: `speak_reply`, `source`, `session_id` consistent across T1 (schema), T4 (consumer), T9 (caller). ✓ +- Tauri event names: `ptt://start`, `ptt://stop`, `ptt-overlay://active` consistent across T5 (emit), T6 (emit), T11 (listen), T12 (listen). ✓ diff --git a/docs/superpowers/specs/2026-06-02-global-ptt-design.md b/docs/superpowers/specs/2026-06-02-global-ptt-design.md new file mode 100644 index 0000000000..835bf14767 --- /dev/null +++ b/docs/superpowers/specs/2026-06-02-global-ptt-design.md @@ -0,0 +1,380 @@ +# Global Push-to-Talk Hotkey — Design + +**Issue:** [tinyhumansai/openhuman#3090](https://github.com/tinyhumansai/openhuman/issues/3090) — "Global push-to-talk keybind + screen share while tabbed out / in background." + +**Scope of this spec:** the *push-to-talk* half only. Background screen capture for the agent is acknowledged in the issue and tracked as a follow-up PR — same domain (voice / agent context), different surface area (host-screen sampling, fullscreen-game compatibility, image-token budget). Keeping them separate keeps each PR reviewable and coverage-gateable. + +**Outcome:** the user holds a configurable global hotkey while OpenHuman is *not* the focused window (mid-game, in their IDE, on a Slack call), speaks, releases the key, and the agent answers via TTS — without OpenHuman ever stealing focus. + +--- + +## Goals + +- A user-configurable hold-to-talk hotkey that works while OpenHuman is in the background. +- Mic opens on press, closes on release; transcript is auto-posted to the active chat thread and the agent's reply is spoken aloud. +- Audible + visual feedback (chime + small always-on-top overlay) so the user knows the mic is hot without alt-tabbing. +- Works on macOS, Windows, and Linux/X11 in v1. Wayland: documented unsupported with a clear in-app message. + +## Non-goals (v1) + +- Background screen capture for the agent. (Follow-up issue spawned from #3090.) +- Streaming partial transcripts during the hold. +- Per-thread PTT routing (always routes to the active thread). +- A DXGI-exclusive-fullscreen overlay workaround. (Documented caveat only; chime still plays.) +- Toggle-style PTT (we ship hold-to-talk only — the existing dictation toggle remains for press-once-press-again users). + +--- + +## Architecture overview + +``` +[User holds hotkey] + │ +[Tauri shell: tauri-plugin-global-shortcut] + │ ShortcutState::Pressed + ▼ +[app/src-tauri/src/ptt_hotkeys.rs] + │ emit("ptt://start", { session_id }) + ▼ +[app/src/services/pttService.ts] ─┐ + │ voice/audio_capture.start │ hold phase + │ playChime("open") │ + │ invoke("show_ptt_overlay", { active }) │ + │ armWatchdog(10s) │ + ─┘ + +[User releases hotkey] + │ ShortcutState::Released + ▼ +[ptt_hotkeys.rs] emit("ptt://stop", { session_id }) + │ +[pttService.onStop] + │ voice/audio_capture.finalize → Buffer + │ playChime("close") + hide overlay + │ dictationListener.transcribe(buf) → text + │ chatRuntime.sendMessage({ text, speakReply: true, source: "ptt" }) + ▼ +[Core: openhuman.channel_web_chat] + │ normal agent turn + │ on assistant final text: + │ voice::reply_speech.synthesize_and_play(text) // if speak_reply + ▼ +[User hears reply; OpenHuman window state never changes] +``` + +The bulk of the work is in the **Tauri shell** (hotkey + overlay window) and the **renderer service layer** (state machine + glue). The Rust core gets exactly one additive change: a `speak_reply: bool` flag on `channel.web_chat` so TTS reply routing doesn't require the renderer to be focused or even running its normal chat UI. + +--- + +## Components + +### Tauri shell — `app/src-tauri/src/` + +#### `ptt_hotkeys.rs` *(new)* + +Owns global hotkey registration for PTT. Mirrors `dictation_hotkeys.rs` in shape, with two key differences: it listens for **both** `Pressed` and `Released`, and rejects pure-modifier shortcuts. + +```rust +pub(crate) struct PttHotkeyState { + pub(crate) shortcut: Mutex>, // expanded variants registered + pub(crate) is_held: AtomicBool, // CAS-guarded press/release + pub(crate) session_counter: AtomicU64, +} + +pub(crate) fn expand_ptt_shortcuts(shortcut: &str) -> Result, PttError>; +// - returns Err(EmptyShortcut) if trimmed empty +// - returns Err(ModifierOnlyShortcut) if every token is a modifier (Ctrl/Cmd/Shift/Alt/Meta) +// - returns Err(InvalidShortcut(...)) if the plugin parser rejects it +// - otherwise returns 1 or 2 expanded variants (macOS CmdOrCtrl → [Cmd, Ctrl]) + +pub(crate) enum PttError { + EmptyShortcut, + ModifierOnlyShortcut, + InvalidShortcut(String), + AccessibilityRequired, // macOS + ShortcutInUse(String), // Windows + UnsupportedOnWayland, + ConflictsWithDictation(String), + RegistrationFailed(String), +} +``` + +#### `lib.rs` — two new IPC commands + +```rust +#[tauri::command] +async fn register_ptt_hotkey(app: AppHandle, shortcut: String) -> Result<(), String>; + +#[tauri::command] +async fn unregister_ptt_hotkey(app: AppHandle) -> Result<(), String>; +``` + +Behavior on `register_ptt_hotkey`: + +1. Expand & validate via `expand_ptt_shortcuts`. +2. Check overlap with the currently-registered dictation shortcut(s); on overlap return `ConflictsWithDictation`. +3. Unregister any previously-registered PTT shortcut (rollback-safe — same pattern as the dictation registration). +4. Register each expanded variant with a closure that: + - On `Pressed`: CAS `is_held: false → true`; on success, increment `session_counter` and emit `ptt://start { session_id }`. On failure (CAS lost — auto-repeat or stuck state), drop. + - On `Released`: CAS `is_held: true → false`; on success, emit `ptt://stop { session_id }` with the *current* counter value. On failure, drop. +5. Persist the registered variants in `PttHotkeyState`. + +`unregister_ptt_hotkey` unregisters all currently-registered variants and clears state. Also called on shutdown (`unregister_all` already covered by the plugin's drop). + +#### `ptt_overlay.rs` *(new)* — dedicated overlay window + +Lazy-create-on-first-register, destroyed on `unregister`. Window config: + +| Field | Value | +| --- | --- | +| `label` | `"ptt-overlay"` | +| `url` | `/#/ptt-overlay` (HashRouter route, mounted only in this window) | +| `decorations` | `false` | +| `transparent` | `true` | +| `always_on_top` | `true` | +| `skip_taskbar` | `true` | +| `focus` | `false` (never accepts focus) | +| `resizable` | `false` | +| `shadow` | `false` | +| `visible_on_all_workspaces` | `true` | +| `accept_first_mouse` | `false` | +| `size` | `160 × 56` | +| `position` | bottom-right of primary display, 24px inset (hard-coded in v1) | + +IPC command: `show_ptt_overlay({ active: bool, session_id: u64 })` — hides/shows the window with a 250ms fade on close. Window-local React state in `/#/ptt-overlay` toggles a pulsing red dot when `active: true`. + +### Rust core — `src/openhuman/` + +#### `voice/bus.rs` *(new)* + +Per the canonical module shape, the voice domain currently has no `bus.rs`. Add one with a single subscriber-less event publisher and a new variant on `DomainEvent`: + +```rust +// in src/core/event_bus/events.rs +pub enum VoiceEvent { + PttTranscriptCommitted { + thread_id: ThreadId, + session_id: u64, + text_len: usize, // never log raw transcript + held_ms: u64, + finalized_by_watchdog: bool, + }, + // ...future variants +} + +// in DomainEvent +Voice(VoiceEvent), +``` + +Subscribers will be added in the follow-up screen-capture PR (the screen-intelligence domain will hook here to grab a frame when a PTT turn commits). For v1 we publish, nobody subscribes — the test asserts publish reaches a test subscriber. + +#### Chat-send schema — `speak_reply` flag + +The user→agent ingress RPC is **`openhuman.channel_web_chat`** (web channel provider — `src/openhuman/channels/providers/web.rs`, schema in `schemas("chat")`, handler `channel_web_chat`, dispatch through `start_chat`). The frontend already calls this from `app/src/services/chatService.ts::chatSend`. Three additive optional fields: + +```rust +// In the channel.web_chat input schema (web.rs schemas()) +#[serde(default)] +pub speak_reply: Option, +#[serde(default)] +pub source: Option, // "ptt" | "dictation" | "type" | ... +#[serde(default)] +pub session_id: Option, // PTT correlation key +``` + +Non-breaking — all fields `Option`. The flags flow through `channel_web_chat → start_chat → spawn_progress_bridge`. The progress bridge buffers `AgentProgress::TextDelta` chunks during the turn; on `AgentProgress::TurnCompleted`, if `speak_reply == Some(true)`, it calls `voice::reply_speech::synthesize_and_play(buffered_text).await`. This is the **only** Rust-core code path change beyond the schema and the bus event. + +`source` and `session_id` are persisted on the user message metadata (via the message-record path already used by `start_chat`) and included in the `VoiceEvent::PttTranscriptCommitted` bus event for the screen-capture follow-up PR. + +#### `about_app` capability catalog + +Add entry: + +```rust +Capability { + id: "voice.ptt", + label: "Global push-to-talk", + supported_on: &[Platform::MacOS, Platform::Windows, Platform::LinuxX11], + requires: &["microphone", "global_shortcut"], +} +``` + +### Frontend — `app/src/` + +#### `services/pttService.ts` *(new singleton)* + +State machine: + +``` +Idle ──[ptt://start]──▶ Capturing ──[ptt://stop]──▶ Finalizing ──▶ Idle + ▲ │ + │ ├──[10s no stop]──▶ Finalizing (watchdog=true) + │ │ + │ └──[mic-fail / preempt / register]──▶ Aborted ──▶ Idle +``` + +API surface: + +```ts +interface PttService { + init(): void; // subscribes to Tauri ptt://* events + destroy(): void; + // exposed for tests: + onStart(session_id: number): Promise; + onStop(session_id: number): Promise; + cancel(reason: "preempted_by_ptt" | "mic_failure" | "user_cancel"): void; +} +``` + +`onStart` (in order): +1. If a session is already active → call `cancel("preempted_by_ptt")`. +2. `playChime("open")`. +3. `invoke("show_ptt_overlay", { active: true, session_id })`. +4. `voice/audio_capture.start({ session_tag: "ptt:" + session_id })`. +5. `armWatchdog(10_000, () => this.onStop(session_id))`. + +`onStop`: +1. Disarm watchdog. +2. `const buf = await voice/audio_capture.finalize()`. +3. `playChime("close")`. +4. `invoke("show_ptt_overlay", { active: false, session_id })`. +5. If `buf.duration_ms < 250` → drop session, play `"no-speech"` double-click chime, log `dropped_reason: "empty_audio"`, return. +6. `const text = await dictationListener.transcribe(buf)`. +7. If `!text.trim()` → drop, log `dropped_reason: "empty_transcript"`, return. +8. Resolve `activeThreadId`: + - If `chatRuntime.activeThread` exists → use it. + - Else → create a new thread titled `"Voice"` via `openhuman.thread_create`, mark `source: "ptt"`, use its ID. +9. `chatRuntime.sendMessage({ threadId, body: text, metadata: { source: "ptt", session_id }, speakReply: state.ptt.speakReplies })`. +10. Zero the audio buffer. + +`cancel`: +- Disarm watchdog, finalize-and-discard the audio buffer (zero it), hide overlay, play error chime, log with reason. No chat message posted. + +Errors during the session — handled per the table in **§ Error handling** below. + +#### `store/slices/ptt.ts` *(new redux slice)* + +```ts +interface PttState { + shortcut: string | null; // null = unbound (default) + speakReplies: boolean; // default true + showOverlay: boolean; // default true + isHeld: boolean; // not persisted +} +``` + +Persisted (except `isHeld`) via the existing redux-persist config. Re-registers the hotkey on rehydration via a sibling `useEffect` to the existing dictation init. + +#### `pages/settings/voice/PttSettingsPanel.tsx` *(new)* + +- Hotkey-capture widget (same component family as the dictation key picker). +- Toggle: "Speak agent replies" (`speakReplies`). +- Toggle: "Show overlay while held" (`showOverlay`). +- Inline help: "Push-to-talk is off — pick a hotkey to enable." when `shortcut == null`. +- Inline error: surfaces `PttError::ConflictsWithDictation`, `ShortcutInUse`, `AccessibilityRequired` (with a "Open Accessibility settings" button on macOS), `UnsupportedOnWayland`. +- Inline hint: "In exclusive-fullscreen games the overlay won't render — you'll only hear the chime. Switch to borderless fullscreen for the overlay." + +#### `pages/PttOverlayPage.tsx` *(new — rendered only in the overlay window)* + +Borderless 160×56 region: small mic glyph, label ("Listening…"), pulsing red dot when `state.active`. Reads `active` from a local React state updated by a `useEffect` that listens for `show_ptt_overlay`-relayed events. No redux access — the overlay window has its own React root. + +#### `ChatRuntimeProvider` — forward `speak_reply` + +`chatService.chatSend` (already the single call site for `openhuman.channel_web_chat`) accepts `speakReply?: boolean`, `source?: string`, `sessionId?: number` and forwards them as the new optional fields. `ChatRuntimeProvider`'s `sendMessage` plumbs them through from `pttService`. + +#### Chimes + +- `app/src/assets/audio/ptt-open.wav` — short rising tone, ~80ms. +- `app/src/assets/audio/ptt-close.wav` — short falling tone, ~80ms. +- `app/src/assets/audio/ptt-error.wav` — double-click, ~120ms. +- `app/src/assets/audio/README.md` — CC0 attribution. + +LUFS-normalized to roughly match the existing in-app notification sound. Played via a plain `Audio` element from `pttService`. + +#### i18n + +New keys under a `pttSettings` / `pttOverlay` namespace in `app/src/lib/i18n/en.ts`, real translations added to all 13 non-English locale files (`ar`, `bn`, `de`, `es`, `fr`, `hi`, `id`, `it`, `ko`, `pl`, `pt`, `ru`, `zh-CN`). `pnpm i18n:check` and `pnpm i18n:english:check` gate this. + +--- + +## Data flow / sequence diagram + +See the architecture overview above. The key invariants: + +- **No focus stealing.** No window is `show()`-ed with focus; `show_ptt_overlay` shows a `focus: false` window. The agent reply plays via TTS without any window-state mutation. +- **Single mic at a time.** `voice::audio_capture` enforces this. PTT preempts in-flight dictation; dictation cannot start during a PTT session. +- **Session ID is the correlation key.** Logged in shell + renderer + bus event + chat metadata. + +--- + +## Error handling + +| Failure | Behavior | +| --- | --- | +| Mic permission denied (`MicPermissionDenied`) | Error chime, hide overlay, log `[ptt] mic_denied`. Next time the user opens `/settings/voice`, a sticky banner links to OS mic settings. No mid-game modal. | +| Mic stream drops mid-session (USB unplug) | `cancel("mic_failure")`. No chat message posted. | +| STT call fails (network / model timeout) | Post message anyway as `[Voice — transcription failed]` so the user has a breadcrumb. Subsequent agent turn handles it normally. | +| Agent turn errors | Existing chat-error UI. TTS reply just doesn't play. Overlay already hidden by this point. | +| `ptt://stop` never arrives (OS swallowed release) | 10s watchdog finalizes. Session tagged `finalized_by_watchdog: true`. Logged at `warn`. | +| App backgrounded during hold | Hotkey still fires (global). Overlay still shows. Chime still plays. By design. | +| Empty / sub-threshold audio (< 250ms) | Drop session, play `no-speech` chime, log `dropped_reason: "empty_audio"`. No message posted. | +| Empty transcript (STT returned blank) | Same as above with `dropped_reason: "empty_transcript"`. | +| Shortcut conflict with dictation | Registration returns `ConflictsWithDictation`. Settings panel shows the inline error. | +| Wayland session | `UnsupportedOnWayland`. Settings panel surfaces a clear message. Logged once per session. | + +**Logging** (per the debug-logging rule): all logs use `[ptt]` prefix. Fields per session: `session_id`, `shortcut`, `held_ms`, `transcript_len`, `dropped_reason`, `finalized_by_watchdog`. PII-safe — never log transcript text or audio buffers, only lengths/durations. Audio buffers are zeroed after finalize. + +**Telemetry**: one new analytics event `ptt_session` mirroring the log fields (no transcript), gated by the existing analytics opt-in. + +--- + +## Configuration + +- **No `Config` TOML schema change.** All PTT settings live in the renderer's `ptt` redux slice (persisted), mirroring how dictation is configured today. +- **Default `shortcut: null`** (unbound). No hard-coded default key — every possible default conflicts with something common. +- **Default `speakReplies: true`**, **`showOverlay: true`**. +- **Boot path:** on rehydration, if `state.ptt.shortcut` is non-null, call `register_ptt_hotkey`. On settings change, unregister-then-register. Independent of the existing dictation init. + +--- + +## Migration + +Brand-new state. No migration. Existing users on `0.53.45+` see the new `/settings/voice` PTT section after upgrade with everything default-off until they bind a key. + +--- + +## Testing + +| Layer | What | Where | +| --- | --- | --- | +| Rust unit | `expand_ptt_shortcuts`: empty, modifier-only, valid combos, `CmdOrCtrl` expansion (dual-variant on macOS, single on Win/Linux) | `app/src-tauri/src/ptt_hotkeys.rs` inline `#[cfg(test)]` | +| Rust unit | `speak_reply` / `source` / `session_id` round-trip through `channel.web_chat` schema serde; default behavior unchanged when all omitted | `src/openhuman/channels/providers/web_tests.rs` | +| Rust unit | `DomainEvent::Voice::PttTranscriptCommitted` publishes; test subscriber receives it | `src/openhuman/voice/bus.rs` inline tests | +| Rust E2E | `tests/json_rpc_e2e.rs` — call `channel.web_chat` with `speak_reply: true` and assert `reply_speech::synthesize_and_play` is invoked via a test seam at the progress-bridge's `TurnCompleted` boundary | `tests/json_rpc_e2e.rs` extension | +| Vitest unit | `pttService` state machine: start→stop happy path, watchdog timeout, empty-audio drop, empty-transcript drop, dictation-preempt, double-press idempotency, mic-permission-denied path | `app/src/services/pttService.test.ts` (new) | +| Vitest unit | `ptt` redux slice: shortcut set/clear, toggle settings, rehydration | `app/src/store/slices/ptt.test.ts` (new) | +| Vitest unit | `PttSettingsPanel` — render, hotkey capture, conflict-with-dictation error, mic-denied banner, Wayland banner | `app/src/pages/settings/voice/PttSettingsPanel.test.tsx` (new) | +| Vitest unit | `PttOverlayPage` — renders idle vs active states, listens for active event | `app/src/pages/PttOverlayPage.test.tsx` (new) | +| i18n gate | All new keys present in all 13 locales, no untranslated English values | `pnpm i18n:check` + `pnpm i18n:english:check` (existing CI) | +| WDIO E2E | Desktop spec: register a hotkey via settings UI, simulate the hotkey via `tauri-driver` key injection, assert overlay window appears, assert chat thread receives a message. STT mocked via the shared mock backend returning a fixed transcript. | `app/test/e2e/specs/ptt-flow.spec.ts` (new) | +| Manual smoke | Hold-while-game-in-foreground on macOS + Windows; mic permission denied flow; Wayland fallback message | PR body checklist | + +**Coverage gate.** Every changed line in the new files + the `channel.web_chat` schema delta ships with ≥ 80% diff coverage per the existing merge gate. Untested escape valves (the real `Audio.play()` call, the real `tauri-driver` key injection) are isolated behind thin wrappers that can be mocked. + +--- + +## Out of scope (named explicitly) + +- **Background screen capture for the agent** — separate follow-up PR off the same issue. +- **PTT-while-dictation-mid-flight** polish beyond "preempt with reason." +- **DXGI exclusive-fullscreen overlay rendering** — documented caveat only. +- **Streaming partial transcripts during hold.** +- **Per-thread PTT routing** (v1 always uses active thread; if none, creates a `"Voice"` thread). +- **Native platform overlays** (NSWindow / Win32 layered / X11 override-redirect) — Tauri overlay window covers v1 needs. +- **PTT toggle-mode** — out; dictation toggle covers that pattern already. + +--- + +## Open questions + +None at spec time. If implementation surfaces blockers (e.g. `tauri-plugin-global-shortcut` `Released` semantics regress on a specific OS version), revisit with a small spec amendment rather than a silent design drift. diff --git a/gitbooks/developing/architecture/tauri-shell.md b/gitbooks/developing/architecture/tauri-shell.md index b4f3d0a756..48655f141e 100644 --- a/gitbooks/developing/architecture/tauri-shell.md +++ b/gitbooks/developing/architecture/tauri-shell.md @@ -158,6 +158,20 @@ From **`workspace_paths.rs`** (closes `#1402`). These commands accept workspace- | `reveal_workspace_path` | Reveal an existing workspace file or directory in the OS file manager. | | `preview_workspace_text` | Read a capped UTF-8 text preview from an existing workspace file. | +### Push-to-talk (PTT) hotkey + overlay + +Registered in **`lib.rs`** (`ptt_hotkeys.rs` + `ptt_overlay.rs`). These commands manage the global push-to-talk shortcut and the floating overlay window. + +| Command | Signature | Purpose | +| ---------------------- | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `register_ptt_hotkey` | `(shortcut: String) -> Result<(), String>` | Register (or re-register) a global hotkey for push-to-talk. Emits Tauri events `ptt://start { session_id }` (key pressed) and `ptt://stop { session_id }` (key released). Returns an error string if the shortcut conflicts with dictation or if the OS rejects it (e.g. Wayland, Accessibility permission required on macOS). | +| `unregister_ptt_hotkey`| `() -> Result<(), String>` | Unregister the current PTT hotkey and tear down the overlay window. | +| `show_ptt_overlay` | `(active: bool, session_id: u64) -> ()` | Show (`active: true`) or hide (`active: false`) the floating PTT overlay window. The window is focus-stealing-free (`focus: false`). Called by `PttHotkeyManager.tsx` via `app/src/utils/tauriCommands/ptt.ts`. | + +**Event flow:** `register_ptt_hotkey` wires the OS hotkey to fire `ptt://start` / `ptt://stop` Tauri events that `PttHotkeyManager.tsx` subscribes to via `@tauri-apps/api/event`. The manager forwards them into the `pttService` state machine which drives the audio capture → transcribe → chat-send pipeline. + +**Conflict detection:** `register_ptt_hotkey` checks for overlap with the active dictation shortcuts before registering. If a conflict is detected it returns `"ConflictsWithDictation()"` without registering anything, and the settings panel surfaces this as `pttSettings.errorConflictsWithDictation`. + ### Synthetic input main-thread executor (native registry, not `invoke`) Registered in **`lib.rs`** at startup under the event-bus native-request method diff --git a/src/core/event_bus/events.rs b/src/core/event_bus/events.rs index 6539ee6442..1279d8bd53 100644 --- a/src/core/event_bus/events.rs +++ b/src/core/event_bus/events.rs @@ -25,6 +25,21 @@ //! - [`DomainEvent::ChannelMessageReceived`] //! - [`DomainEvent::ChannelMessageProcessed`] +/// Voice-domain events. +#[non_exhaustive] +#[derive(Clone, Debug)] +pub enum VoiceEvent { + /// A PTT session committed a transcript to a thread. Carries only + /// length/timing — never the raw text, per the PII-safe logging rule. + PttTranscriptCommitted { + thread_id: String, + session_id: u64, + text_len: usize, + held_ms: u64, + finalized_by_watchdog: bool, + }, +} + /// Top-level domain event. Non-exhaustive so new variants can be added /// without breaking existing match arms. #[non_exhaustive] @@ -896,6 +911,10 @@ pub enum DomainEvent { /// never to Sentry or the UI verbatim. SessionExpired { source: String, reason: String }, + // ── Voice ──────────────────────────────────────────────────────────── + /// A voice domain event (PTT, transcription lifecycle, etc.). + Voice(VoiceEvent), + // ── Task sources ───────────────────────────────────────────────────── /// A task source completed a fetch pass. TaskSourceFetched { @@ -1079,6 +1098,8 @@ impl DomainEvent { Self::TaskPlanAwaitingApproval { .. } | Self::TaskRunReclaimed { .. } => "agent", + Self::Voice(_) => "voice", + Self::ApprovalRequested { .. } | Self::ApprovalDecided { .. } | Self::ApprovalGateOverrideIgnored { .. } @@ -1214,6 +1235,7 @@ impl DomainEvent { Self::BackendMeetHarness { .. } => "BackendMeetHarness", Self::BackendMeetTranscript { .. } => "BackendMeetTranscript", Self::BackendMeetError { .. } => "BackendMeetError", + Self::Voice(_) => "Voice", } } diff --git a/src/core/event_bus/mod.rs b/src/core/event_bus/mod.rs index 6376311057..f8446b16ec 100644 --- a/src/core/event_bus/mod.rs +++ b/src/core/event_bus/mod.rs @@ -61,7 +61,7 @@ pub mod testing; mod tracing; pub use bus::{global, init_global, publish_global, subscribe_global, EventBus, DEFAULT_CAPACITY}; -pub use events::{BackendMeetTurn, DomainEvent}; +pub use events::{BackendMeetTurn, DomainEvent, VoiceEvent}; pub use native_request::{ init_native_registry, native_registry, register_native_global, request_native_global, NativeRegistry, NativeRequestError, diff --git a/src/core/socketio.rs b/src/core/socketio.rs index 9446547f54..0a6fca4024 100644 --- a/src/core/socketio.rs +++ b/src/core/socketio.rs @@ -438,6 +438,7 @@ pub fn attach_socketio() -> (socketioxide::layer::SocketIoLayer, SocketIo) { payload.profile_id, payload.locale, payload.queue_mode, + crate::openhuman::channels::providers::web::ChatRequestMetadata::default(), ) .await { diff --git a/src/openhuman/about_app/catalog_data.rs b/src/openhuman/about_app/catalog_data.rs index 2ac0dd99de..4b661ba719 100644 --- a/src/openhuman/about_app/catalog_data.rs +++ b/src/openhuman/about_app/catalog_data.rs @@ -147,6 +147,22 @@ pub(super) const CAPABILITIES: &[Capability] = &[ status: CapabilityStatus::Beta, privacy: DERIVED_TO_BACKEND, }, + Capability { + id: "voice.ptt", + name: "Global push-to-talk", + domain: "voice", + category: CapabilityCategory::Conversation, + description: "Hold a global hotkey from anywhere on the desktop to dictate into the \ + active chat thread. Press opens the mic, release commits the transcript, \ + and an always-on-top overlay shows listening/idle state without stealing \ + focus. Cross-platform via tauri-plugin-global-shortcut (macOS, Windows, \ + Linux/X11); requires microphone access and a global shortcut binding. \ + Optional speak_reply plays the agent's response through local TTS.", + how_to: "Settings → Voice → Push-to-Talk: pick a shortcut, grant microphone access, \ + then hold the configured hotkey from any window.", + status: CapabilityStatus::Beta, + privacy: DERIVED_TO_BACKEND, + }, Capability { id: "conversation.inline_autocomplete", name: "Inline Autocomplete", diff --git a/src/openhuman/about_app/catalog_tests.rs b/src/openhuman/about_app/catalog_tests.rs index 8a1477f660..1021ef57a2 100644 --- a/src/openhuman/about_app/catalog_tests.rs +++ b/src/openhuman/about_app/catalog_tests.rs @@ -7,6 +7,36 @@ fn lookup_returns_expected_capability() { assert_eq!(capability.status, CapabilityStatus::Beta); } +/// PR #3090: the global push-to-talk feature is user-facing and must be +/// discoverable in the capability catalog so the in-app /about surface and +/// settings search can describe it. Pins the id, category, and the rough +/// shape of the how_to / description so a future rewrite can't silently +/// drop the entry or split it from the Conversation umbrella where the +/// related voice capabilities live. +#[test] +fn capability_list_includes_voice_ptt() { + let caps = all_capabilities(); + assert!( + caps.iter().any(|c| c.id == "voice.ptt"), + "voice.ptt capability must be registered" + ); + + let ptt = lookup("voice.ptt").expect("voice.ptt should be registered"); + assert_eq!(ptt.category, CapabilityCategory::Conversation); + assert_eq!(ptt.domain, "voice"); + assert!( + ptt.how_to.contains("Push-to-Talk") || ptt.how_to.contains("push-to-talk"), + "how_to must mention Push-to-Talk, got: {}", + ptt.how_to + ); + assert!( + ptt.description.to_lowercase().contains("hold") + && ptt.description.to_lowercase().contains("hotkey"), + "description must describe the hold-to-talk hotkey behaviour, got: {}", + ptt.description + ); +} + #[test] fn composio_direct_mode_capabilities_are_registered() { // PR #1710 PR3: ensure the direct-mode capability and the trigger-gap diff --git a/src/openhuman/agent/task_dispatcher.rs b/src/openhuman/agent/task_dispatcher.rs index 91279a1b9a..d6c2f73bb1 100644 --- a/src/openhuman/agent/task_dispatcher.rs +++ b/src/openhuman/agent/task_dispatcher.rs @@ -591,6 +591,7 @@ async fn run_autonomous( thread_id.to_string(), run_id.to_string(), crate::openhuman::threads::turn_state::TurnStateStore::new(workspace_dir.clone()), + crate::openhuman::channels::providers::web::ChatRequestMetadata::default(), config.clone(), ); } diff --git a/src/openhuman/channels/bus.rs b/src/openhuman/channels/bus.rs index 613994c616..62fb87a996 100644 --- a/src/openhuman/channels/bus.rs +++ b/src/openhuman/channels/bus.rs @@ -86,7 +86,15 @@ impl EventHandler for ChannelInboundSubscriber { crate::openhuman::channels::providers::web::subscribe_web_channel_events(); let request_id = match crate::openhuman::channels::providers::web::start_chat( - &client_id, &thread_id, message, None, None, None, None, None, + &client_id, + &thread_id, + message, + None, + None, + None, + None, + None, + crate::openhuman::channels::providers::web::ChatRequestMetadata::default(), ) .await { diff --git a/src/openhuman/channels/providers/web.rs b/src/openhuman/channels/providers/web.rs index 13d9191464..d2563dd7d2 100644 --- a/src/openhuman/channels/providers/web.rs +++ b/src/openhuman/channels/providers/web.rs @@ -483,6 +483,7 @@ pub async fn start_chat( profile_id: Option, locale: Option, queue_mode: Option, + metadata: ChatRequestMetadata, ) -> Result { let client_id = client_id.trim().to_string(); let thread_id = thread_id.trim().to_string(); @@ -732,6 +733,7 @@ pub async fn start_chat( profile_id, locale, turn_run_queue_task, + metadata, ), ), ) @@ -901,6 +903,7 @@ fn dispatch_followups(followups: Vec, locale: Option, run_queue: Arc, + metadata: ChatRequestMetadata, ) -> Result { #[cfg(any(test, debug_assertions))] { @@ -1169,6 +1173,7 @@ async fn run_chat_task( thread_id.to_string(), request_id.to_string(), turn_state_store, + metadata.clone(), config.clone(), ); @@ -1209,6 +1214,64 @@ async fn run_chat_task( } }; + // Voice / PTT integration (#3090 Task 4). When the chat was sent with + // `speak_reply: true`, drive the agent's full reply through + // `voice::reply_speech::synthesize_reply` so the renderer can play it. + // When the call originated as a PTT session, also publish + // `PttTranscriptCommitted` so screen-intelligence (and any future bus + // subscriber) can react to a completed PTT turn. + // + // Why here (not in the progress bridge): the bridge sees `TextDelta`s + // only when the inference provider streams. The non-streaming fallback + // (and the JSON-RPC E2E mocks) produce a single final response with no + // deltas — so buffering deltas alone loses the reply text in those + // paths. The full response is available right here, regardless of + // streaming mode, which makes this the most reliable hook point. + // + // Failures are non-fatal (TTS / observability are best-effort side + // channels). + if let Ok(ref task_result) = result { + let speak_reply = matches!(metadata.speak_reply, Some(true)); + let trimmed_response = task_result.full_response.trim(); + if speak_reply && !trimmed_response.is_empty() { + let opts = crate::openhuman::voice::reply_speech::ReplySpeechOptions::default(); + match crate::openhuman::voice::reply_speech::synthesize_reply( + &config, + &task_result.full_response, + &opts, + ) + .await + { + Ok(_) => log::debug!( + "[web_channel] reply_speech dispatched chars={} client_id={} thread_id={} request_id={}", + task_result.full_response.len(), + client_id, + thread_id, + request_id, + ), + Err(err) => log::warn!( + "[web_channel] reply_speech failed: {err} client_id={} thread_id={} request_id={}", + client_id, + thread_id, + request_id, + ), + } + } + if metadata.source.as_deref() == Some("ptt") { + if let Some(session_id) = metadata.session_id { + // TODO(#3090 T11): held_ms will be supplied by the renderer once the PTT + // watchdog reports actual hold duration. 0 is a placeholder until then. + crate::openhuman::voice::publish_ptt_transcript_committed( + thread_id.to_string(), + session_id, + task_result.full_response.chars().count(), + 0, + false, + ); + } + } + } + // Clear the sender so it doesn't hold the channel open across sessions. agent.set_on_progress(None); @@ -1230,12 +1293,21 @@ async fn run_chat_task( /// agent turn loop and translates them into [`WebChannelEvent`]s tagged /// with the correct client/thread/request IDs. The task runs until the /// sender is dropped (i.e. when the agent turn finishes). +/// +/// `metadata` is logged on the bridge's diagnostic lines so PTT turns are +/// easy to correlate across the stream of progress events. The +/// authoritative TTS / PTT-commit dispatch (`speak_reply` → +/// `voice::reply_speech::synthesize_reply`, `source == "ptt"` → +/// `publish_ptt_transcript_committed`) is owned by `run_chat_task`, which +/// sees the full assistant response even when the provider falls back to +/// non-streaming. pub(crate) fn spawn_progress_bridge( mut rx: tokio::sync::mpsc::Receiver, client_id: String, thread_id: String, request_id: String, turn_state_store: TurnStateStore, + metadata: ChatRequestMetadata, config: crate::openhuman::config::Config, ) { use crate::openhuman::agent::progress::AgentProgress; @@ -1246,10 +1318,13 @@ pub(crate) fn spawn_progress_bridge( tokio::spawn(async move { log::debug!( - "[web_channel][bridge] spawned client_id={} thread_id={} request_id={}", + "[web_channel][bridge] spawned client_id={} thread_id={} request_id={} speak_reply={:?} source={:?} session_id={:?}", client_id, thread_id, request_id, + metadata.speak_reply, + metadata.source, + metadata.session_id, ); let mut round: u32 = 0; let mut events_seen: u64 = 0; @@ -2104,7 +2179,11 @@ pub(crate) fn spawn_progress_bridge( ); log::debug!( "[web_channel] turn completed after {iterations} iteration(s) \ - client_id={client_id} thread_id={thread_id} request_id={request_id}" + client_id={client_id} thread_id={thread_id} request_id={request_id} \ + speak_reply={:?} source={:?} session_id={:?}", + metadata.speak_reply, + metadata.source, + metadata.session_id, ); } AgentProgress::TurnCostUpdated { @@ -2389,11 +2468,34 @@ struct WebChatParams { /// default language (English) so existing integrations don't /// silently change behaviour. locale: Option, + /// When `true`, the agent's final reply should be spoken via TTS + /// (for PTT and similar background voice flows). Accepted and + /// stored here; wired to TTS in Task 4. + #[serde(default)] + speak_reply: Option, + /// Origin of the message: `"ptt"` | `"dictation"` | `"type"` | other. + /// Used for analytics and downstream metadata. + #[serde(default)] + source: Option, + /// Optional caller-provided correlation id (PTT session id). + #[serde(default)] + session_id: Option, /// Queue mode for concurrent messages: `interrupt` (default), `steer`, /// `followup`, or `collect`. + #[serde(default)] queue_mode: Option, } +/// Per-request metadata carried alongside a chat send. Currently used by the +/// PTT flow (Task 4 wires it to `voice::reply_speech`); other voice surfaces +/// can populate it the same way. +#[derive(Debug, Default, Clone)] +pub struct ChatRequestMetadata { + pub speak_reply: Option, + pub source: Option, + pub session_id: Option, +} + #[derive(Debug, Deserialize)] struct WebQueueParams { thread_id: String, @@ -2414,6 +2516,7 @@ pub async fn channel_web_chat( profile_id: Option, locale: Option, queue_mode: Option, + metadata: ChatRequestMetadata, ) -> Result, String> { let result = start_chat( client_id, @@ -2424,6 +2527,7 @@ pub async fn channel_web_chat( profile_id, locale, queue_mode, + metadata, ) .await?; @@ -2580,6 +2684,9 @@ pub fn schemas(function: &str) -> ControllerSchema { "locale", "Optional BCP-47 UI locale (e.g. 'ar', 'zh-CN'). Drives the \"reply in this language\" system-prompt directive.", ), + optional_bool("speak_reply", "When true, the agent's final reply is spoken via TTS (for PTT and similar background voice flows)."), + optional_string("source", "Origin of the message: \"ptt\" | \"dictation\" | \"type\" | other. Used for analytics + downstream metadata."), + optional_u64("session_id", "Optional caller-provided correlation id (PTT session id)."), optional_string( "queue_mode", "Queue mode: 'interrupt' (default), 'steer', 'followup', or 'collect'.", @@ -2639,6 +2746,11 @@ fn handle_chat(params: Map) -> ControllerFuture { p.profile_id, p.locale, p.queue_mode, + ChatRequestMetadata { + speak_reply: p.speak_reply, + source: p.source, + session_id: p.session_id, + }, ) .await?, ) @@ -2746,6 +2858,24 @@ fn optional_f64(name: &'static str, comment: &'static str) -> FieldSchema { } } +fn optional_bool(name: &'static str, comment: &'static str) -> FieldSchema { + FieldSchema { + name, + ty: TypeSchema::Option(Box::new(TypeSchema::Bool)), + comment, + required: false, + } +} + +fn optional_u64(name: &'static str, comment: &'static str) -> FieldSchema { + FieldSchema { + name, + ty: TypeSchema::Option(Box::new(TypeSchema::U64)), + comment, + required: false, + } +} + fn json_output(name: &'static str, comment: &'static str) -> FieldSchema { FieldSchema { name, diff --git a/src/openhuman/channels/providers/web_tests.rs b/src/openhuman/channels/providers/web_tests.rs index d1f087b780..82423cc506 100644 --- a/src/openhuman/channels/providers/web_tests.rs +++ b/src/openhuman/channels/providers/web_tests.rs @@ -3,9 +3,10 @@ use super::{ classify_inference_error, compose_system_prompt_suffix, event_session_id_for, extract_provider_error_detail, generic_inference_error_user_message, inference_budget_exceeded_user_message, is_inference_budget_exceeded_error, json_output, - key_for, locale_reply_directive, normalize_model_override, optional_f64, optional_string, - provider_role_for_model_override, required_string, schemas, - set_test_forced_run_chat_task_error, start_chat, subscribe_web_channel_events, ClassifiedError, + key_for, locale_reply_directive, normalize_model_override, optional_bool, optional_f64, + optional_string, optional_u64, provider_role_for_model_override, required_string, schemas, + set_test_forced_run_chat_task_error, start_chat, subscribe_web_channel_events, + ChatRequestMetadata, ClassifiedError, WebChatParams, }; use crate::core::TypeSchema; use once_cell::sync::Lazy; @@ -24,19 +25,49 @@ static FORCED_ERROR_TEST_LOCK: Lazy> = Lazy::new(|| TokioMutex::n #[tokio::test] async fn start_chat_validates_required_fields() { - let err = start_chat("", "thread", "hello", None, None, None, None, None) - .await - .expect_err("client id should be required"); + let err = start_chat( + "", + "thread", + "hello", + None, + None, + None, + None, + None, + ChatRequestMetadata::default(), + ) + .await + .expect_err("client id should be required"); assert!(err.contains("client_id is required")); - let err = start_chat("client", "", "hello", None, None, None, None, None) - .await - .expect_err("thread id should be required"); + let err = start_chat( + "client", + "", + "hello", + None, + None, + None, + None, + None, + ChatRequestMetadata::default(), + ) + .await + .expect_err("thread id should be required"); assert!(err.contains("thread_id is required")); - let err = start_chat("client", "thread", " ", None, None, None, None, None) - .await - .expect_err("message should be required"); + let err = start_chat( + "client", + "thread", + " ", + None, + None, + None, + None, + None, + ChatRequestMetadata::default(), + ) + .await + .expect_err("message should be required"); assert!(err.contains("message is required")); } @@ -51,6 +82,7 @@ async fn start_chat_rejects_prompt_injection_payload() { None, None, None, + ChatRequestMetadata::default(), ) .await .expect_err("prompt-injection payload should be rejected"); @@ -94,6 +126,7 @@ async fn start_chat_emits_sanitized_chat_error_on_inference_failure() { None, None, None, + ChatRequestMetadata::default(), ) .await .expect("start_chat should accept valid request"); @@ -505,6 +538,7 @@ async fn start_chat_chat_error_event_serializes_structured_fields_to_json_wire() None, None, None, + ChatRequestMetadata::default(), ) .await .expect("start_chat should accept valid request"); @@ -599,6 +633,7 @@ async fn start_chat_emits_structured_rate_limit_metadata_on_chat_error_event() { None, None, None, + ChatRequestMetadata::default(), ) .await .expect("start_chat should accept valid request"); @@ -1315,3 +1350,89 @@ fn compose_system_prompt_suffix_combines_locale_and_profile() { // Both absent → None preserves the agent's vanilla prompt. assert!(compose_system_prompt_suffix(None, None).is_none()); } + +// ── PTT field additions (Task 1 of global-ptt plan) ───────────────────────── + +#[test] +fn web_chat_schema_accepts_optional_ptt_fields() { + // Locate the `chat` schema via the public accessor. + let schema = schemas("chat"); + let names: std::collections::HashSet<&str> = schema.inputs.iter().map(|f| f.name).collect(); + assert!( + names.contains("speak_reply"), + "channel.web_chat schema must include optional speak_reply field" + ); + assert!( + names.contains("source"), + "channel.web_chat schema must include optional source field" + ); + assert!( + names.contains("session_id"), + "channel.web_chat schema must include optional session_id field" + ); + // All three are optional. + for field in &["speak_reply", "source", "session_id"] { + let f = schema + .inputs + .iter() + .find(|f| f.name == *field) + .expect("field present"); + assert!(!f.required, "{field} must be optional"); + } + // Type assertions: ensure each field has the correct wire type. + let speak_reply = schema + .inputs + .iter() + .find(|f| f.name == "speak_reply") + .unwrap(); + assert_eq!( + speak_reply.ty, + TypeSchema::Option(Box::new(TypeSchema::Bool)), + "speak_reply must be Option" + ); + let source = schema.inputs.iter().find(|f| f.name == "source").unwrap(); + assert_eq!( + source.ty, + TypeSchema::Option(Box::new(TypeSchema::String)), + "source must be Option" + ); + let session_id = schema + .inputs + .iter() + .find(|f| f.name == "session_id") + .unwrap(); + assert_eq!( + session_id.ty, + TypeSchema::Option(Box::new(TypeSchema::U64)), + "session_id must be Option" + ); +} + +#[test] +fn web_chat_params_deserialize_with_all_ptt_fields_omitted() { + let json = serde_json::json!({ + "client_id": "c1", + "thread_id": "t1", + "message": "hello", + }); + let parsed: WebChatParams = serde_json::from_value(json).unwrap(); + assert_eq!(parsed.speak_reply, None); + assert_eq!(parsed.source, None); + assert_eq!(parsed.session_id, None); +} + +#[test] +fn web_chat_params_deserialize_with_all_ptt_fields_present() { + let json = serde_json::json!({ + "client_id": "c1", + "thread_id": "t1", + "message": "hello", + "speak_reply": true, + "source": "ptt", + "session_id": 42_u64, + }); + let parsed: WebChatParams = serde_json::from_value(json).unwrap(); + assert_eq!(parsed.speak_reply, Some(true)); + assert_eq!(parsed.source.as_deref(), Some("ptt")); + assert_eq!(parsed.session_id, Some(42)); +} diff --git a/src/openhuman/voice/bus.rs b/src/openhuman/voice/bus.rs new file mode 100644 index 0000000000..bf6bc7a669 --- /dev/null +++ b/src/openhuman/voice/bus.rs @@ -0,0 +1,89 @@ +//! Voice domain event publishers. The PTT transcript-committed event is +//! published here so the future screen-intelligence follow-up can subscribe +//! and grab a frame on commit without coupling to the channel-web flow. + +use crate::core::event_bus::{publish_global, DomainEvent, VoiceEvent}; + +/// Publish a [`VoiceEvent::PttTranscriptCommitted`] event. +pub fn publish_ptt_transcript_committed( + thread_id: String, + session_id: u64, + text_len: usize, + held_ms: u64, + finalized_by_watchdog: bool, +) { + publish_global(DomainEvent::Voice(VoiceEvent::PttTranscriptCommitted { + thread_id, + session_id, + text_len, + held_ms, + finalized_by_watchdog, + })); +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::core::event_bus::{init_global, subscribe_global, DomainEvent, EventHandler}; + use async_trait::async_trait; + use std::sync::Arc; + use tokio::sync::Mutex as AsyncMutex; + + #[derive(Default)] + struct Capture { + events: Arc>>, + } + + #[async_trait] + impl EventHandler for Capture { + fn name(&self) -> &str { + "voice::ptt_test_capture" + } + + fn domains(&self) -> Option<&[&str]> { + Some(&["voice"]) + } + + async fn handle(&self, event: &DomainEvent) { + if let DomainEvent::Voice(v) = event { + self.events.lock().await.push(v.clone()); + } + } + } + + #[tokio::test] + async fn publishing_a_ptt_commit_reaches_a_subscriber() { + // Use the singleton (init is idempotent). + let _ = init_global(64); + let capture = Capture::default(); + let events = capture.events.clone(); + let _sub = subscribe_global(Arc::new(capture)); + + publish_ptt_transcript_committed("thread-1".to_string(), 42, 17, 850, false); + + // Give the broadcaster a tick to deliver. + tokio::time::sleep(std::time::Duration::from_millis(50)).await; + + let got = events.lock().await; + let found = got.iter().find_map(|e| match e { + VoiceEvent::PttTranscriptCommitted { + thread_id, + session_id, + text_len, + held_ms, + finalized_by_watchdog, + } => Some(( + thread_id.clone(), + *session_id, + *text_len, + *held_ms, + *finalized_by_watchdog, + )), + }); + assert_eq!( + found, + Some(("thread-1".to_string(), 42, 17, 850, false)), + "expected the published event to round-trip with all five fields; got events: {got:?}", + ); + } +} diff --git a/src/openhuman/voice/mod.rs b/src/openhuman/voice/mod.rs index d439c2712d..14c1c218ab 100644 --- a/src/openhuman/voice/mod.rs +++ b/src/openhuman/voice/mod.rs @@ -11,6 +11,8 @@ pub mod always_on; pub mod audio_capture; +pub mod bus; +pub use bus::publish_ptt_transcript_committed; pub(crate) mod cli; pub mod command_router; pub mod dictation_listener; diff --git a/src/openhuman/voice/reply_speech.rs b/src/openhuman/voice/reply_speech.rs index b7383f479a..e0cc530357 100644 --- a/src/openhuman/voice/reply_speech.rs +++ b/src/openhuman/voice/reply_speech.rs @@ -15,7 +15,7 @@ //! in a `Tool` impl, the `external_effect()` method MUST stay `false` //! (the trait's default) so the approval gate never prompts on TTS. -use log::debug; +use log::{debug, warn}; use reqwest::Method; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; @@ -28,6 +28,51 @@ use crate::rpc::RpcOutcome; const LOG_PREFIX: &str = "[voice_reply]"; +/// Env var that activates the [`test_seam`] short-circuit at runtime. When +/// set to `1` / `true`, [`synthesize_reply`] records the requested text +/// into [`test_seam::OBSERVED_CALLS`] and returns a stub +/// [`ReplySpeechResult`] *without* contacting the hosted backend. Anything +/// else (unset, `0`, `false`, …) leaves the production code path +/// untouched. +/// +/// The env-var gate (rather than a `#[cfg(test)]` gate) is deliberate: +/// integration tests in `tests/` are compiled against the production +/// `openhuman_core` crate, so a unit-only `cfg(test)` block would not be +/// visible from there. The observer module itself is always compiled, +/// but its only producer is this env-gated branch and its only consumer +/// is the test harness, so production callers never touch it. +pub const TEST_SEAM_ENV: &str = "OPENHUMAN_TEST_REPLY_SPEECH_SEAM"; + +fn test_seam_enabled() -> bool { + matches!( + std::env::var(TEST_SEAM_ENV).ok().as_deref(), + Some("1") | Some("true") | Some("TRUE") + ) +} + +/// Test seam observation log. See [`TEST_SEAM_ENV`] for the activation +/// gate. Always compiled (the visibility lets `tests/json_rpc_e2e.rs` +/// inspect calls), but only written to when the env gate is on. +pub mod test_seam { + use once_cell::sync::Lazy; + use std::sync::Mutex; + + /// FIFO log of every `text` argument that flowed through the test-seam + /// short-circuit in [`super::synthesize_reply`]. Cleared between tests + /// with [`clear`]. + pub static OBSERVED_CALLS: Lazy>> = Lazy::new(|| Mutex::new(Vec::new())); + + /// Clear the observation log. + pub fn clear() { + OBSERVED_CALLS.lock().unwrap().clear(); + } + + /// Snapshot of the observation log. + pub fn observed() -> Vec { + OBSERVED_CALLS.lock().unwrap().clone() + } +} + /// One frame on the viseme timeline. `viseme` is an Oculus / Microsoft /// 15-set code (`sil, PP, FF, TH, DD, kk, CH, SS, nn, RR, aa, E, I, O, U`). #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] @@ -87,6 +132,31 @@ pub async fn synthesize_reply( return Err("text is required".to_string()); } + // Test seam: when OPENHUMAN_TEST_REPLY_SPEECH_SEAM is set (and only in + // debug builds — the seam is structurally dead in release), record the + // call and short-circuit before hitting the backend. + // See `test_seam` module docs and `TEST_SEAM_ENV` for the activation gate. + if cfg!(debug_assertions) && test_seam_enabled() { + warn!( + "[voice_reply] TEST SEAM ACTIVE — synthesize_reply short-circuited ({} is set); skipping backend call", + TEST_SEAM_ENV + ); + let _ = (config, opts); + test_seam::OBSERVED_CALLS + .lock() + .unwrap() + .push(trimmed.to_string()); + return Ok(RpcOutcome::single_log( + ReplySpeechResult { + audio_base64: String::new(), + audio_mime: "audio/mpeg".to_string(), + visemes: Vec::new(), + alignment: None, + }, + "voice reply synthesized (test seam short-circuit)", + )); + } + let token = get_session_token(config) .map_err(|e| e.to_string())? .and_then(|t| { diff --git a/tests/channels_large_round25_raw_coverage_e2e.rs b/tests/channels_large_round25_raw_coverage_e2e.rs index 4ffdf20978..11632f99bf 100644 --- a/tests/channels_large_round25_raw_coverage_e2e.rs +++ b/tests/channels_large_round25_raw_coverage_e2e.rs @@ -113,12 +113,20 @@ async fn web_channel_validation_cancellation_and_error_events_are_observable() { json!({"client_id": "client-1", "thread_id": "thread-1"}) ); - assert!( - web::start_chat(" ", "thread", "hello", None, None, None, None, None) - .await - .unwrap_err() - .contains("client_id is required") - ); + assert!(web::start_chat( + " ", + "thread", + "hello", + None, + None, + None, + None, + None, + web::ChatRequestMetadata::default() + ) + .await + .unwrap_err() + .contains("client_id is required")); assert!(web::cancel_chat("client", " ") .await .unwrap_err() @@ -138,6 +146,7 @@ async fn web_channel_validation_cancellation_and_error_events_are_observable() { None, Some("en-US".to_string()), None, + web::ChatRequestMetadata::default(), ) .await .expect("start forced-error chat"); diff --git a/tests/channels_provider_deep_raw_coverage_e2e.rs b/tests/channels_provider_deep_raw_coverage_e2e.rs index 5e0af50096..4133317a9c 100644 --- a/tests/channels_provider_deep_raw_coverage_e2e.rs +++ b/tests/channels_provider_deep_raw_coverage_e2e.rs @@ -9,7 +9,7 @@ use axum::{ Router, }; use openhuman_core::openhuman::channels::providers::web::{ - cancel_chat, start_chat, subscribe_web_channel_events, + cancel_chat, start_chat, subscribe_web_channel_events, ChatRequestMetadata, }; use openhuman_core::openhuman::channels::providers::yuanbao::{YuanbaoChannel, YuanbaoConfig}; use openhuman_core::openhuman::channels::test_support::{ @@ -229,24 +229,48 @@ async fn dispatch_harness_covers_error_context_compaction_and_timeout_paths() { #[tokio::test] async fn web_channel_validation_cancel_and_classifier_snapshots_are_publicly_exercised() { - assert!( - start_chat("", "thread", "hello", None, None, None, None, None) - .await - .expect_err("empty client rejected") - .contains("client_id") - ); - assert!( - start_chat("client", "", "hello", None, None, None, None, None) - .await - .expect_err("empty thread rejected") - .contains("thread_id") - ); - assert!( - start_chat("client", "thread", " ", None, None, None, None, None) - .await - .expect_err("empty message rejected") - .contains("message") - ); + assert!(start_chat( + "", + "thread", + "hello", + None, + None, + None, + None, + None, + ChatRequestMetadata::default() + ) + .await + .expect_err("empty client rejected") + .contains("client_id")); + assert!(start_chat( + "client", + "", + "hello", + None, + None, + None, + None, + None, + ChatRequestMetadata::default() + ) + .await + .expect_err("empty thread rejected") + .contains("thread_id")); + assert!(start_chat( + "client", + "thread", + " ", + None, + None, + None, + None, + None, + ChatRequestMetadata::default() + ) + .await + .expect_err("empty message rejected") + .contains("message")); let mut rx = subscribe_web_channel_events(); assert_eq!( @@ -266,6 +290,7 @@ async fn web_channel_validation_cancel_and_classifier_snapshots_are_publicly_exe None, None, None, + ChatRequestMetadata::default(), ) .await; assert!(blocked.is_err()); diff --git a/tests/channels_provider_leftovers_raw_coverage_e2e.rs b/tests/channels_provider_leftovers_raw_coverage_e2e.rs index 6220faf012..192eafafb5 100644 --- a/tests/channels_provider_leftovers_raw_coverage_e2e.rs +++ b/tests/channels_provider_leftovers_raw_coverage_e2e.rs @@ -17,6 +17,7 @@ use axum::{ use openhuman_core::openhuman::channels::providers::telegram::TelegramChannel; use openhuman_core::openhuman::channels::providers::web::{ cancel_chat, start_chat, subscribe_web_channel_events, test_support as web_test_support, + ChatRequestMetadata, }; use openhuman_core::openhuman::channels::providers::yuanbao::{ connection::YuanbaoConnection, YuanbaoChannel, YuanbaoConfig, @@ -347,6 +348,7 @@ async fn web_round19_covers_classifier_variants_and_cancel_cleanup() { None, None, None, + ChatRequestMetadata::default(), ) .await .expect("start forced web chat"); diff --git a/tests/channels_runtime_raw_coverage_e2e.rs b/tests/channels_runtime_raw_coverage_e2e.rs index e02d19e9aa..70d13456f8 100644 --- a/tests/channels_runtime_raw_coverage_e2e.rs +++ b/tests/channels_runtime_raw_coverage_e2e.rs @@ -10,7 +10,7 @@ use axum::{ }; use openhuman_core::core::event_bus::{DomainEvent, EventHandler}; use openhuman_core::openhuman::channels::providers::web::{ - cancel_chat, start_chat, subscribe_web_channel_events, + cancel_chat, start_chat, subscribe_web_channel_events, ChatRequestMetadata, }; use openhuman_core::openhuman::channels::providers::yuanbao::{YuanbaoChannel, YuanbaoConfig}; use openhuman_core::openhuman::channels::{ @@ -372,24 +372,48 @@ async fn yuanbao_public_channel_and_config_paths_are_isolated_from_network() { #[tokio::test] async fn web_channel_validation_cancel_and_event_subscription_are_fast() { - assert!( - start_chat("", "thread", "hello", None, None, None, None, None) - .await - .expect_err("empty client rejected") - .contains("client_id") - ); - assert!( - start_chat("client", "", "hello", None, None, None, None, None) - .await - .expect_err("empty thread rejected") - .contains("thread_id") - ); - assert!( - start_chat("client", "thread", " ", None, None, None, None, None) - .await - .expect_err("empty message rejected") - .contains("message") - ); + assert!(start_chat( + "", + "thread", + "hello", + None, + None, + None, + None, + None, + ChatRequestMetadata::default() + ) + .await + .expect_err("empty client rejected") + .contains("client_id")); + assert!(start_chat( + "client", + "", + "hello", + None, + None, + None, + None, + None, + ChatRequestMetadata::default() + ) + .await + .expect_err("empty thread rejected") + .contains("thread_id")); + assert!(start_chat( + "client", + "thread", + " ", + None, + None, + None, + None, + None, + ChatRequestMetadata::default() + ) + .await + .expect_err("empty message rejected") + .contains("message")); let mut rx = subscribe_web_channel_events(); assert_eq!( @@ -409,6 +433,7 @@ async fn web_channel_validation_cancel_and_event_subscription_are_fast() { None, None, None, + ChatRequestMetadata::default(), ) .await; assert!( diff --git a/tests/channels_web_startup_raw_coverage_e2e.rs b/tests/channels_web_startup_raw_coverage_e2e.rs index d4e1849bf0..55682b95fe 100644 --- a/tests/channels_web_startup_raw_coverage_e2e.rs +++ b/tests/channels_web_startup_raw_coverage_e2e.rs @@ -13,7 +13,7 @@ use openhuman_core::openhuman::channels::test_support::{ use openhuman_core::openhuman::channels::web::{ all_web_channel_controller_schemas, all_web_channel_registered_controllers, channel_web_cancel, channel_web_chat, schemas, start_chat, subscribe_web_channel_events, - test_support as web_test_support, + test_support as web_test_support, ChatRequestMetadata, }; use openhuman_core::openhuman::config::Config; use tempfile::tempdir; @@ -113,9 +113,19 @@ async fn web_controllers_validate_inputs_and_emit_structured_forced_errors() { assert_eq!(all_web_channel_registered_controllers().len(), 4); assert_eq!(schemas("missing").function, "unknown"); - let err = channel_web_chat("client", "thread", " ", None, None, None, None, None) - .await - .expect_err("blank messages are rejected"); + let err = channel_web_chat( + "client", + "thread", + " ", + None, + None, + None, + None, + None, + ChatRequestMetadata::default(), + ) + .await + .expect_err("blank messages are rejected"); assert!(err.contains("message is required")); let cancel = channel_web_cancel("client", "missing-thread") @@ -140,6 +150,7 @@ async fn web_controllers_validate_inputs_and_emit_structured_forced_errors() { None, Some("zh-CN".to_string()), None, + ChatRequestMetadata::default(), ) .await .expect("chat request accepted") @@ -185,6 +196,7 @@ async fn web_chat_cancel_aborts_in_flight_thread_without_real_provider() { None, None, None, + ChatRequestMetadata::default(), ) .await .expect("start chat"); diff --git a/tests/channels_web_telegram_raw_coverage_e2e.rs b/tests/channels_web_telegram_raw_coverage_e2e.rs index 30ccf3709c..61e6dfa77b 100644 --- a/tests/channels_web_telegram_raw_coverage_e2e.rs +++ b/tests/channels_web_telegram_raw_coverage_e2e.rs @@ -18,7 +18,7 @@ use openhuman_core::core::event_bus::{init_global, publish_global, DomainEvent}; use openhuman_core::openhuman::channels::providers::telegram::TelegramChannel; use openhuman_core::openhuman::channels::providers::web::{ cancel_chat, register_approval_surface_subscriber, start_chat, subscribe_web_channel_events, - test_support as web_test_support, + test_support as web_test_support, ChatRequestMetadata, }; use openhuman_core::openhuman::channels::providers::yuanbao::{YuanbaoChannel, YuanbaoConfig}; use openhuman_core::openhuman::channels::LarkChannel; @@ -300,6 +300,7 @@ async fn web_channel_approval_bridge_forced_errors_and_newer_request_cancellatio Some("missing-profile".to_string()), Some("en-US".to_string()), None, + ChatRequestMetadata::default(), ) .await .expect("forced chat accepted"); @@ -325,6 +326,7 @@ async fn web_channel_approval_bridge_forced_errors_and_newer_request_cancellatio None, None, None, + ChatRequestMetadata::default(), ) .await .expect("first chat accepted"); @@ -337,6 +339,7 @@ async fn web_channel_approval_bridge_forced_errors_and_newer_request_cancellatio None, None, None, + ChatRequestMetadata::default(), ) .await .expect("second chat accepted"); diff --git a/tests/channels_web_yuanbao_round22_raw_coverage_e2e.rs b/tests/channels_web_yuanbao_round22_raw_coverage_e2e.rs index 377ffd1efb..98d34ca6b3 100644 --- a/tests/channels_web_yuanbao_round22_raw_coverage_e2e.rs +++ b/tests/channels_web_yuanbao_round22_raw_coverage_e2e.rs @@ -15,6 +15,7 @@ use axum::{ use openhuman_core::openhuman::channels::providers::telegram::TelegramChannel; use openhuman_core::openhuman::channels::providers::web::{ cancel_chat, start_chat, subscribe_web_channel_events, test_support as web_test_support, + ChatRequestMetadata, }; use openhuman_core::openhuman::channels::providers::yuanbao::{ connection::test_support as yuanbao_connection_test_support, @@ -189,15 +190,35 @@ fn isolated_config() -> (tempfile::TempDir, Config) { #[tokio::test] async fn web_start_chat_validation_forced_error_and_cancel_paths_are_structured() { assert_eq!( - start_chat(" ", "thread", "hello", None, None, None, None, None) - .await - .unwrap_err(), + start_chat( + " ", + "thread", + "hello", + None, + None, + None, + None, + None, + ChatRequestMetadata::default() + ) + .await + .unwrap_err(), "client_id is required" ); assert_eq!( - start_chat("client", " ", "hello", None, None, None, None, None) - .await - .unwrap_err(), + start_chat( + "client", + " ", + "hello", + None, + None, + None, + None, + None, + ChatRequestMetadata::default() + ) + .await + .unwrap_err(), "thread_id is required" ); @@ -215,6 +236,7 @@ async fn web_start_chat_validation_forced_error_and_cancel_paths_are_structured( None, None, None, + ChatRequestMetadata::default(), ) .await .expect("accepted"); diff --git a/tests/json_rpc_e2e.rs b/tests/json_rpc_e2e.rs index f5fde0b3ca..3dd11a01f3 100644 --- a/tests/json_rpc_e2e.rs +++ b/tests/json_rpc_e2e.rs @@ -10487,6 +10487,127 @@ async fn json_rpc_workflows_lifecycle_round_trip() { rpc_join.abort(); } +/// Task 4 / #3090: when a web-chat request is sent with +/// `speak_reply: true`, `run_chat_task` should drive the agent's final text +/// through `voice::reply_speech::synthesize_reply` after the turn completes. +/// +/// We activate the [`reply_speech::test_seam`] short-circuit via the +/// `OPENHUMAN_TEST_REPLY_SPEECH_SEAM` env var so the call is recorded +/// without contacting the ElevenLabs proxy. +#[tokio::test] +async fn json_rpc_channel_web_chat_with_speak_reply_invokes_reply_speech() { + let _env_lock = json_rpc_e2e_env_lock(); + let tmp = tempdir().expect("tempdir"); + let home = tmp.path(); + let openhuman_home = home.join(".openhuman"); + + let _home_guard = EnvVarGuard::set_to_path("HOME", home); + let _workspace_guard = EnvVarGuard::unset("OPENHUMAN_WORKSPACE"); + let _backend_url_guard = EnvVarGuard::unset("BACKEND_URL"); + let _vite_backend_guard = EnvVarGuard::unset("VITE_BACKEND_URL"); + // Activate the reply_speech test seam so synthesize_reply records and + // short-circuits instead of calling the hosted backend. + let _seam_guard = EnvVarGuard::set( + openhuman_core::openhuman::voice::reply_speech::TEST_SEAM_ENV, + "1", + ); + + openhuman_core::openhuman::voice::reply_speech::test_seam::clear(); + + let (mock_addr, mock_join) = serve_on_ephemeral(mock_upstream_router()).await; + let mock_origin = format!("http://{}", mock_addr); + + write_min_config(&openhuman_home, &mock_origin); + let user_scoped_dir = openhuman_home.join("users").join("e2e-user"); + write_min_config(&user_scoped_dir, &mock_origin); + + let (rpc_addr, rpc_join) = serve_on_ephemeral(build_core_http_router(false)).await; + let rpc_base = format!("http://{}", rpc_addr); + tokio::time::sleep(Duration::from_millis(100)).await; + + // Authenticate so the agent loop has a session token available. + let store = post_json_rpc( + &rpc_base, + 9300, + "openhuman.auth_store_session", + json!({ + "token": "e2e-test-jwt", + "user_id": "e2e-user" + }), + ) + .await; + assert_no_jsonrpc_error(&store, "store_session"); + + let client_id = "ptt-e2e-client"; + let thread_id = "ptt-e2e-thread"; + let events_url = format!("{}/events?client_id={}", rpc_base, client_id); + let sse_task = tokio::spawn(async move { read_terminal_web_chat_event(&events_url).await }); + + // PTT-style chat send: speak_reply=true, source=ptt, session_id=1. + let web_chat = post_json_rpc( + &rpc_base, + 9301, + "openhuman.channel_web_chat", + json!({ + "client_id": client_id, + "thread_id": thread_id, + "message": "Hello from PTT", + "model_override": "e2e-mock-model", + "speak_reply": true, + "source": "ptt", + "session_id": 1, + }), + ) + .await; + let web_chat_result = assert_no_jsonrpc_error(&web_chat, "channel_web_chat"); + assert_eq!( + web_chat_result + .get("result") + .and_then(|v| v.get("accepted")), + Some(&json!(true)) + ); + + let sse_event = tokio::time::timeout(Duration::from_secs(12), sse_task) + .await + .expect("timed out waiting for chat_done with speak_reply=true") + .expect("sse task join should succeed"); + assert_eq!( + sse_event.get("event").and_then(Value::as_str), + Some("chat_done") + ); + + // The bridge should have buffered the streamed assistant text and + // routed it through synthesize_reply on TurnCompleted. Poll briefly + // because the bridge task may finish slightly after chat_done. + let mut observed: Vec = Vec::new(); + for _ in 0..50 { + observed = openhuman_core::openhuman::voice::reply_speech::test_seam::observed(); + if !observed.is_empty() { + break; + } + tokio::time::sleep(Duration::from_millis(20)).await; + } + assert!( + !observed.is_empty(), + "expected reply_speech::synthesize_reply to be invoked when speak_reply=true; observed={observed:?}" + ); + assert!( + observed.iter().any(|t| !t.trim().is_empty()), + "expected at least one non-empty text passed to synthesize_reply; observed={observed:?}" + ); + assert!( + observed + .iter() + .any(|t| t.contains("Hello from e2e mock agent")), + "expected the observed seam text to include the mock reply phrase; got {observed:?}" + ); + + mock_join.abort(); + rpc_join.abort(); +} + +// ── Model resolution + agent profile switching ────────────────────────── + /// E2E: voice-server settings round-trip over JSON-RPC — Phase 2 always-on /// toggle + "Hey Tiny" wake word. Regression guard for the bug where the /// Settings toggle silently did nothing because `always_on_enabled` was absent diff --git a/tests/tools_approval_channels_raw_coverage_e2e.rs b/tests/tools_approval_channels_raw_coverage_e2e.rs index 5c9005a4eb..10e28962cd 100644 --- a/tests/tools_approval_channels_raw_coverage_e2e.rs +++ b/tests/tools_approval_channels_raw_coverage_e2e.rs @@ -2073,7 +2073,15 @@ async fn web_channel_public_paths_cover_event_delivery_and_validation_errors() { assert_eq!( openhuman_core::openhuman::channels::web::start_chat( - "", "thread-1", "hello", None, None, None, None, None, + "", + "thread-1", + "hello", + None, + None, + None, + None, + None, + openhuman_core::openhuman::channels::web::ChatRequestMetadata::default(), ) .await .expect_err("blank client_id"), @@ -2081,7 +2089,15 @@ async fn web_channel_public_paths_cover_event_delivery_and_validation_errors() { ); assert_eq!( openhuman_core::openhuman::channels::web::start_chat( - "client-1", "", "hello", None, None, None, None, None, + "client-1", + "", + "hello", + None, + None, + None, + None, + None, + openhuman_core::openhuman::channels::web::ChatRequestMetadata::default(), ) .await .expect_err("blank thread_id"), @@ -2089,7 +2105,15 @@ async fn web_channel_public_paths_cover_event_delivery_and_validation_errors() { ); assert_eq!( openhuman_core::openhuman::channels::web::start_chat( - "client-1", "thread-1", " ", None, None, None, None, None, + "client-1", + "thread-1", + " ", + None, + None, + None, + None, + None, + openhuman_core::openhuman::channels::web::ChatRequestMetadata::default(), ) .await .expect_err("blank message"), diff --git a/tests/tools_network_channels_raw_coverage_e2e.rs b/tests/tools_network_channels_raw_coverage_e2e.rs index 26f759fa7e..1577235c10 100644 --- a/tests/tools_network_channels_raw_coverage_e2e.rs +++ b/tests/tools_network_channels_raw_coverage_e2e.rs @@ -21,7 +21,7 @@ use openhuman_core::core::socketio::WebChannelEvent; use openhuman_core::openhuman::channels::providers::web::{ all_web_channel_controller_schemas, all_web_channel_registered_controllers, cancel_chat, channel_web_cancel, publish_web_channel_event, schemas as web_channel_schema, start_chat, - subscribe_web_channel_events, + subscribe_web_channel_events, ChatRequestMetadata, }; use openhuman_core::openhuman::config::{ AutonomyConfig, Config, PolymarketClobCredentials, PolymarketConfig, @@ -556,9 +556,19 @@ async fn web_channel_public_paths_cover_validation_cancel_schema_and_event_bus() assert_eq!(web_channel_schema("cancel").function, "web_cancel"); assert_eq!(web_channel_schema("missing").function, "unknown"); - let missing_client = start_chat(" ", "thread", "hello", None, None, None, None, None) - .await - .expect_err("blank client"); + let missing_client = start_chat( + " ", + "thread", + "hello", + None, + None, + None, + None, + None, + ChatRequestMetadata::default(), + ) + .await + .expect_err("blank client"); assert_contains(&missing_client, "client_id is required"); let missing_thread = cancel_chat("client", " ").await.expect_err("blank thread"); assert_contains(&missing_thread, "thread_id is required");