diff --git a/MANIFEST.in b/MANIFEST.in index acc5aaea8..68dbf359e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -18,6 +18,10 @@ include README.md # util/verify_wheel_dist.py asserts the wheel actually contains these files. recursive-include src/gaia/apps/webui/dist *.html *.js *.css *.svg *.png *.jpg *.jpeg *.webp *.ico *.webmanifest *.json *.woff *.woff2 *.ttf *.txt +# Voice-test harness served by the audio router at /voice/test. Standalone +# HTML page (no bundling, no React); ships as a single file. +recursive-include src/gaia/ui/static *.html + # Backstop deny-list — these patterns must never reach a published wheel even # if a developer accidentally checks them in or `npm run build` emits them. prune src/gaia/apps/webui/dist/node_modules diff --git a/setup.py b/setup.py index f6f5ed50a..46857cff2 100644 --- a/setup.py +++ b/setup.py @@ -103,6 +103,11 @@ "dist/*.txt", "dist/assets/*", ], + # Voice-test harness for the audio router (/voice/test). Standalone + # HTML page with the WAV converter + mic recorder, no JS bundle needed. + "gaia.ui": [ + "static/*.html", + ], }, install_requires=[ "openai", diff --git a/src/gaia/audio/lemonade_audio.py b/src/gaia/audio/lemonade_audio.py new file mode 100644 index 000000000..f8a85b296 --- /dev/null +++ b/src/gaia/audio/lemonade_audio.py @@ -0,0 +1,273 @@ +# Copyright(C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT +""" +Lemonade-routed audio: speech-to-text and text-to-speech via the OpenAI- +compatible /v1/audio endpoints exposed by Lemonade Server. + +Why this module exists alongside whisper_asr.py and kokoro_tts.py: + Those modules import the `whisper` and `kokoro` Python packages and run the + models in-process. That means each GAIA process loads its own copy of those + models — wasteful when Lemonade is already running for the LLM. 
# Root URL of the Lemonade server. The functions in this module append
# endpoint paths such as ``/v1/audio/transcriptions`` to it.
# NOTE(review): confirm deployments set LEMONADE_BASE_URL to the bare server
# root (no ``/api/v1``-style suffix), since paths are appended as-is.
LEMONADE_URL = os.getenv("LEMONADE_BASE_URL", "http://localhost:13305")

DEFAULT_STT_MODEL = "Whisper-Small"  # English; lighter+faster than -Large
DEFAULT_TTS_MODEL = "kokoro-v1"  # only TTS model Lemonade exposes today
DEFAULT_TTS_VOICE = "shimmer"


class LemonadeAudioError(RuntimeError):
    """Raised on Lemonade audio-endpoint failures.

    GAIA's no-silent-fallback policy applies: callers must handle this error
    explicitly (retry, surface to user, fall back to text input). We do NOT
    silently fall back to whisper_asr / kokoro_tts — those modules have a
    different operational contract (in-process model loading) and using them
    as a fallback would mask real Lemonade misconfiguration.
    """


# ────────────────────────── Speech-to-text ──────────────────────────
def _transcription_text(body: object) -> str:
    """Pull the transcript out of a decoded /v1/audio/transcriptions body.

    Args:
        body: the decoded JSON payload (expected: ``{"text": str | None}``).

    Returns:
        The transcript, or ``""`` when the server reported ``{"text": null}``
        (no speech detected).

    Raises:
        LemonadeAudioError: the payload is not an object with a string (or
            null) ``"text"`` member.
    """
    # Guard the type first: `"text" in body` would raise TypeError for a
    # null/number payload and silently do a substring test for a string one.
    if not isinstance(body, dict) or "text" not in body:
        raise LemonadeAudioError(f"Unexpected STT response shape: {body!r}")
    text = body["text"]
    if text is None:
        return ""
    if not isinstance(text, str):
        raise LemonadeAudioError(f"Unexpected STT response shape: {body!r}")
    return text


def _post_transcription(
    filename: str,
    audio_bytes: bytes,
    *,
    model: str,
    language: str | None,
    base_url: str,
    timeout: float,
) -> str:
    """Send one multipart STT request and return the transcript.

    Shared by :func:`transcribe` and :func:`transcribe_bytes` so both report
    failures identically.

    Raises:
        LemonadeAudioError: server unreachable, non-200 status, non-JSON
            body, or a JSON body without a usable ``"text"`` member.
    """
    form: dict[str, str] = {"model": model}
    if language is not None:
        form["language"] = language

    # Tolerate a trailing slash in the configured URL ("http://host:13305/").
    url = f"{base_url.rstrip('/')}/v1/audio/transcriptions"
    try:
        r = httpx.post(
            url,
            files={"file": (filename, audio_bytes, "audio/wav")},
            data=form,
            timeout=timeout,
        )
    except httpx.RequestError as e:
        raise LemonadeAudioError(
            f"Lemonade STT unreachable at {base_url} — start the server with "
            f"`lemonade-server serve`, or set LEMONADE_BASE_URL. Original: {e}"
        ) from e

    if r.status_code != 200:
        raise LemonadeAudioError(
            f"Lemonade STT returned {r.status_code}: {r.text[:200]}. "
            f"Common causes: model '{model}' not yet downloaded "
            f"(first request triggers a ~30s auto-download), or audio not WAV/16kHz mono."
        )

    try:
        body = r.json()
    except ValueError as e:
        # A 200 with an HTML/plain-text body (e.g. from a proxy) must surface
        # as the module's own error type, not a raw JSON decode error.
        raise LemonadeAudioError(
            f"Lemonade STT returned non-JSON: {r.text[:200]}"
        ) from e
    return _transcription_text(body)


def transcribe(
    audio_path: str | Path,
    *,
    model: str = DEFAULT_STT_MODEL,
    language: str | None = "en",
    base_url: str = LEMONADE_URL,
    timeout: float = 60.0,
) -> str:
    """POST a WAV file to Lemonade /v1/audio/transcriptions.

    Args:
        audio_path: path to a 16kHz mono WAV file (push-to-talk recordings).
        model: ``Whisper-Tiny`` | ``Whisper-Base`` | ``Whisper-Small`` |
            ``Whisper-Large`` (or any other Whisper variant Lemonade serves).
        language: ISO 639-1 code; defaults to ``"en"``. Pass ``None`` to
            auto-detect.
        base_url: Lemonade server URL.
        timeout: HTTP timeout in seconds.

    Returns:
        The transcribed text (``""`` when no speech was detected).

    Raises:
        FileNotFoundError: if ``audio_path`` does not exist.
        LemonadeAudioError: server unreachable, non-200 status, or malformed
            response.
    """
    audio_path = Path(audio_path)
    if not audio_path.exists():
        raise FileNotFoundError(audio_path)

    return _post_transcription(
        audio_path.name,
        audio_path.read_bytes(),
        model=model,
        language=language,
        base_url=base_url,
        timeout=timeout,
    )


def transcribe_bytes(
    audio_bytes: bytes,
    filename: str = "audio.wav",
    *,
    model: str = DEFAULT_STT_MODEL,
    language: str | None = "en",
    base_url: str = LEMONADE_URL,
    timeout: float = 60.0,
) -> str:
    """Like :func:`transcribe` but takes raw WAV bytes (no temp file needed).

    Useful for FastAPI handlers that accept :class:`UploadFile` and want to
    forward the bytes directly to Lemonade without disk I/O.

    Args:
        audio_bytes: the audio payload to upload.
        filename: name reported in the multipart ``file`` part.
        model: Whisper variant served by Lemonade.
        language: ISO 639-1 code, or ``None`` to auto-detect.
        base_url: Lemonade server URL.
        timeout: HTTP timeout in seconds.

    Returns:
        The transcribed text (``""`` when no speech was detected).

    Raises:
        LemonadeAudioError: server unreachable, non-200 status, or malformed
            response.
    """
    return _post_transcription(
        filename,
        audio_bytes,
        model=model,
        language=language,
        base_url=base_url,
        timeout=timeout,
    )
# ────────────────────────── Text-to-speech ──────────────────────────
def synthesize(
    text: str,
    out_path: str | Path,
    *,
    voice: str = DEFAULT_TTS_VOICE,
    model: str = DEFAULT_TTS_MODEL,
    response_format: str = "mp3",
    speed: float = 1.0,
    base_url: str = LEMONADE_URL,
    timeout: float = 60.0,
) -> str:
    """POST text to Lemonade /v1/audio/speech and write the audio bytes.

    Args:
        text: text to synthesize. Keep ≤ ~500 chars for low-latency replies.
        out_path: file path to write the audio bytes to.
        voice: OpenAI voices (``"alloy"``, ``"shimmer"``, ``"ash"``, …) or
            Kokoro voices (``"af_sky"``, ``"am_echo"``, …).
        model: must be ``"kokoro-v1"`` as of Lemonade v9.4.
        response_format: ``"mp3"`` | ``"wav"`` | ``"opus"`` | ``"pcm"``.
        speed: 0.25–4.0 (default 1.0).
        base_url: Lemonade server URL.
        timeout: HTTP timeout in seconds.

    Returns:
        Absolute path string of the written file.

    Raises:
        LemonadeAudioError: propagated from :func:`synthesize_bytes`; nothing
            is written to ``out_path`` in that case.
    """
    # Fetch first so a failed request never leaves a partial/empty file.
    audio_bytes = synthesize_bytes(
        text,
        voice=voice,
        model=model,
        response_format=response_format,
        speed=speed,
        base_url=base_url,
        timeout=timeout,
    )
    target = Path(out_path)
    target.write_bytes(audio_bytes)
    # Resolve so the return value honours the documented "absolute path"
    # contract even when the caller passed a relative ``out_path``.
    return str(target.resolve())


def synthesize_bytes(
    text: str,
    *,
    voice: str = DEFAULT_TTS_VOICE,
    model: str = DEFAULT_TTS_MODEL,
    response_format: str = "mp3",
    speed: float = 1.0,
    base_url: str = LEMONADE_URL,
    timeout: float = 60.0,
) -> bytes:
    """Like :func:`synthesize` but returns the audio bytes (no file write).

    Useful for FastAPI handlers that stream the audio directly back to the
    client without touching disk.

    Returns:
        The raw response body, encoded as ``response_format``.

    Raises:
        LemonadeAudioError: server unreachable, non-200 status, or a 200
            response with an empty body.
    """
    payload = {
        "model": model,
        "input": text,
        "voice": voice,
        "response_format": response_format,
        "speed": speed,
    }
    # Tolerate a trailing slash in the configured URL ("http://host:13305/").
    url = f"{base_url.rstrip('/')}/v1/audio/speech"
    try:
        r = httpx.post(url, json=payload, timeout=timeout)
    except httpx.RequestError as e:
        raise LemonadeAudioError(
            f"Lemonade TTS unreachable at {base_url} — start the server with "
            f"`lemonade-server serve`, or set LEMONADE_BASE_URL. Original: {e}"
        ) from e

    if r.status_code != 200:
        raise LemonadeAudioError(
            f"Lemonade TTS returned {r.status_code}: {r.text[:200]}"
        )
    if not r.content:
        # No-silent-fallback: an empty "audio" payload is a failure the
        # caller must see, not a zero-byte clip to play or save.
        raise LemonadeAudioError(
            "Lemonade TTS returned 200 with an empty audio body "
            f"(model '{model}', voice '{voice}')."
        )
    return r.content
# ────────────────────────── Health probe ──────────────────────────
def lemonade_health(base_url: str = LEMONADE_URL, timeout: float = 5.0) -> dict:
    """Probe Lemonade health and return the JSON body.

    Lemonade exposes the health endpoint at ``/api/v1/health`` (its native
    namespace). Some installations also serve it at ``/v1/health`` and
    ``/health`` for compatibility, but ``/api/v1/health`` is what the rest of
    GAIA uses (see ``gaia.llm.lemonade_client.LemonadeClient.get_health``).

    Args:
        base_url: Lemonade server URL.
        timeout: HTTP timeout in seconds.

    Returns:
        The decoded JSON object returned by the health endpoint.

    Raises:
        LemonadeAudioError: server unreachable, non-200 status, non-JSON
            body, or a JSON body that is not an object.
    """
    # Tolerate a trailing slash in the configured URL ("http://host:13305/").
    url = f"{base_url.rstrip('/')}/api/v1/health"
    try:
        r = httpx.get(url, timeout=timeout)
    except httpx.RequestError as e:
        raise LemonadeAudioError(
            f"Lemonade unreachable at {base_url}. Start it with "
            f"`lemonade-server serve` or set LEMONADE_BASE_URL. Original: {e}"
        ) from e

    if r.status_code != 200:
        raise LemonadeAudioError(
            f"Lemonade /api/v1/health returned {r.status_code}: {r.text[:200]}"
        )

    try:
        body = r.json()
    except ValueError as e:
        raise LemonadeAudioError(
            f"Lemonade /api/v1/health returned non-JSON: {r.text[:200]}"
        ) from e

    # The signature promises a dict; reject valid-but-wrong JSON (list, null,
    # scalar) here rather than letting callers fail on it later.
    if not isinstance(body, dict):
        raise LemonadeAudioError(
            f"Lemonade /api/v1/health returned unexpected JSON: {body!r}"
        )
    return body
class SpeechRequest(BaseModel):
    """Request body accepted by POST /voice/speech.

    The field names match the JSON payload that is forwarded to Lemonade's
    /v1/audio/speech endpoint, so a client can send the same body to either.
    """

    # Required; an empty string is rejected by ``min_length``.
    input: str = Field(
        ...,
        min_length=1,
        description="Text to synthesize",
    )
    voice: str = Field(
        default=DEFAULT_TTS_VOICE,
        description="OpenAI or Kokoro voice name",
    )
    model: str = Field(
        default=DEFAULT_TTS_MODEL,
        description="TTS model (only kokoro-v1 today)",
    )
    response_format: str = Field(
        default="mp3",
        description="mp3 | wav | opus | pcm",
    )
    # Playback-rate multiplier, bounded on both sides.
    speed: float = Field(default=1.0, ge=0.25, le=4.0)
# ────────────────────────── Backend selector ──────────────────────────
_LEMONADE = "lemonade"
_IN_PROCESS = "in-process"


def _backend() -> str:
    """Resolve the requested voice backend. Default ``lemonade``.

    Reads ``GAIA_VOICE_BACKEND`` on every call, so the backend can be changed
    without re-importing the module.

    Returns:
        ``"lemonade"`` or ``"in-process"``. An unset or blank variable yields
        the default silently; an unrecognised value logs a warning and yields
        the default.
    """
    raw = os.getenv("GAIA_VOICE_BACKEND", "")
    # Normalise what users commonly type: surrounding whitespace, mixed case,
    # and ``in_process`` with an underscore.
    val = raw.strip().lower().replace("_", "-")
    if not val:
        # Unset or blank means "use the default"; it is not a misconfiguration.
        return _LEMONADE
    if val not in (_LEMONADE, _IN_PROCESS):
        logger.warning(
            "Unknown GAIA_VOICE_BACKEND=%r; falling back to %r", val, _LEMONADE
        )
        return _LEMONADE
    return val


# Map Lemonade STT model names → openai-whisper-package names. Used when
# the in-process backend is selected.
_WHISPER_MODEL_TO_PACKAGE = {
    "Whisper-Tiny": "tiny",
    "Whisper-Base": "base",
    "Whisper-Small": "small",
    "Whisper-Large": "large",
}


def _to_whisper_package_name(name: str) -> str:
    """Lemonade name → in-process whisper-package name.

    Exact entries in ``_WHISPER_MODEL_TO_PACKAGE`` win. Anything else is
    lower-cased and has a leading ``whisper-`` removed, so ``Whisper-Medium``
    becomes ``medium`` and ``whisper-small`` becomes ``small``. Names that
    are already short (``"tiny"``) pass through unchanged.

    Args:
        name: model name as received from the client.

    Returns:
        The lower-case name to hand to the in-process Whisper loader.
    """
    key = name.strip()
    mapped = _WHISPER_MODEL_TO_PACKAGE.get(key)
    if mapped is not None:
        return mapped

    lowered = key.lower()
    prefix = "whisper-"
    if lowered.startswith(prefix):
        return lowered[len(prefix):]
    return lowered
# ────────────────────────── /voice/transcribe (STT) ──────────────────────────
def _transcribe_in_process(audio_bytes: bytes, model: str) -> str:
    """Transcribe with the local openai-whisper backend (blocking).

    Writes the upload to a temporary ``.wav`` file, runs
    ``WhisperAsr.transcribe_file`` on it and always removes the file.

    NOTE(review): the request's ``language`` is not forwarded on this path —
    only the file path is passed to ``transcribe_file``. Confirm whether
    WhisperAsr exposes a language option.

    Raises:
        HTTPException: 503 when the optional deps are missing, 400 when the
            backend reports a missing file, 502 for any other backend error.
    """
    # WhisperAsr loads the openai-whisper model in-process. Heavier on
    # cold start (~3-10s for the model load on first call), but works
    # on macOS where Lemonade's whispercpp recipe currently does not.
    try:
        from gaia.audio.whisper_asr import WhisperAsr
    except ImportError as e:
        raise HTTPException(
            status_code=503,
            detail=(
                "in-process voice backend missing deps — install with "
                '`uv pip install -e ".[talk]"`. ' + str(e)
            ),
        ) from e

    package_name = _to_whisper_package_name(model)
    # delete=False: the file is reopened by path after this handle closes.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        tmp.write(audio_bytes)
        tmp_path = tmp.name
    try:
        asr = WhisperAsr(model_size=package_name)
        return asr.transcribe_file(tmp_path)
    except FileNotFoundError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e
    except ImportError as e:
        raise HTTPException(status_code=503, detail=str(e)) from e
    except Exception as e:  # noqa: BLE001 — mirror /voice/speech: report, don't 500
        logger.exception("in-process STT failed")
        raise HTTPException(
            status_code=502, detail=f"in-process STT failed: {e}"
        ) from e
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            # Best-effort cleanup; the transcript (or error) matters more.
            pass


@router.post("/transcribe")
async def voice_transcribe(
    audio: UploadFile = File(..., description="Audio file (WAV preferred, 16kHz mono)"),
    model: str = Form(DEFAULT_STT_MODEL),
    language: str | None = Form("en"),
):
    """Transcribe an uploaded audio clip via the configured backend.

    Args:
        audio: uploaded clip; an empty upload is rejected with 400.
        model: STT model name (Lemonade naming, e.g. ``Whisper-Small``).
        language: ISO 639-1 code; an empty value means auto-detect.

    Returns:
        ``{"text": ..., "model": ..., "backend": ...}`` where ``backend`` is
        ``"lemonade"`` or ``"in-process"``.

    Raises:
        HTTPException: 400 for an empty upload, 502 when the selected backend
            fails, 503 when the in-process deps are not installed.
    """
    # Local import: only this handler and /voice/speech need it.
    import asyncio

    audio_bytes = await audio.read()
    if not audio_bytes:
        raise HTTPException(status_code=400, detail="empty audio upload")

    # Both backends block (model load / up-to-60s HTTP call). Run them in a
    # worker thread so this coroutine does not stall the event loop.
    if _backend() == _IN_PROCESS:
        text = await asyncio.to_thread(_transcribe_in_process, audio_bytes, model)
        return {"text": text, "model": model, "backend": _IN_PROCESS}

    # Default: Lemonade-routed path.
    try:
        text = await asyncio.to_thread(
            transcribe_bytes,
            audio_bytes,
            filename=audio.filename or "audio.wav",
            model=model,
            language=language or None,
        )
    except LemonadeAudioError as e:
        # 502 = upstream Lemonade failure (we forwarded faithfully but it errored).
        raise HTTPException(status_code=502, detail=str(e)) from e
    return {"text": text, "model": model, "backend": _LEMONADE}
# ────────────────────────── /voice/speech (TTS) ──────────────────────────
# Content-Type per Lemonade ``response_format``.
_SPEECH_MEDIA_TYPES = {
    "mp3": "audio/mpeg",
    "wav": "audio/wav",
    "opus": "audio/opus",
    # PCM has no universally-played MIME without sample-rate parameter;
    # clients asking for raw PCM are expected to know the format.
    "pcm": "application/octet-stream",
}


def _synthesize_in_process(text: str, voice: str) -> bytes:
    """Render ``text`` with the local Kokoro backend and return WAV bytes.

    Blocking. The output is always 16-bit PCM WAV; the request's
    ``response_format``, ``model`` and ``speed`` are not used on this path.

    Raises:
        HTTPException: 503 when ``kokoro``/``soundfile`` are missing, 502 when
            synthesis itself fails.
    """
    try:
        from gaia.audio.kokoro_tts import KokoroTTS
    except ImportError as e:
        raise HTTPException(
            status_code=503,
            detail=(
                "in-process voice backend missing deps — install with "
                '`uv pip install -e ".[talk]"`. ' + str(e)
            ),
        ) from e

    try:
        import soundfile as sf
    except ImportError as e:
        raise HTTPException(
            status_code=503,
            detail="`soundfile` is required to encode in-process TTS output to WAV. "
            + str(e),
        ) from e

    try:
        tts = KokoroTTS()
        # KokoroTTS catalog uses Kokoro-native names ("af_bella" etc.). If
        # the caller passed an OpenAI voice ("shimmer"), Kokoro will fall
        # back to whatever default it has — we set the requested voice
        # explicitly so the failure mode is the caller's, not ours.
        tts.set_voice(voice)
        audio_array, _phonemes, meta = tts.generate_speech(text)
    except ImportError as e:
        raise HTTPException(status_code=503, detail=str(e)) from e
    except Exception as e:  # noqa: BLE001 — Kokoro raises various errors
        logger.exception("in-process TTS failed")
        raise HTTPException(
            status_code=502, detail=f"in-process TTS failed: {e}"
        ) from e

    sample_rate = meta.get("sample_rate", 24_000)
    buf = io.BytesIO()
    sf.write(buf, audio_array, samplerate=sample_rate, format="WAV", subtype="PCM_16")
    return buf.getvalue()


@router.post("/speech")
async def voice_speech(req: SpeechRequest):
    """Synthesize speech via the configured backend.

    Body validated by :class:`SpeechRequest`. Returns the raw audio bytes
    with a Content-Type matching ``response_format``.

    In-process backend caveat: Kokoro produces float32 audio at 24 kHz; we
    encode it server-side as WAV regardless of ``response_format`` (no MP3
    encoder is bundled with the in-process path). The response sets
    ``Content-Type: audio/wav`` in that case so the browser plays it correctly.

    Raises:
        HTTPException: 502 when the selected backend fails, 503 when the
            in-process deps are not installed.
    """
    # Local import: only this handler and /voice/transcribe need it.
    import asyncio

    # Synthesis blocks (model inference / up-to-60s HTTP call). Run it in a
    # worker thread so this coroutine does not stall the event loop.
    if _backend() == _IN_PROCESS:
        wav_bytes = await asyncio.to_thread(
            _synthesize_in_process, req.input, req.voice
        )
        return Response(content=wav_bytes, media_type="audio/wav")

    # Default: Lemonade-routed path.
    try:
        audio = await asyncio.to_thread(
            synthesize_bytes,
            req.input,
            voice=req.voice,
            model=req.model,
            response_format=req.response_format,
            speed=req.speed,
        )
    except LemonadeAudioError as e:
        raise HTTPException(status_code=502, detail=str(e)) from e

    media = _SPEECH_MEDIA_TYPES.get(req.response_format, "application/octet-stream")
    return Response(content=audio, media_type=media)
# ────────────────────────── /voice/health ──────────────────────────
@router.get("/health")
def voice_health():
    """Report the active backend and probe its readiness.

    For the ``lemonade`` backend, this proxies to Lemonade's /api/v1/health.
    For ``in-process``, it just confirms the local imports resolve.

    Returns:
        A dict that always carries ``backend``, ``ready``, ``detail``,
        ``stt_default`` and ``tts_default``. The ``lemonade`` backend also
        includes the upstream health body under ``lemonade``.

    Raises:
        HTTPException: 502 when the ``lemonade`` backend is selected and the
            health probe fails.
    """
    backend = _backend()
    if backend == _IN_PROCESS:
        deps_ok = True
        detail: str | None = None
        try:
            import whisper  # noqa: F401  (required by WhisperAsr)
            import kokoro  # noqa: F401  (required by KokoroTTS)
        except ImportError as e:
            deps_ok = False
            detail = (
                'in-process backend missing deps — install with `uv pip install -e ".[talk]"`. '
                + str(e)
            )
        return {
            "backend": _IN_PROCESS,
            "ready": deps_ok,
            "detail": detail,
            "stt_default": "small",
            "tts_default": "af_bella",
        }

    # Default: lemonade
    try:
        body = lemonade_health()
    except LemonadeAudioError as e:
        raise HTTPException(status_code=502, detail=str(e)) from e
    return {
        "backend": _LEMONADE,
        # Same keys as the in-process payload so clients can read ``ready``
        # without branching on the backend. Reaching this line means the
        # probe answered 200 with a JSON body.
        "ready": True,
        "detail": None,
        "lemonade": body,
        "stt_default": DEFAULT_STT_MODEL,
        "tts_default": DEFAULT_TTS_MODEL,
    }


# ────────────────────────── /voice/test (browser harness) ──────────────────────────
# Resolved relative to this file: two directories up, then static/voice_test.html.
_TEST_HTML_PATH = Path(__file__).parent.parent / "static" / "voice_test.html"


@router.get("/test", response_class=HTMLResponse)
def voice_test_page():
    """Serve a single-page browser harness for STT + TTS smoke testing.

    Open in a browser at ``http://localhost:PORT/voice/test`` (PORT being
    whatever port the UI server listens on). The page auto-probes
    /voice/health on load and displays which backend is active.

    Raises:
        HTTPException: 500 when the packaged HTML file is missing.
    """
    try:
        # Read on every request, so edits to the file show up without a restart.
        html = _TEST_HTML_PATH.read_text(encoding="utf-8")
    except FileNotFoundError as e:
        raise HTTPException(
            status_code=500,
            detail=(
                f"voice_test.html missing at {_TEST_HTML_PATH}. "
                "Reinstall gaia or restore src/gaia/ui/static/voice_test.html."
            ),
        ) from e
    return HTMLResponse(html)
+ ), + ) from e diff --git a/src/gaia/ui/server.py b/src/gaia/ui/server.py index ead8d38cf..6bfe673c0 100644 --- a/src/gaia/ui/server.py +++ b/src/gaia/ui/server.py @@ -49,6 +49,7 @@ from .database import ChatDatabase from .document_monitor import DocumentMonitor from .routers import agents as agents_router_mod +from .routers import audio as audio_router_mod from .routers import chat as chat_router_mod from .routers import documents as documents_router_mod from .routers import files as files_router_mod @@ -395,6 +396,7 @@ async def _global_exception_handler(request: Request, exc: Exception): app.include_router(files_router_mod.router) app.include_router(tunnel_router_mod.router) app.include_router(mcp_router_mod.router) + app.include_router(audio_router_mod.router) # ── Serve Uploaded Files ───────────────────────────────────────────── # Mount the uploads directory so uploaded files can be served by URL. diff --git a/src/gaia/ui/static/voice_test.html b/src/gaia/ui/static/voice_test.html new file mode 100644 index 000000000..3d281ac07 --- /dev/null +++ b/src/gaia/ui/static/voice_test.html @@ -0,0 +1,308 @@ + + + + + +GAIA — Voice Test (Lemonade STT/TTS) + + + +

🍋 GAIA Voice Test — STT/TTS

+

Smoke-test the audio pipeline through GAIA's /voice/* endpoints. Backend is selected via the GAIA_VOICE_BACKEND env var (lemonade default, in-process for macOS / when Lemonade audio recipes are unavailable).

+

Active backend (from /voice/health): checking…

+ +
+

Health

+ +
not checked yet
+ +
+ +
+

1 · Speech-to-Text

+

Click record, speak for ~5 seconds, click stop. Audio uploads to POST /voice/transcribe, which hands it to the active backend (Lemonade by default; in-process Whisper when GAIA_VOICE_BACKEND=in-process).

+
+ + +
+
+ + +
+
+ + idle +
+
(transcription will appear here)
+ +
+ +
+

2 · Text-to-Speech

+

Synthesize text via POST /voice/speech (Lemonade Kokoro by default; the in-process backend always returns WAV, whatever format is requested).

+
+ +
+ +
+ + +
+
+ + +
+
+ + +
+
+ + not run +
+ +
+ +

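Endpoints under test: POST /voice/transcribe, POST /voice/speech, GET /voice/health. With the lemonade backend, the first two proxy to Lemonade /v1/audio/* and the health check proxies to Lemonade /api/v1/health; with the in-process backend, everything runs locally.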

+ + + +