#!/usr/bin/env python3
# Timestamp: "2025-12-11 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex-code/src/scitex/audio/_tts.py
# ----------------------------------------
"""
Text-to-Speech implementation using ElevenLabs API.
This module provides TTS functionality that can be used:
1. Directly via the ElevenLabs Python SDK
2. Via MCP server integration
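Environment Variables:
    SCITEX_AUDIO_ELEVENLABS_API_KEY: Your ElevenLabs API key (checked first).
    ELEVENLABS_API_KEY: Your ElevenLabs API key (used when the variable
        above is unset or empty).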
"""
from __future__ import annotations
import os
import subprocess
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
__all__ = ["TTS", "speak"]
@dataclass
class TTSConfig:
    """Synthesis settings held by a ``TTS`` instance.

    ``voice_id``, ``model_id`` and ``output_format`` are passed straight to
    the text-to-speech conversion request; ``stability``,
    ``similarity_boost``, ``style`` and ``speed`` are sent together as the
    request's ``voice_settings``. ``voice_name`` only records the name the
    caller asked for.
    """

    # Voice used when the caller does not pick one: "Adam"
    # (default; free-tier compatible).
    voice_id: str = "pNInz6obpgDQGcFmaJgB"

    # Name the caller selected the voice by, if any.
    voice_name: Optional[str] = None

    # Identifier of the synthesis model.
    model_id: str = "eleven_multilingual_v2"

    # Voice-settings values forwarded with every conversion request.
    stability: float = 0.5
    similarity_boost: float = 0.75
    style: float = 0.0
    speed: float = 1.0

    # Requested audio encoding; a value containing "mp3" makes temporary
    # output files use the ".mp3" suffix, anything else ".wav".
    output_format: str = "mp3_44100_128"
class TTS:
    """Text-to-Speech using ElevenLabs API.

    Audio is requested through ``client.text_to_speech.convert``, written to
    a file, and optionally played with the first system audio player that
    works (Windows PowerShell is tried first when running under WSL).

    Examples
    --------
    # Basic usage
    tts = TTS()
    tts.speak("Hello, world!")

    # With custom voice
    tts = TTS(voice_name="Adam")
    tts.speak("Processing complete")

    # Save to file without playing
    tts.speak("Test", output_path="/tmp/test.mp3", play=False)
    """

    # Popular voice presets (free-tier premade voices first, paid library voices at end)
    VOICES = {
        "adam": "pNInz6obpgDQGcFmaJgB",
        "sarah": "EXAVITQu4vr4xnSDxMaL",
        "laura": "FGY2WhTYpPnrIDTdsKH5",
        "charlie": "IKne3meq5aSn9XLyUdCD",
        "george": "JBFqnCBsd6RMkjVDRZzb",
        "callum": "N2lVS1w4EtoT3dr4eOWO",
        "river": "SAz9YHcvj6GT2YYXdXww",
        "liam": "TX3LPaxmHKxFdv7VOQHJ",
        "alice": "Xb7hH8MSUJpSbSDYk0k2",
        "matilda": "XrExE9yKIg1WjnnlVkGX",
        "will": "bIHbv24MWmeRgasZH58o",
        "jessica": "cgSgspJ2msm6clMCkdW9",
        "eric": "cjVigY5qzO86Huf0OWal",
        "bella": "hpp4J3VqNfWAUOO0d1Us",
        "chris": "iP95p4xoKVk53GoZ742B",
        "brian": "nPczCjzI2devNBz1zQrb",
        "daniel": "onwK4e9ZLuTAKqWW03F9",
        "lily": "pFZP5JQG7iQjIQuC4Bku",
        "roger": "CwhRBWXzGAHq8TQ4Fs17",
        "harry": "SOYHLrjzK2X1ezoPC6cr",
        "rachel": "21m00Tcm4TlvDq8ikWAM",
        "antoni": "ErXwobaYiN019PkySvjV",
        "domi": "AZnzlk1XvdvUeBnXmlld",
        "elli": "MF3mGyEYCl7XYWbV9V6O",
        "josh": "TxGEqnHWrfWFTfGW9XjX",
        "sam": "yoZ06aMxZJJ28mfd3POQ",
    }

    def __init__(
        self,
        api_key: Optional[str] = None,
        voice_name: Optional[str] = None,
        voice_id: Optional[str] = None,
        client=None,
        client_factory=None,
        **kwargs,
    ):
        """Initialize TTS.

        Args:
            api_key: ElevenLabs API key. Defaults to the
                SCITEX_AUDIO_ELEVENLABS_API_KEY env var, then to
                ELEVENLABS_API_KEY.
            voice_name: Voice name (e.g., "Adam", "Sarah", "George" — free-tier).
                Matched case-insensitively against ``VOICES``; a name that
                is not a preset is recorded but leaves the default voice ID
                in place.
            voice_id: Direct voice ID (overrides voice_name).
            client: Optional pre-built client (testing). When given, the
                lazy-load is skipped.
            client_factory: Optional callable ``(api_key) -> client`` used by
                the lazy ``client`` property instead of the real ElevenLabs
                SDK (testing). Lets a test exercise the import-error path
                without uninstalling the dependency.
            **kwargs: Additional config options (stability, speed, etc.)
                passed to ``TTSConfig``.
        """
        self.api_key = (
            api_key
            or os.environ.get("SCITEX_AUDIO_ELEVENLABS_API_KEY")
            or os.environ.get("ELEVENLABS_API_KEY")
        )
        self.config = TTSConfig(**kwargs)
        if voice_id:
            self.config.voice_id = voice_id
        elif voice_name:
            self.config.voice_name = voice_name
            normalized = voice_name.lower()
            if normalized in self.VOICES:
                self.config.voice_id = self.VOICES[normalized]
        self._client = client
        self._client_factory = client_factory

    @property
    def client(self):
        """Lazy-load ElevenLabs client.

        Raises
        ------
        ImportError
            If no client or factory was injected and the ``elevenlabs``
            package cannot be imported.
        """
        if self._client is None:
            if self._client_factory is not None:
                self._client = self._client_factory(self.api_key)
            else:
                try:
                    from elevenlabs.client import ElevenLabs

                    self._client = ElevenLabs(api_key=self.api_key)
                except ImportError as err:
                    # Chain the original error so the real cause stays visible.
                    raise ImportError(
                        "elevenlabs package not installed. "
                        "Install with: pip install elevenlabs"
                    ) from err
        return self._client

    def speak(
        self,
        text: str,
        output_path: Optional[str] = None,
        play: bool = True,
        voice_name: Optional[str] = None,
        voice_id: Optional[str] = None,
    ) -> Optional[Path]:
        """Convert text to speech and optionally play it.

        Args:
            text: Text to convert to speech.
            output_path: Path to save audio file. When None, a temporary
                file is used and removed again once it is no longer needed.
            play: Whether to play the audio after generation.
            voice_name: Override voice name for this call. Names that are
                not in ``VOICES`` fall back to the configured voice.
            voice_id: Override voice ID for this call (wins over voice_name).

        Returns
        -------
        Path to the generated audio file, or None if only played.
        """
        # Determine voice: explicit ID > preset name > configured default.
        vid = voice_id or self.config.voice_id
        if voice_name and not voice_id:
            normalized = voice_name.lower()
            vid = self.VOICES.get(normalized, vid)

        # Generate audio
        audio = self.client.text_to_speech.convert(
            text=text,
            voice_id=vid,
            model_id=self.config.model_id,
            voice_settings={
                "stability": self.config.stability,
                "similarity_boost": self.config.similarity_boost,
                "style": self.config.style,
                "speed": self.config.speed,
            },
            output_format=self.config.output_format,
        )

        # Determine output path
        is_temp = not output_path
        if is_temp:
            suffix = ".mp3" if "mp3" in self.config.output_format else ".wav"
            fd, tmp_path = tempfile.mkstemp(suffix=suffix, prefix="scitex_tts_")
            os.close(fd)
            out_path = Path(tmp_path)
        else:
            out_path = Path(output_path)

        # The caller never learns the temporary path (None is returned), so
        # the file must not outlive this call.
        discard_temp = is_temp
        try:
            # Write audio to file
            with open(out_path, "wb") as f:
                for chunk in audio:
                    f.write(chunk)

            # Play if requested
            if play and not self._play_audio(out_path):
                # No player worked: the warning printed by _play_audio
                # advertises this path, so leave the file in place.
                discard_temp = False
        finally:
            if discard_temp:
                try:
                    out_path.unlink()
                except OSError:
                    pass

        return out_path if output_path else None

    def _play_audio(self, path: Path, runner=None) -> bool:
        """Play audio file using available system player.

        Includes Windows fallback for WSL environments.

        Args:
            path: Audio file to play.
            runner: Injectable subprocess runner (testing). A callable with
                the ``subprocess.run`` signature; defaults to the real
                ``subprocess.run`` when ``None``.

        Returns
        -------
        True when a player handled the file (including a playback that hit
        the timeout), False when no player could be used.
        """
        run = runner if runner is not None else subprocess.run

        # Check if we're in WSL - if so, prefer Windows playback directly
        # to avoid double playback issues with Linux audio hanging
        if os.path.exists("/mnt/c/Windows"):
            if self._play_audio_windows(path):
                return True
            # Fall through to Linux players if Windows playback fails

        players = [
            ["mpv", "--no-video", str(path)],
            ["ffplay", "-nodisp", "-autoexit", str(path)],
            ["aplay", str(path)],
            ["afplay", str(path)],  # macOS
        ]
        for player_cmd in players:
            try:
                run(
                    player_cmd,
                    check=True,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=30,
                )
            except subprocess.TimeoutExpired:
                # Audio playback hung, don't try more players
                return True
            except (subprocess.CalledProcessError, FileNotFoundError):
                continue
            return True

        print(f"Warning: No audio player found. Audio saved to: {path}")
        return False

    @staticmethod
    def _quote_powershell_literal(value: str) -> str:
        """Return *value* as a single-quoted PowerShell string literal.

        Single-quoted PowerShell strings perform no ``$`` or backtick
        expansion; an embedded quote is escaped by doubling it. PowerShell
        also accepts the typographic quotes U+2018-U+201B as single-quote
        delimiters, so those are doubled as well.
        """
        for quote in ("'", "\u2018", "\u2019", "\u201a", "\u201b"):
            value = value.replace(quote, quote * 2)
        return f"'{value}'"

    def _play_audio_windows(self, path: Path) -> bool:
        """Play audio via Windows PowerShell SoundPlayer (WSL fallback).

        Uses headless SoundPlayer - no GUI popup.

        Args:
            path: Audio file to play. ``.mp3``/``.ogg``/``.m4a`` files are
                converted to a temporary WAV with ``pydub`` first.

        Returns
        -------
        True when PowerShell played the file (or playback ran into the
        timeout), False when Windows playback is unavailable or failed so
        the caller can try other players.
        """
        import shutil

        # Check if we're in WSL
        if not os.path.exists("/mnt/c/Windows"):
            return False
        powershell = shutil.which("powershell.exe")
        if not powershell:
            return False

        tmp_wav = None
        try:
            # SoundPlayer only supports WAV, so convert if needed
            wav_path = path
            if path.suffix.lower() in (".mp3", ".ogg", ".m4a"):
                try:
                    from pydub import AudioSegment
                except ImportError:
                    # Cannot convert, and SoundPlayer cannot play this
                    # format: report failure so other players are tried.
                    return False
                fd, tmp_name = tempfile.mkstemp(suffix=".wav", prefix="scitex_")
                os.close(fd)
                tmp_wav = wav_path = Path(tmp_name)
                AudioSegment.from_file(str(path)).export(str(wav_path), format="wav")

            result = subprocess.run(
                ["wslpath", "-w", str(wav_path)],
                capture_output=True,
                text=True,
                timeout=5,
            )
            if result.returncode != 0:
                return False
            windows_path = result.stdout.strip()

            # Embed the path as a single-quoted literal so characters such
            # as `$`, backticks or quotes in it are not interpreted.
            location = self._quote_powershell_literal(windows_path)
            ps_command = "\n".join(
                [
                    "$player = New-Object System.Media.SoundPlayer",
                    f"$player.SoundLocation = {location}",
                    "$player.PlaySync()",
                ]
            )
            try:
                completed = subprocess.run(
                    [powershell, "-NoProfile", "-Command", ps_command],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=60,
                )
            except subprocess.TimeoutExpired:
                # Playback was running; reporting failure here would make
                # the caller replay the audio with another player.
                return True
            # A non-zero exit status means the script failed, so let the
            # caller fall back to the Linux players.
            return completed.returncode == 0
        except Exception:
            # Best-effort fallback path: any failure simply means "not played".
            return False
        finally:
            # Clean up temp WAV on every exit path
            if tmp_wav is not None:
                try:
                    tmp_wav.unlink()
                except OSError:
                    pass

    def list_voices(self) -> list:
        """List available voices from ElevenLabs.

        Returns
        -------
        One dict per voice in the ``client.voices.get_all()`` response, with
        the keys ``name``, ``voice_id`` and ``labels``.
        """
        response = self.client.voices.get_all()
        return [
            {"name": v.name, "voice_id": v.voice_id, "labels": v.labels}
            for v in response.voices
        ]
# Shared TTS instance reused by the module-level ``speak`` helper.
_default_tts: Optional[TTS] = None


def speak(
    text: str,
    voice: Optional[str] = None,
    play: bool = True,
    output_path: Optional[str] = None,
    **kwargs,
) -> Optional[Path]:
    """Speak *text* through a shared, lazily created ``TTS`` instance.

    The shared instance is built on first use. Whenever any ``kwargs`` are
    supplied it is rebuilt from them, and that new instance stays in place
    for later calls.

    Args:
        text: Text to speak.
        voice: Voice name (e.g., "Rachel", "Adam").
        play: Whether to play audio.
        output_path: Optional path to save audio.
        **kwargs: Additional TTS config options, forwarded to ``TTS(...)``.

    Returns
    -------
    Path to audio file if output_path specified, else None.

    Examples
    --------
    import scitex

    # Simple speak
    scitex.audio.speak("Hello!")

    # With specific voice
    scitex.audio.speak("Processing complete", voice="Adam")

    # Save without playing
    scitex.audio.speak("Test", play=False, output_path="/tmp/test.mp3")
    """
    global _default_tts

    rebuild = bool(kwargs) or _default_tts is None
    if rebuild:
        _default_tts = TTS(**kwargs)

    engine = _default_tts
    return engine.speak(
        text=text,
        output_path=output_path,
        play=play,
        voice_name=voice,
    )
# EOF