Source code for scitex_audio._tts

#!/usr/bin/env python3
# Timestamp: "2025-12-11 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex-code/src/scitex/audio/_tts.py
# ----------------------------------------

"""
Text-to-Speech implementation using ElevenLabs API.

This module provides TTS functionality that can be used:
1. Directly via the ElevenLabs Python SDK
2. Via MCP server integration

Environment Variables:
    ELEVENLABS_API_KEY: Your ElevenLabs API key
"""

from __future__ import annotations

import os
import subprocess
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

__all__ = ["TTS", "speak"]


@dataclass
class TTSConfig:
    """Configuration for TTS."""

    voice_id: str = "pNInz6obpgDQGcFmaJgB"  # Adam (default; free-tier compatible)
    voice_name: Optional[str] = None
    model_id: str = "eleven_multilingual_v2"
    stability: float = 0.5
    similarity_boost: float = 0.75
    style: float = 0.0
    speed: float = 1.0
    output_format: str = "mp3_44100_128"



[docs]
class TTS:
    """Text-to-Speech using ElevenLabs API.

    Examples
    --------
        # Basic usage
        tts = TTS()
        tts.speak("Hello, world!")

        # With custom voice
        tts = TTS(voice_name="Adam")
        tts.speak("Processing complete")

        # Save to file without playing
        tts.speak("Test", output_path="/tmp/test.mp3", play=False)
    """

    # Popular voice presets (free-tier premade voices first, paid library voices at end)
    VOICES = {
        "adam": "pNInz6obpgDQGcFmaJgB",
        "sarah": "EXAVITQu4vr4xnSDxMaL",
        "laura": "FGY2WhTYpPnrIDTdsKH5",
        "charlie": "IKne3meq5aSn9XLyUdCD",
        "george": "JBFqnCBsd6RMkjVDRZzb",
        "callum": "N2lVS1w4EtoT3dr4eOWO",
        "river": "SAz9YHcvj6GT2YYXdXww",
        "liam": "TX3LPaxmHKxFdv7VOQHJ",
        "alice": "Xb7hH8MSUJpSbSDYk0k2",
        "matilda": "XrExE9yKIg1WjnnlVkGX",
        "will": "bIHbv24MWmeRgasZH58o",
        "jessica": "cgSgspJ2msm6clMCkdW9",
        "eric": "cjVigY5qzO86Huf0OWal",
        "bella": "hpp4J3VqNfWAUOO0d1Us",
        "chris": "iP95p4xoKVk53GoZ742B",
        "brian": "nPczCjzI2devNBz1zQrb",
        "daniel": "onwK4e9ZLuTAKqWW03F9",
        "lily": "pFZP5JQG7iQjIQuC4Bku",
        "roger": "CwhRBWXzGAHq8TQ4Fs17",
        "harry": "SOYHLrjzK2X1ezoPC6cr",
        "rachel": "21m00Tcm4TlvDq8ikWAM",
        "antoni": "ErXwobaYiN019PkySvjV",
        "domi": "AZnzlk1XvdvUeBnXmlld",
        "elli": "MF3mGyEYCl7XYWbV9V6O",
        "josh": "TxGEqnHWrfWFTfGW9XjX",
        "sam": "yoZ06aMxZJJ28mfd3POQ",
    }


[docs]
    def __init__(
        self,
        api_key: Optional[str] = None,
        voice_name: Optional[str] = None,
        voice_id: Optional[str] = None,
        client=None,
        client_factory=None,
        **kwargs,
    ):
        """Initialize TTS.

        Args:
            api_key: ElevenLabs API key. Defaults to ELEVENLABS_API_KEY env var.
            voice_name: Voice name (e.g., "Adam", "Sarah", "George" — free-tier).
            voice_id: Direct voice ID (overrides voice_name).
            client: Optional pre-built client (testing). When given, the
                lazy-load is skipped.
            client_factory: Optional callable ``(api_key) -> client`` used by
                the lazy ``client`` property instead of the real ElevenLabs
                SDK (testing). Lets a test exercise the import-error path
                without uninstalling the dependency.
            **kwargs: Additional config options (stability, speed, etc.)
        """
        self.api_key = (
            api_key
            or os.environ.get("SCITEX_AUDIO_ELEVENLABS_API_KEY")
            or os.environ.get("ELEVENLABS_API_KEY")
        )
        self.config = TTSConfig(**kwargs)

        if voice_id:
            self.config.voice_id = voice_id
        elif voice_name:
            self.config.voice_name = voice_name
            normalized = voice_name.lower()
            if normalized in self.VOICES:
                self.config.voice_id = self.VOICES[normalized]

        self._client = client
        self._client_factory = client_factory


    @property
    def client(self):
        """Lazy-load ElevenLabs client."""
        if self._client is None:
            if self._client_factory is not None:
                self._client = self._client_factory(self.api_key)
            else:
                try:
                    from elevenlabs.client import ElevenLabs

                    self._client = ElevenLabs(api_key=self.api_key)
                except ImportError:
                    raise ImportError(
                        "elevenlabs package not installed. "
                        "Install with: pip install elevenlabs"
                    )
        return self._client


[docs]
    def speak(
        self,
        text: str,
        output_path: Optional[str] = None,
        play: bool = True,
        voice_name: Optional[str] = None,
        voice_id: Optional[str] = None,
    ) -> Optional[Path]:
        """Convert text to speech and optionally play it.

        Args:
            text: Text to convert to speech.
            output_path: Path to save audio file. Auto-generated if None.
            play: Whether to play the audio after generation.
            voice_name: Override voice name for this call.
            voice_id: Override voice ID for this call.

        Returns
        -------
            Path to the generated audio file, or None if only played.
        """
        # Determine voice
        vid = voice_id or self.config.voice_id
        if voice_name and not voice_id:
            normalized = voice_name.lower()
            vid = self.VOICES.get(normalized, vid)

        # Generate audio
        audio = self.client.text_to_speech.convert(
            text=text,
            voice_id=vid,
            model_id=self.config.model_id,
            voice_settings={
                "stability": self.config.stability,
                "similarity_boost": self.config.similarity_boost,
                "style": self.config.style,
                "speed": self.config.speed,
            },
            output_format=self.config.output_format,
        )

        # Determine output path
        if output_path:
            out_path = Path(output_path)
        else:
            suffix = ".mp3" if "mp3" in self.config.output_format else ".wav"
            fd, tmp_path = tempfile.mkstemp(suffix=suffix, prefix="scitex_tts_")
            os.close(fd)
            out_path = Path(tmp_path)

        # Write audio to file
        with open(out_path, "wb") as f:
            for chunk in audio:
                f.write(chunk)

        # Play if requested
        if play:
            self._play_audio(out_path)

        return out_path if output_path else None


    def _play_audio(self, path: Path, runner=None) -> None:
        """Play audio file using available system player.

        Includes Windows fallback for WSL environments.

        Args:
            path: Audio file to play.
            runner: Injectable subprocess runner (testing). A callable with
                the ``subprocess.run`` signature; defaults to the real
                ``subprocess.run`` when ``None``.
        """
        run = runner if runner is not None else subprocess.run

        # Check if we're in WSL - if so, prefer Windows playback directly
        # to avoid double playback issues with Linux audio hanging
        if os.path.exists("/mnt/c/Windows"):
            if self._play_audio_windows(path):
                return
            # Fall through to Linux players if Windows playback fails

        players = [
            ["mpv", "--no-video", str(path)],
            ["ffplay", "-nodisp", "-autoexit", str(path)],
            ["aplay", str(path)],
            ["afplay", str(path)],  # macOS
        ]

        for player_cmd in players:
            try:
                run(
                    player_cmd,
                    check=True,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    timeout=30,
                )
                return
            except subprocess.TimeoutExpired:
                # Audio playback hung, don't try more players
                return
            except (subprocess.CalledProcessError, FileNotFoundError):
                continue

        print(f"Warning: No audio player found. Audio saved to: {path}")

    def _play_audio_windows(self, path: Path) -> bool:
        """Play audio via Windows PowerShell SoundPlayer (WSL fallback).

        Uses headless SoundPlayer - no GUI popup.
        """
        import shutil
        import tempfile

        # Check if we're in WSL
        if not os.path.exists("/mnt/c/Windows"):
            return False

        powershell = shutil.which("powershell.exe")
        if not powershell:
            return False

        try:
            # SoundPlayer only supports WAV, so convert if needed
            wav_path = path
            if path.suffix.lower() in (".mp3", ".ogg", ".m4a"):
                try:
                    from pydub import AudioSegment

                    fd, tmp_wav = tempfile.mkstemp(suffix=".wav", prefix="scitex_")
                    os.close(fd)
                    wav_path = Path(tmp_wav)
                    audio = AudioSegment.from_file(str(path))
                    audio.export(str(wav_path), format="wav")
                except ImportError:
                    pass

            result = subprocess.run(
                ["wslpath", "-w", str(wav_path)],
                capture_output=True,
                text=True,
                timeout=5,
            )
            if result.returncode != 0:
                return False

            windows_path = result.stdout.strip()

            ps_command = f"""
$player = New-Object System.Media.SoundPlayer
$player.SoundLocation = "{windows_path}"
$player.PlaySync()
"""
            subprocess.run(
                [powershell, "-NoProfile", "-Command", ps_command],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=60,
            )

            # Clean up temp WAV
            if wav_path != path and wav_path.exists():
                try:
                    wav_path.unlink()
                except Exception:
                    pass

            return True

        except Exception:
            return False


[docs]
    def list_voices(self) -> list:
        """List available voices from ElevenLabs."""
        response = self.client.voices.get_all()
        return [
            {"name": v.name, "voice_id": v.voice_id, "labels": v.labels}
            for v in response.voices
        ]




# Module-level convenience function
_default_tts: Optional[TTS] = None


def speak(
    text: str,
    voice: Optional[str] = None,
    play: bool = True,
    output_path: Optional[str] = None,
    **kwargs,
) -> Optional[Path]:
    """Convenience function for quick TTS.

    Args:
        text: Text to speak.
        voice: Voice name (e.g., "Rachel", "Adam").
        play: Whether to play audio.
        output_path: Optional path to save audio.
        **kwargs: Additional TTS config options.

    Returns
    -------
        Path to audio file if output_path specified, else None.

    Examples
    --------
        import scitex

        # Simple speak
        scitex.audio.speak("Hello!")

        # With specific voice
        scitex.audio.speak("Processing complete", voice="Adam")

        # Save without playing
        scitex.audio.speak("Test", play=False, output_path="/tmp/test.mp3")
    """
    global _default_tts

    if _default_tts is None or kwargs:
        _default_tts = TTS(**kwargs)

    return _default_tts.speak(
        text=text,
        voice_name=voice,
        play=play,
        output_path=output_path,
    )


# EOF