# Source code for scitex_audio._engines._luxtts_engine

#!/usr/bin/env python3
# Timestamp: "2026-03-14 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex-python/src/scitex/audio/engines/_luxtts_engine.py
# ----------------------------------------

"""
LuxTTS backend - Open-source, offline, voice-cloning TTS.

Uses the ZipVoice/LuxTTS model from HuggingFace.
Supports CPU, CUDA, and MPS devices.
48kHz output, near-realtime on CPU, 150x+ on GPU.

Install:
    pip install git+https://github.com/ysharma3501/LuxTTS.git
"""

from __future__ import annotations

import os
import threading
from pathlib import Path
from typing import List, Optional

from ._base import BaseTTS

__all__ = ["LuxTTS"]

_lock = threading.Lock()
_cached_model = None
_cached_prompt = None
_cached_ref_path = None


# (model_id, device) of the model currently held in ``_cached_model``.
_cached_model_key = None


def _get_model(device: str = "cpu", model_id: str = "YatharthS/LuxTTS"):
    """Return the process-wide LuxTTS model, loading it when necessary.

    A single model instance is kept alive at a time. It is (re)loaded when
    nothing is cached yet or when the requested ``model_id``/``device`` pair
    differs from the one the cached instance was built with, so a request
    for another device or checkpoint is never silently served by a stale
    model.

    Parameters
    ----------
    device : str
        Device string handed to the model constructor.
    model_id : str
        Model identifier handed to the model constructor.

    Returns
    -------
    object
        The cached ``zipvoice.luxvoice.LuxTTS`` instance.

    Raises
    ------
    ImportError
        If ``zipvoice`` is not installed.
    """
    global _cached_model, _cached_model_key, _cached_prompt, _cached_ref_path
    wanted = (model_id, device)
    with _lock:
        if _cached_model is None or _cached_model_key != wanted:
            # Imported lazily so this module can be imported without zipvoice.
            from zipvoice.luxvoice import LuxTTS as _LuxTTSModel

            _cached_model = _LuxTTSModel(model_id, device=device)
            _cached_model_key = wanted
            # The cached prompt was produced by the previous model instance;
            # drop it so it is re-encoded with the new one.
            _cached_prompt = None
            _cached_ref_path = None
        return _cached_model


# (ref_path, duration, rms) that ``_cached_prompt`` was encoded with.
_cached_prompt_key = None


def _get_encoded_prompt(model, ref_path: str, duration: float = 5.0, rms: float = 0.01):
    """Return the encoded prompt for a reference audio, encoding it on demand.

    Only the most recent prompt is cached. It is re-encoded whenever the
    reference path, ``duration`` or ``rms`` differs from the values used for
    the cached prompt, so changing an encoding parameter never returns a
    prompt built with the old one.

    Parameters
    ----------
    model : object
        Object exposing ``encode_prompt(ref_path, duration=..., rms=...)``.
    ref_path : str
        Path of the reference audio file.
    duration : float
        Forwarded to ``model.encode_prompt``.
    rms : float
        Forwarded to ``model.encode_prompt``.

    Returns
    -------
    object
        Whatever ``model.encode_prompt`` returned for these arguments.
    """
    global _cached_prompt, _cached_ref_path, _cached_prompt_key
    wanted = (ref_path, duration, rms)
    with _lock:
        if _cached_prompt is None or _cached_prompt_key != wanted:
            # Suppress Whisper's hallucinated transcription log
            import contextlib
            import io

            with (
                contextlib.redirect_stdout(io.StringIO()),
                contextlib.redirect_stderr(io.StringIO()),
            ):
                _cached_prompt = model.encode_prompt(
                    ref_path, duration=duration, rms=rms
                )
            # Kept in sync for any code that still inspects the path alone.
            _cached_ref_path = ref_path
            _cached_prompt_key = wanted
        return _cached_prompt


class LuxTTS(BaseTTS):
    """LuxTTS backend - open-source voice-cloning TTS.

    High-quality 48kHz output. Near-realtime on CPU, 150x+ on GPU.
    Requires a reference audio file for voice cloning.

    Install:
        pip install git+https://github.com/ysharma3501/LuxTTS.git

    Parameters
    ----------
    device : str, optional
        Device string for the model. When omitted it is auto-detected:
        ``"cuda"``, then ``"mps"``, otherwise ``"cpu"``.
    model_id : str
        Model identifier used when loading the model.
    reference_audio : str, optional
        Path of the reference audio used for voice cloning.
    num_steps : int
        Forwarded to ``generate_speech``.
    speed : float
        Forwarded to ``generate_speech`` unless ``config["speed"]`` is set.
    rms : float
        Forwarded to ``encode_prompt``.
    t_shift : float
        Forwarded to ``generate_speech``.
    return_smooth : bool
        Forwarded to ``generate_speech``.
    ref_duration : float
        Forwarded to ``encode_prompt`` as ``duration``.
    trim_start : float, optional
        Seconds removed from the start of the generated audio. When ``None``
        the value of ``SCITEX_AUDIO_LUXTTS_TRIM_START`` is used (default 0).
    **kwargs
        Forwarded to ``BaseTTS.__init__``.
    """

    # Default reference audio search paths
    _DEFAULT_REF_DIRS = [
        "~/.config/scitex/audio/reference",
        "~/.scitex/audio/reference",
    ]
    # File extensions accepted as reference audio
    _REF_EXTENSIONS = (".wav", ".mp3", ".flac", ".ogg")
    # Sample rate (Hz) used for every file written by this engine
    _SAMPLE_RATE = 48000

    def __init__(
        self,
        device: Optional[str] = None,
        model_id: str = "YatharthS/LuxTTS",
        reference_audio: Optional[str] = None,
        num_steps: int = 4,
        speed: float = 2.0,
        rms: float = 0.01,
        t_shift: float = 0.9,
        return_smooth: bool = False,
        ref_duration: float = 5.0,
        trim_start: Optional[float] = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self._device = device or self._detect_device()
        self._model_id = model_id
        self._reference_audio = reference_audio
        self.num_steps = num_steps
        self.speed = speed
        self.rms = rms
        self.t_shift = t_shift
        self.return_smooth = return_smooth
        self.ref_duration = ref_duration
        # Test against None (not truthiness) so an explicit ``trim_start=0``
        # overrides the environment variable instead of falling through to it.
        if trim_start is None:
            trim_start = float(os.environ.get("SCITEX_AUDIO_LUXTTS_TRIM_START", "0"))
        self._trim_start = trim_start

    @staticmethod
    def _detect_device() -> str:
        """Auto-detect best available device."""
        try:
            import torch

            if torch.cuda.is_available():
                return "cuda"
            if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
                return "mps"
        except ImportError:
            pass
        return "cpu"

    @property
    def name(self) -> str:
        return "luxtts"

    @property
    def requires_internet(self) -> bool:
        # Only for initial model download from HuggingFace
        return False

    def _iter_reference_files(self):
        """Yield supported audio files found in the default directories."""
        for dir_path in self._DEFAULT_REF_DIRS:
            d = Path(dir_path).expanduser()
            if d.is_dir():
                for f in sorted(d.iterdir()):
                    if f.suffix.lower() in self._REF_EXTENSIONS:
                        yield f

    def _resolve_voice(self) -> Optional[str]:
        """Map ``config["voice"]`` to a reference audio file, if possible.

        The voice may be a file path (the ``id`` reported by ``get_voices``)
        or the stem of a file in the default directories (its ``name``).
        Anything else is ignored so unrelated voice names fall through.
        """
        voice = self.config.get("voice")
        if not isinstance(voice, str) or not voice:
            return None
        path = Path(voice).expanduser()
        if path.is_file():
            return str(path)
        for f in self._iter_reference_files():
            if f.stem == voice:
                return str(f)
        return None

    def _find_reference_audio(self) -> Optional[str]:
        """Find a reference audio file from configured paths.

        Lookup order: voice selected via ``config["voice"]``, the
        ``reference_audio`` constructor argument, the
        ``SCITEX_AUDIO_LUXTTS_REFERENCE`` environment variable, then the
        default directories. Returns ``None`` when nothing is found.
        """
        # Voice chosen through speak(voice=...)
        voice_ref = self._resolve_voice()
        if voice_ref is not None:
            return voice_ref

        # Explicit config
        if self._reference_audio:
            path = Path(self._reference_audio).expanduser()
            if path.exists():
                return str(path)

        # Environment variable
        env_ref = os.environ.get("SCITEX_AUDIO_LUXTTS_REFERENCE")
        if env_ref:
            path = Path(env_ref).expanduser()
            if path.exists():
                return str(path)

        # Search default directories
        for dir_path in self._DEFAULT_REF_DIRS:
            d = Path(dir_path).expanduser()
            if d.is_dir():
                for ext in ("*.wav", "*.mp3", "*.flac", "*.ogg"):
                    files = sorted(d.glob(ext))
                    if files:
                        return str(files[0])

        return None

    def _create_default_reference(self) -> str:
        """Create a minimal reference audio when none is provided.

        Writes three seconds of low-amplitude random noise at 16 kHz to
        ``~/.config/scitex/audio/reference/default_ref.wav`` unless that
        file already exists, and returns its path.
        """
        import numpy as np
        import soundfile as sf

        ref_dir = Path("~/.config/scitex/audio/reference").expanduser()
        ref_dir.mkdir(parents=True, exist_ok=True)
        ref_path = ref_dir / "default_ref.wav"

        if not ref_path.exists():
            sr = 16000
            duration = 3
            audio = np.random.randn(sr * duration).astype(np.float32) * 0.01
            sf.write(str(ref_path), audio, sr)

        return str(ref_path)

    def _write_audio(self, wav, out_path: Path) -> Path:
        """Write ``wav`` to ``out_path`` and return the path actually written.

        ``.mp3`` and ``.ogg`` targets are produced by converting an
        intermediate WAV with pydub. Without pydub a ``.wav`` file with the
        same stem is written instead and that path is returned.
        """
        import soundfile as sf

        suffix = out_path.suffix.lower()
        if suffix not in (".mp3", ".ogg"):
            sf.write(str(out_path), wav, self._SAMPLE_RATE)
            return out_path

        try:
            from pydub import AudioSegment
        except ImportError:
            # No pydub — save as WAV instead
            wav_path = out_path.with_suffix(".wav")
            sf.write(str(wav_path), wav, self._SAMPLE_RATE)
            return wav_path

        import tempfile

        # Use a private temp file for the intermediate WAV: writing it next
        # to the target would overwrite, and then delete, an unrelated
        # "<stem>.wav" the user may already have there.
        fd, tmp_name = tempfile.mkstemp(suffix=".wav", prefix="scitex_tts_")
        os.close(fd)
        tmp_wav = Path(tmp_name)
        try:
            sf.write(str(tmp_wav), wav, self._SAMPLE_RATE)
            sound = AudioSegment.from_wav(str(tmp_wav))
            # Lower-cased suffix so ".MP3" still yields the format "mp3".
            sound.export(str(out_path), format=suffix.lstrip("."))
        finally:
            # Remove the intermediate file even when the export fails.
            tmp_wav.unlink(missing_ok=True)
        return out_path

    def synthesize(self, text: str, output_path: str) -> Path:
        """Synthesize text using LuxTTS.

        Parameters
        ----------
        text : str
            Text to speak.
        output_path : str
            Destination file. ``.mp3``/``.ogg`` need pydub; without it a
            ``.wav`` with the same stem is written instead.

        Returns
        -------
        Path
            Path of the file that was actually written.

        Raises
        ------
        ImportError
            If ``zipvoice`` is not installed.
        """
        try:
            from zipvoice.luxvoice import LuxTTS as _LuxTTSModel  # noqa: F401
        except ImportError as err:
            raise ImportError(
                "LuxTTS (zipvoice) not installed. Install with:\n"
                "  pip install git+https://github.com/ysharma3501/LuxTTS.git"
            ) from err

        # Get model (cached singleton)
        model = _get_model(device=self._device, model_id=self._model_id)

        # Find or create reference audio
        ref_path = self._find_reference_audio()
        if ref_path is None:
            ref_path = self._create_default_reference()

        # Encode prompt (cached per reference audio)
        encoded = _get_encoded_prompt(
            model, ref_path, duration=self.ref_duration, rms=self.rms
        )

        # Generate speech
        speed = self.config.get("speed", self.speed)
        audio = model.generate_speech(
            text,
            encoded,
            num_steps=self.num_steps,
            t_shift=self.t_shift,
            speed=speed,
            return_smooth=self.return_smooth,
        )

        wav = audio.cpu().numpy()
        if wav.ndim == 2:
            wav = wav[0]

        # Trim hallucinated preamble from start (seconds)
        if self._trim_start > 0:
            trim_samples = int(self._trim_start * self._SAMPLE_RATE)
            if trim_samples < len(wav):
                wav = wav[trim_samples:]

        return self._write_audio(wav, Path(output_path))

    def speak(
        self,
        text: str,
        output_path: Optional[str] = None,
        play: bool = True,
        voice: Optional[str] = None,
    ) -> dict:
        """Synthesize and optionally play. Uses .wav temp files (not .mp3).

        Parameters
        ----------
        text : str
            Text to speak.
        output_path : str, optional
            Where to keep the audio. When omitted a temporary ``.wav`` is
            used and no ``"path"`` entry is returned.
        play : bool
            Whether to play the audio after synthesis.
        voice : str, optional
            Stored in ``config["voice"]`` (it stays set for later calls).

        Returns
        -------
        dict
            ``success``, ``played``, ``play_requested`` and, when
            ``output_path`` was given, ``path``.
        """
        import tempfile

        uses_temp = not output_path
        if output_path:
            out_path = Path(output_path)
        else:
            fd, tmp_path = tempfile.mkstemp(suffix=".wav", prefix="scitex_tts_")
            os.close(fd)
            out_path = Path(tmp_path)

        if voice:
            self.config["voice"] = voice

        try:
            result_path = self.synthesize(text, str(out_path))
        except Exception:
            # Do not leave the empty temp file behind when synthesis fails.
            if uses_temp:
                out_path.unlink(missing_ok=True)
            raise

        played = False
        if play:
            played = self._play_audio(result_path)

        # A temp file the caller never learns about is useless once nothing
        # is playing it. NOTE(review): it is kept after a successful
        # _play_audio call because that method may still be reading the file
        # — confirm whether it blocks, then remove it in that case too.
        if uses_temp and not played:
            Path(result_path).unlink(missing_ok=True)

        result = {"success": True, "played": played, "play_requested": play}
        if output_path:
            result["path"] = result_path
        return result

    def get_voices(self) -> List[dict]:
        """Get available voices (reference audio files).

        Returns one entry per supported audio file in the default reference
        directories, or a single ``"default"`` entry when there are none.
        """
        voices = [
            {
                "name": f.stem,
                "id": str(f),
                "type": "reference_audio",
            }
            for f in self._iter_reference_files()
        ]

        if not voices:
            voices.append(
                {
                    "name": "default",
                    "id": "default",
                    "type": "generated",
                }
            )
        return voices
# EOF