#!/usr/bin/env python3
# Timestamp: "2026-03-14 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex-python/src/scitex/audio/engines/_luxtts_engine.py
# ----------------------------------------
"""
LuxTTS backend - Open-source, offline, voice-cloning TTS.
Uses the ZipVoice/LuxTTS model from HuggingFace.
Supports CPU, CUDA, and MPS devices.
48kHz output, near-realtime on CPU, 150x+ on GPU.
Install:
pip install git+https://github.com/ysharma3501/LuxTTS.git
"""
from __future__ import annotations
import os
import threading
from pathlib import Path
from typing import List, Optional
from ._base import BaseTTS
__all__ = ["LuxTTS"]
# A single lock guards every module-level cache below, so model loading and
# prompt encoding never run concurrently in different threads.
_lock = threading.Lock()
# The one model instance kept alive by this module (None until first use).
_cached_model = None
# (device, model_id) that ``_cached_model`` was built with.
_cached_model_key = None
# Encoded reference prompt and the key it was computed for.
_cached_prompt = None
_cached_ref_path = None


def _get_model(device: str = "cpu", model_id: str = "YatharthS/LuxTTS"):
    """Return the cached LuxTTS model, loading it when necessary.

    Only one model is kept at a time. The cache is keyed on
    ``(device, model_id)``: asking for a different device or model replaces
    the cached instance instead of silently returning the one that was
    loaded first. Callers that alternate between configurations will
    therefore trigger a reload on every switch.

    Args:
        device: Device string forwarded to the model constructor.
        model_id: Model identifier forwarded to the model constructor.

    Returns:
        The ``zipvoice.luxvoice.LuxTTS`` instance for the requested
        configuration.

    Raises:
        ImportError: If the ``zipvoice`` package is not installed.
    """
    global _cached_model, _cached_model_key, _cached_prompt, _cached_ref_path
    requested = (device, model_id)
    with _lock:
        if _cached_model is None or _cached_model_key != requested:
            # Imported lazily so that importing this module does not require
            # the optional dependency.
            from zipvoice.luxvoice import LuxTTS as _LuxTTSModel

            # Drop the previous model before building the new one so both do
            # not have to be resident at once, and forget the encoded prompt
            # because it was produced by the model being replaced.
            _cached_model = None
            _cached_prompt = None
            _cached_ref_path = None
            _cached_model = _LuxTTSModel(model_id, device=device)
            _cached_model_key = requested
        return _cached_model
def _get_encoded_prompt(model, ref_path: str, duration: float = 5.0, rms: float = 0.01):
    """Return the encoded voice prompt for a reference audio file, with caching.

    A single prompt is cached. It is reused only when the model instance,
    the reference path, ``duration`` and ``rms`` all match the previous call;
    changing any of them re-encodes the prompt. (Keying on the path alone
    would hand back a prompt encoded with stale ``duration``/``rms`` values.)

    Args:
        model: Object exposing ``encode_prompt(path, duration=..., rms=...)``.
        ref_path: Path of the reference audio file.
        duration: Forwarded to ``model.encode_prompt``.
        rms: Forwarded to ``model.encode_prompt``.

    Returns:
        Whatever ``model.encode_prompt`` returned for this configuration.
    """
    global _cached_prompt, _cached_ref_path
    import contextlib
    import io

    # ``_cached_ref_path`` stores the whole cache key, not just the path, so
    # no additional module-level state is needed.
    cache_key = (id(model), str(ref_path), duration, rms)
    with _lock:
        if _cached_prompt is None or _cached_ref_path != cache_key:
            # Suppress Whisper's hallucinated transcription log
            with (
                contextlib.redirect_stdout(io.StringIO()),
                contextlib.redirect_stderr(io.StringIO()),
            ):
                prompt = model.encode_prompt(ref_path, duration=duration, rms=rms)
            # Publish prompt and key together, only after encoding succeeded.
            _cached_prompt = prompt
            _cached_ref_path = cache_key
        return _cached_prompt
class LuxTTS(BaseTTS):
    """LuxTTS backend - open-source voice-cloning TTS.

    High-quality 48kHz output. Near-realtime on CPU, 150x+ on GPU.
    Requires a reference audio file for voice cloning.

    Install: pip install git+https://github.com/ysharma3501/LuxTTS.git

    The reference clip is resolved in this order: the ``voice`` stored in
    ``self.config`` (set by :meth:`speak`), the ``reference_audio``
    constructor argument, the ``SCITEX_AUDIO_LUXTTS_REFERENCE`` environment
    variable, the default reference directories, and finally a generated
    placeholder clip.
    """

    # Default reference audio search paths
    _DEFAULT_REF_DIRS = [
        "~/.config/scitex/audio/reference",
        "~/.scitex/audio/reference",
    ]
    # Accepted reference-clip extensions, in search-priority order.
    _REF_EXTENSIONS = (".wav", ".mp3", ".flac", ".ogg")
    # Location of the auto-generated placeholder clip (last-resort fallback).
    _GENERATED_REF = "~/.config/scitex/audio/reference/default_ref.wav"
    # Sample rate (Hz) used when writing LuxTTS output.
    _SAMPLE_RATE = 48000

    def __init__(
        self,
        device: Optional[str] = None,
        model_id: str = "YatharthS/LuxTTS",
        reference_audio: Optional[str] = None,
        num_steps: int = 4,
        speed: float = 2.0,
        rms: float = 0.01,
        t_shift: float = 0.9,
        return_smooth: bool = False,
        ref_duration: float = 5.0,
        trim_start: Optional[float] = None,
        **kwargs,
    ):
        """Configure the backend; no model is loaded until synthesis.

        Args:
            device: Device string; auto-detected when omitted.
            model_id: Model identifier passed to the model loader.
            reference_audio: Path of the clip whose voice should be cloned.
            num_steps: Forwarded to ``generate_speech``.
            speed: Default speed; ``self.config["speed"]`` overrides it.
            rms: Forwarded to ``encode_prompt``.
            t_shift: Forwarded to ``generate_speech``.
            return_smooth: Forwarded to ``generate_speech``.
            ref_duration: Forwarded to ``encode_prompt`` as ``duration``.
            trim_start: Seconds to cut from the start of the output. When
                ``None``, ``SCITEX_AUDIO_LUXTTS_TRIM_START`` is used
                (default 0). An explicit ``0`` disables trimming even if the
                environment variable is set.
            **kwargs: Passed through to ``BaseTTS.__init__``.

        Raises:
            ValueError: If the environment variable is not a valid float.
        """
        super().__init__(**kwargs)
        self._device = device or self._detect_device()
        self._model_id = model_id
        self._reference_audio = reference_audio
        self.num_steps = num_steps
        self.speed = speed
        self.rms = rms
        self.t_shift = t_shift
        self.return_smooth = return_smooth
        self.ref_duration = ref_duration
        # ``is None`` rather than truthiness: 0 is a legitimate explicit value.
        if trim_start is None:
            env_trim = os.environ.get("SCITEX_AUDIO_LUXTTS_TRIM_START") or "0"
            trim_start = float(env_trim)
        self._trim_start = trim_start

    @staticmethod
    def _detect_device() -> str:
        """Auto-detect best available device ("cuda", "mps" or "cpu")."""
        try:
            import torch
        except ImportError:
            return "cpu"
        if torch.cuda.is_available():
            return "cuda"
        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            return "mps"
        return "cpu"

    @property
    def name(self) -> str:
        """Backend identifier."""
        return "luxtts"

    @property
    def requires_internet(self) -> bool:
        """Whether synthesis needs network access."""
        # Only for initial model download from HuggingFace
        return False

    @classmethod
    def _generated_reference_path(cls) -> Path:
        """Return the expanded path of the auto-generated placeholder clip."""
        return Path(cls._GENERATED_REF).expanduser()

    @classmethod
    def _reference_files(cls, directory: Path) -> List[Path]:
        """List user-provided reference clips in ``directory``, sorted by path.

        Extensions are matched case-insensitively. The auto-generated
        placeholder clip is excluded so that it can never shadow a real
        recording or be offered as a voice.
        """
        if not directory.is_dir():
            return []
        generated = cls._generated_reference_path()
        return [
            entry
            for entry in sorted(directory.iterdir())
            if entry.suffix.lower() in cls._REF_EXTENSIONS
            and entry.is_file()
            and entry != generated
        ]

    def _resolve_voice(self, voice) -> Optional[str]:
        """Map a voice id or name to a reference file path, if possible.

        Accepts either a path to an existing file (the ``id`` values produced
        by :meth:`get_voices`) or the name/stem of a clip in one of the
        default reference directories. Returns ``None`` for anything else,
        including the ``"default"`` placeholder voice.
        """
        if not voice or not isinstance(voice, str) or voice == "default":
            return None
        candidate = Path(voice).expanduser()
        if candidate.is_file():
            return str(candidate)
        for dir_path in self._DEFAULT_REF_DIRS:
            for entry in self._reference_files(Path(dir_path).expanduser()):
                if voice in (entry.stem, entry.name):
                    return str(entry)
        return None

    def _find_reference_audio(self) -> Optional[str]:
        """Find a reference audio file from configured paths.

        Returns:
            The first usable path, or ``None`` when no reference clip is
            configured or present. Sources that point at a missing file are
            skipped rather than treated as errors.
        """
        # Voice selected through speak(voice=...) or the backend config
        config = getattr(self, "config", None) or {}
        voice_ref = self._resolve_voice(config.get("voice"))
        if voice_ref is not None:
            return voice_ref

        # Explicit config, then environment variable
        explicit_sources = (
            self._reference_audio,
            os.environ.get("SCITEX_AUDIO_LUXTTS_REFERENCE"),
        )
        for source in explicit_sources:
            if source:
                path = Path(source).expanduser()
                if path.is_file():
                    return str(path)

        # Search default directories: within a directory the extension
        # priority wins first, then path order (``min`` keeps the first of
        # equally ranked candidates, and the list is already sorted).
        for dir_path in self._DEFAULT_REF_DIRS:
            candidates = self._reference_files(Path(dir_path).expanduser())
            if candidates:
                best = min(
                    candidates,
                    key=lambda f: self._REF_EXTENSIONS.index(f.suffix.lower()),
                )
                return str(best)
        return None

    def _create_default_reference(self) -> str:
        """Create a minimal reference audio when none is provided.

        Writes 3 seconds of low-amplitude random noise at 16 kHz to the
        placeholder path unless that file already exists. The clip carries no
        speaker identity; it only lets synthesis proceed without a recording.

        Returns:
            Path of the placeholder clip as a string.
        """
        import numpy as np
        import soundfile as sf

        ref_path = self._generated_reference_path()
        ref_path.parent.mkdir(parents=True, exist_ok=True)
        if not ref_path.exists():
            sr = 16000
            duration = 3
            audio = np.random.randn(sr * duration).astype(np.float32) * 0.01
            sf.write(str(ref_path), audio, sr)
        return str(ref_path)

    def _write_compressed(self, wav, out_path: Path) -> Path:
        """Write ``wav`` to ``out_path`` as MP3/OGG via pydub.

        Without pydub the audio is written next to ``out_path`` with a
        ``.wav`` suffix and that path is returned instead. With pydub, the
        intermediate WAV goes to a private temporary file so that an existing
        sibling ``.wav`` is never overwritten, and it is removed even when
        the export fails.
        """
        import tempfile

        import soundfile as sf

        try:
            from pydub import AudioSegment
        except ImportError:
            # No pydub — save as WAV instead
            wav_path = out_path.with_suffix(".wav")
            sf.write(str(wav_path), wav, self._SAMPLE_RATE)
            return wav_path

        fd, tmp_name = tempfile.mkstemp(suffix=".wav", prefix="scitex_tts_")
        os.close(fd)
        tmp_wav = Path(tmp_name)
        try:
            sf.write(str(tmp_wav), wav, self._SAMPLE_RATE)
            sound = AudioSegment.from_wav(str(tmp_wav))
            sound.export(str(out_path), format=out_path.suffix.lstrip(".").lower())
        finally:
            tmp_wav.unlink(missing_ok=True)
        return out_path

    def synthesize(self, text: str, output_path: str) -> Path:
        """Synthesize text using LuxTTS.

        Args:
            text: Text to speak.
            output_path: Destination file. ``.mp3``/``.ogg`` are produced by
                converting through pydub; other suffixes are written directly
                by ``soundfile``. Missing parent directories are created.

        Returns:
            Path of the file actually written. This differs from
            ``output_path`` only when MP3/OGG was requested but pydub is not
            installed, in which case a ``.wav`` file is written instead.

        Raises:
            ImportError: If the ``zipvoice`` package is not installed.
        """
        try:
            from zipvoice.luxvoice import LuxTTS as _LuxTTSModel  # noqa: F401
        except ImportError as exc:
            raise ImportError(
                "LuxTTS (zipvoice) not installed. Install with:\n"
                " pip install git+https://github.com/ysharma3501/LuxTTS.git"
            ) from exc
        import soundfile as sf

        # Get model (cached singleton)
        model = _get_model(device=self._device, model_id=self._model_id)

        # Find or create reference audio
        ref_path = self._find_reference_audio()
        if ref_path is None:
            ref_path = self._create_default_reference()

        # Encode prompt (cached per reference audio)
        encoded = _get_encoded_prompt(
            model, ref_path, duration=self.ref_duration, rms=self.rms
        )

        # Generate speech
        speed = self.config.get("speed", self.speed)
        audio = model.generate_speech(
            text,
            encoded,
            num_steps=self.num_steps,
            t_shift=self.t_shift,
            speed=speed,
            return_smooth=self.return_smooth,
        )
        wav = audio.cpu().numpy()
        if wav.ndim == 2:
            # Keep only the first row of a 2-D result.
            wav = wav[0]

        # Trim hallucinated preamble from start (seconds)
        if self._trim_start > 0:
            trim_samples = int(self._trim_start * self._SAMPLE_RATE)
            # Never trim away the entire clip.
            if trim_samples < len(wav):
                wav = wav[trim_samples:]

        out_path = Path(output_path)
        out_path.parent.mkdir(parents=True, exist_ok=True)
        if out_path.suffix.lower() in (".mp3", ".ogg"):
            return self._write_compressed(wav, out_path)
        sf.write(str(out_path), wav, self._SAMPLE_RATE)
        return out_path

    def speak(
        self,
        text: str,
        output_path: Optional[str] = None,
        play: bool = True,
        voice: Optional[str] = None,
    ) -> dict:
        """Synthesize and optionally play. Uses .wav temp files (not .mp3).

        Args:
            text: Text to speak.
            output_path: Where to keep the audio. When omitted, a temporary
                ``.wav`` file is used and its path is not reported.
            play: Whether to play the result through ``self._play_audio``.
            voice: Reference clip to clone, as a file path or the name of a
                clip listed by :meth:`get_voices`. It is stored in
                ``self.config["voice"]`` and so stays in effect for later
                calls.

        Returns:
            Dict with ``success``, ``played`` and ``play_requested``, plus
            ``path`` when ``output_path`` was given.
        """
        import tempfile

        uses_temp_file = not output_path
        if output_path:
            out_path = Path(output_path)
        else:
            fd, tmp_path = tempfile.mkstemp(suffix=".wav", prefix="scitex_tts_")
            os.close(fd)
            out_path = Path(tmp_path)

        if voice:
            self.config["voice"] = voice

        try:
            result_path = self.synthesize(text, str(out_path))
        except Exception:
            # Do not leave an empty temp file behind when synthesis fails.
            if uses_temp_file:
                out_path.unlink(missing_ok=True)
            raise

        played = False
        if play:
            # NOTE(review): the temp file is kept after playback because it is
            # not visible here whether _play_audio blocks until done — confirm
            # before adding cleanup.
            played = self._play_audio(result_path)
        elif uses_temp_file:
            # Nothing will read an unreported temp file that is not played.
            Path(result_path).unlink(missing_ok=True)

        result = {"success": True, "played": played, "play_requested": play}
        if output_path:
            result["path"] = result_path
        return result

    def get_voices(self) -> List[dict]:
        """Get available voices (reference audio files).

        Returns:
            One entry per reference clip found in the default directories
            (``id`` is the file path and can be passed to :meth:`speak` as
            ``voice``). When none exist, a single ``"default"`` entry of type
            ``"generated"`` is returned.
        """
        voices = []
        for dir_path in self._DEFAULT_REF_DIRS:
            directory = Path(dir_path).expanduser()
            voices.extend(
                {
                    "name": clip.stem,
                    "id": str(clip),
                    "type": "reference_audio",
                }
                for clip in self._reference_files(directory)
            )
        if not voices:
            voices.append(
                {
                    "name": "default",
                    "id": "default",
                    "type": "generated",
                }
            )
        return voices
# EOF