Source code for scitex_audio._engines._elevenlabs_engine

#!/usr/bin/env python3
# Timestamp: "2025-12-11 (ywatanabe)"
# File: /home/ywatanabe/proj/scitex-code/src/scitex/audio/engines/elevenlabs_engine.py
# ----------------------------------------

"""
ElevenLabs TTS backend - High quality, requires API key and payment.
"""

from __future__ import annotations

import logging
import os
from pathlib import Path
from typing import List, Optional

from ._base import BaseTTS

__all__ = ["ElevenLabsTTS"]



[docs]
class ElevenLabsTTS(BaseTTS):
    """ElevenLabs TTS backend.

    High-quality voices, but requires an API key and has usage costs.

    The API key is resolved in this order (first non-empty value wins):

    1. the ``api_key`` constructor argument
    2. the ``SCITEX_AUDIO_ELEVENLABS_API_KEY`` environment variable
    3. the ``ELEVENLABS_API_KEY`` environment variable
    """

    # Short, lower-case preset names mapped to ElevenLabs voice IDs.
    VOICES = {
        "adam": "pNInz6obpgDQGcFmaJgB",
        "sarah": "EXAVITQu4vr4xnSDxMaL",
        "laura": "FGY2WhTYpPnrIDTdsKH5",
        "charlie": "IKne3meq5aSn9XLyUdCD",
        "george": "JBFqnCBsd6RMkjVDRZzb",
        "callum": "N2lVS1w4EtoT3dr4eOWO",
        "river": "SAz9YHcvj6GT2YYXdXww",
        "liam": "TX3LPaxmHKxFdv7VOQHJ",
        "alice": "Xb7hH8MSUJpSbSDYk0k2",
        "matilda": "XrExE9yKIg1WjnnlVkGX",
        "will": "bIHbv24MWmeRgasZH58o",
        "jessica": "cgSgspJ2msm6clMCkdW9",
        "eric": "cjVigY5qzO86Huf0OWal",
        "bella": "hpp4J3VqNfWAUOO0d1Us",
        "chris": "iP95p4xoKVk53GoZ742B",
        "brian": "nPczCjzI2devNBz1zQrb",
        "daniel": "onwK4e9ZLuTAKqWW03F9",
        "lily": "pFZP5JQG7iQjIQuC4Bku",
        "roger": "CwhRBWXzGAHq8TQ4Fs17",
        "harry": "SOYHLrjzK2X1ezoPC6cr",
        "rachel": "21m00Tcm4TlvDq8ikWAM",
        "antoni": "ErXwobaYiN019PkySvjV",
        "domi": "AZnzlk1XvdvUeBnXmlld",
        "elli": "MF3mGyEYCl7XYWbV9V6O",
        "josh": "TxGEqnHWrfWFTfGW9XjX",
        "sam": "yoZ06aMxZJJ28mfd3POQ",
    }

    # ElevenLabs API speed limits
    MIN_SPEED = 0.7
    MAX_SPEED = 1.2

    def __init__(
        self,
        api_key: Optional[str] = None,
        voice: str = "adam",
        model_id: str = "eleven_multilingual_v2",
        stability: float = 0.5,
        similarity_boost: float = 0.75,
        speed: float = 1.0,
        client=None,
        **kwargs,
    ) -> None:
        """Configure the backend; no network access happens here.

        Args:
            api_key: ElevenLabs API key. When falsy, the environment
                variables listed in the class docstring are consulted.
            voice: Default voice, either a key of ``VOICES``
                (case-insensitive) or a raw ElevenLabs voice ID.
            model_id: Model identifier sent with every synthesis request.
            stability: ``stability`` voice setting sent with each request.
            similarity_boost: ``similarity_boost`` voice setting sent with
                each request.
            speed: Speech speed; clamped into
                ``[MIN_SPEED, MAX_SPEED]`` before being stored.
            client: Optional pre-built client object (testing / custom
                transport). When ``None``, the ElevenLabs SDK client is
                created lazily on first access of ``client``.
            **kwargs: Forwarded unchanged to ``BaseTTS.__init__``.
        """
        super().__init__(**kwargs)
        self.api_key = (
            api_key
            or os.environ.get("SCITEX_AUDIO_ELEVENLABS_API_KEY")
            or os.environ.get("ELEVENLABS_API_KEY")
        )
        self.voice = voice
        self.model_id = model_id
        self.stability = stability
        self.similarity_boost = similarity_boost
        # Clamp speed to ElevenLabs API limits (0.7-1.2)
        self.speed = max(self.MIN_SPEED, min(self.MAX_SPEED, speed))
        # Optional injected client (testing / custom transport). When None,
        # the real ElevenLabs SDK is lazy-loaded on first `client` access.
        self._client = client

    @property
    def name(self) -> str:
        """Identifier of this backend (``"elevenlabs"``)."""
        return "elevenlabs"

    @property
    def requires_api_key(self) -> bool:
        """Always ``True``: this backend needs an API key."""
        return True

    @property
    def requires_internet(self) -> bool:
        """Always ``True``: synthesis is performed by a remote service."""
        return True

    @property
    def client(self):
        """Return the ElevenLabs client, creating it on first access.

        An injected client (see ``__init__``) is returned as-is.

        Raises:
            ImportError: If the ``elevenlabs`` package cannot be imported.
        """
        if self._client is None:
            # Keep the try body to the import alone so that only a missing
            # package is reported as "not installed".
            try:
                from elevenlabs.client import ElevenLabs
            except ImportError as err:
                raise ImportError(
                    "elevenlabs package not installed. "
                    "Install with: pip install elevenlabs"
                ) from err

            self._client = ElevenLabs(api_key=self.api_key)
        return self._client

    def _get_voice_id(self, voice: Optional[str] = None) -> str:
        """Resolve a voice name to its ElevenLabs voice ID.

        Args:
            voice: Preset name or raw voice ID. When falsy, the instance's
                default ``self.voice`` is used.

        Returns:
            The ID from ``VOICES`` when the lower-cased name is a known
            preset; otherwise the input unchanged (treated as an ID).
        """
        v = voice or self.voice
        normalized = v.lower()
        return self.VOICES.get(normalized, v)

    def synthesize(self, text: str, output_path: str) -> Path:
        """Synthesize ``text`` with the ElevenLabs API and write it as MP3.

        Args:
            text: Text to convert to speech.
            output_path: Destination file path; missing parent directories
                are created.

        Returns:
            ``output_path`` as a ``Path``.
        """
        # NOTE(review): `self.config` is not defined in this class;
        # presumably it is provided by BaseTTS - confirm.
        voice_id = self._get_voice_id(self.config.get("voice"))

        out_path = Path(output_path)
        # Prepare the destination before the (billable) API request so a
        # missing directory cannot fail the write after audio was generated.
        out_path.parent.mkdir(parents=True, exist_ok=True)

        audio = self.client.text_to_speech.convert(
            text=text,
            voice_id=voice_id,
            model_id=self.model_id,
            voice_settings={
                "stability": self.stability,
                "similarity_boost": self.similarity_boost,
                "speed": self.speed,
            },
            output_format="mp3_44100_128",
        )

        # The response is consumed chunk by chunk and streamed to disk.
        with open(out_path, "wb") as f:
            for chunk in audio:
                f.write(chunk)

        return out_path

    def get_voices(self) -> List[dict]:
        """List the voices known to this backend.

        Returns:
            One dict per voice. Presets from ``VOICES`` come first with
            keys ``name``, ``id`` and ``type="preset"``. Voices reported by
            the API follow with ``type="custom"`` and an extra ``labels``
            key. If the API lookup fails, only the voices collected so far
            are returned.
        """
        # Start with preset voices
        voices = [
            {"name": name, "id": vid, "type": "preset"}
            for name, vid in self.VOICES.items()
        ]

        # Best-effort lookup of account voices: any failure (missing
        # package, missing key, network error, ...) must not prevent the
        # presets from being returned, so it is logged instead of raised.
        # NOTE(review): the API may also report voices whose IDs are already
        # in VOICES, which would then appear twice - confirm.
        try:
            response = self.client.voices.get_all()
            for v in response.voices:
                voices.append(
                    {
                        "name": v.name,
                        "id": v.voice_id,
                        "type": "custom",
                        "labels": v.labels,
                    }
                )
        except Exception:
            logging.getLogger(__name__).debug(
                "Could not fetch voices from the ElevenLabs API; "
                "returning the voices collected so far.",
                exc_info=True,
            )

        return voices




# EOF