Source code for scitex_audio._stt

#!/usr/bin/env python3
# Timestamp: "2026-03-26 (ywatanabe)"
# File: scitex-audio/src/scitex_audio/_stt.py

"""
Speech-to-Text transcription using whisper.cpp.

Pipeline: audio file -> ffmpeg convert to WAV -> whisper-cli transcribe

Backends (discovery order):
    1. Custom path via SCITEX_AUDIO_WHISPER_CLI env var
    2. whisper-cli in PATH
    3. ~/.emacs.d/.cache/whisper.cpp/build/bin/whisper-cli

Models (discovery order):
    1. SCITEX_AUDIO_WHISPER_MODEL env var
    2. ~/.emacs.d/.cache/whisper.cpp/models/ggml-{model}.bin
    3. ~/.local/share/whisper-cli/ggml-{model}.bin
    4. ~/.local/share/whisper.cpp/models/ggml-{model}.bin
"""

from __future__ import annotations

import os
import re
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Optional

__all__ = ["transcribe", "find_whisper_cli", "find_whisper_model"]

# Model used when the caller does not name one — tiny is fast enough for
# interactive use
DEFAULT_MODEL = "tiny"

# Directories probed, in this order, for ggml-{model}.bin files
_MODEL_DIRS = [
    Path.home().joinpath(".emacs.d", ".cache", "whisper.cpp", "models"),
    Path.home().joinpath(".local", "share", "whisper-cli"),
    Path.home().joinpath(".local", "share", "whisper.cpp", "models"),
]

# Known install locations of the whisper-cli binary
_CLI_PATHS = [
    Path.home().joinpath(
        ".emacs.d", ".cache", "whisper.cpp", "build", "bin", "whisper-cli"
    ),
]



[docs]
def find_whisper_cli() -> Optional[str]:
    """Locate the whisper-cli executable.

    Lookup order: the SCITEX_AUDIO_WHISPER_CLI environment variable (only
    honoured when it points at an existing file), then ``whisper-cli`` on
    PATH, then the entries of ``_CLI_PATHS``.

    Returns
    -------
        Path to whisper-cli as a string, or None if no candidate exists.
    """
    # Explicit override wins, but only when it names a real file
    override = os.environ.get("SCITEX_AUDIO_WHISPER_CLI")
    if override and Path(override).is_file():
        return override

    # Whatever the shell would resolve
    on_path = shutil.which("whisper-cli")
    if on_path:
        return on_path

    # First known install location that exists, else None
    existing = (str(candidate) for candidate in _CLI_PATHS if candidate.is_file())
    return next(existing, None)




[docs]
def find_whisper_model(model: str = DEFAULT_MODEL) -> Optional[str]:
    """Locate the ggml file for a whisper model.

    The SCITEX_AUDIO_WHISPER_MODEL environment variable takes precedence
    when it points at an existing file (regardless of ``model``); otherwise
    each directory in ``_MODEL_DIRS`` is probed for ``ggml-{model}.bin``.

    Args:
        model: Model name (tiny, base, small, medium, large-v3-turbo, etc.)

    Returns
    -------
        Path to the model file as a string, or None if not found.
    """
    # Explicit override wins, but only when it names a real file
    override = os.environ.get("SCITEX_AUDIO_WHISPER_MODEL")
    if override and Path(override).is_file():
        return override

    # First search directory that actually contains the model file
    model_file = f"ggml-{model}.bin"
    candidates = (directory / model_file for directory in _MODEL_DIRS)
    return next((str(path) for path in candidates if path.is_file()), None)




[docs]
def available_models() -> list[str]:
    """List whisper models installed in the known model directories.

    Every directory in ``_MODEL_DIRS`` that exists is scanned for regular
    files named ``ggml-{name}.bin``. Only regular files are reported, so the
    result agrees with ``find_whisper_model``, which also requires a file.

    Files such as ``for-tests-ggml-tiny.bin`` are excluded because the
    pattern is anchored at ``ggml-``; no separate prefix check is needed.

    Returns
    -------
        Sorted list of unique model names (e.g., ["base", "medium", "tiny"]).
    """
    pattern = re.compile(r"^ggml-(.+)\.bin$")
    models = set()
    for d in _MODEL_DIRS:
        if not d.is_dir():
            continue
        for f in d.iterdir():
            m = pattern.match(f.name)
            # Skip directories (or other non-files) that merely look like models
            if m and f.is_file():
                models.add(m.group(1))
    return sorted(models)



def _convert_to_wav(input_path: str, output_path: str, timeout: float = 30) -> None:
    """Convert an audio file to a whisper-compatible WAV using ffmpeg.

    The output is forced to 16 kHz, mono, 16-bit PCM, which is the format the
    whisper.cpp README asks for. An existing ``output_path`` is overwritten.

    Args:
        input_path: Source audio file.
        output_path: Destination WAV file.
        timeout: Seconds to wait for ffmpeg before giving up.

    Raises
    ------
        RuntimeError: If ffmpeg is not available, cannot be started, times
            out, or exits with a non-zero status.
    """
    ffmpeg = shutil.which("ffmpeg")
    if not ffmpeg:
        raise RuntimeError("ffmpeg not found. Install with: sudo apt install ffmpeg")

    cmd = [
        ffmpeg,
        "-y",
        "-i",
        input_path,
        "-ar",
        "16000",
        "-ac",
        "1",
        "-c:a",
        "pcm_s16le",
        output_path,
    ]
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        # Surface timeouts through the documented exception type so callers
        # that catch RuntimeError can clean up instead of crashing.
        raise RuntimeError(
            f"ffmpeg conversion timed out ({timeout}s limit)"
        ) from exc
    except OSError as exc:
        raise RuntimeError(f"ffmpeg could not be executed: {exc}") from exc

    if result.returncode != 0:
        raise RuntimeError(f"ffmpeg conversion failed: {result.stderr}")


def _parse_whisper_output(stdout: str) -> list[dict]:
    """Extract timestamped segments from whisper-cli output.

    Only lines of the form ``[HH:MM:SS.mmm --> HH:MM:SS.mmm] text`` are
    kept; every other line is ignored.

    Args:
        stdout: Raw whisper-cli stdout.

    Returns
    -------
        List of dicts with keys: start, end, text (timestamps stay strings,
        text is whitespace-stripped).
    """
    timestamped = re.compile(
        r"\[(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})\]\s*(.*)"
    )
    stripped_lines = (raw.strip() for raw in stdout.strip().splitlines())
    # match() yields None for non-segment lines; filter(None, ...) drops them
    hits = filter(None, map(timestamped.match, stripped_lines))
    return [
        {"start": begin, "end": finish, "text": spoken.strip()}
        for begin, finish, spoken in (hit.groups() for hit in hits)
    ]



[docs]
def transcribe(
    audio_path: str,
    language: Optional[str] = "ja",
    model: str = DEFAULT_MODEL,
    whisper_cli: Optional[str] = None,
    model_path: Optional[str] = None,
) -> dict:
    """Transcribe audio file to text using whisper.cpp.

    Non-WAV input is first converted to a temporary WAV file with ffmpeg;
    the temporary file is always removed before returning. Failures are
    reported through the returned dict rather than raised.

    Args:
        audio_path: Path to audio file (any format ffmpeg supports).
        language: Language code (e.g., "ja", "en"). None for auto-detect.
        model: Whisper model name (tiny, base, small, medium, large-v3-turbo).
        whisper_cli: Override path to whisper-cli binary.
        model_path: Override path to model file.

    Returns
    -------
        On success, a dict with keys: success (True), text, segments,
        language, model, audio_path. ``model`` echoes the ``model`` argument
        even when ``model_path`` was used. On failure, a dict with keys:
        success (False) and error.
    """
    audio_path = str(Path(audio_path).resolve())

    # Find whisper-cli
    cli = whisper_cli or find_whisper_cli()
    if not cli:
        return {
            "success": False,
            "error": (
                "whisper-cli not found. Install whisper.cpp or set "
                "SCITEX_AUDIO_WHISPER_CLI environment variable."
            ),
        }

    # Find model
    mdl = model_path or find_whisper_model(model)
    if not mdl:
        return {
            "success": False,
            "error": (
                f"Whisper model '{model}' not found. Download with: "
                f"cd ~/.emacs.d/.cache/whisper.cpp/models && "
                f"bash download-ggml-model.sh {model}"
            ),
        }

    # Fail early with a clear message instead of an opaque tool error
    if not Path(audio_path).exists():
        return {"success": False, "error": f"Audio file not found: {audio_path}"}

    wav_path = audio_path
    tmp_wav = None
    try:
        # Convert to WAV if needed
        if not audio_path.lower().endswith(".wav"):
            fd, tmp_wav = tempfile.mkstemp(suffix=".wav", prefix="scitex_stt_")
            os.close(fd)
            wav_path = tmp_wav
            try:
                _convert_to_wav(audio_path, wav_path)
            except subprocess.TimeoutExpired:
                # The converter's own subprocess timeout is not a RuntimeError
                return {"success": False, "error": "ffmpeg conversion timed out"}
            except (RuntimeError, OSError) as e:
                return {"success": False, "error": str(e)}

        # whisper-cli assumes English when -l is omitted, so auto-detection
        # has to be requested explicitly with "auto".
        cmd = [cli, "-m", mdl, "-f", wav_path, "-l", language or "auto"]

        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                # Decode as UTF-8 regardless of locale; whisper may emit
                # truncated multibyte sequences, so never fail on decoding.
                encoding="utf-8",
                errors="replace",
                timeout=120,
            )
        except subprocess.TimeoutExpired:
            return {
                "success": False,
                "error": "Transcription timed out (120s limit)",
            }
        except OSError as e:
            return {"success": False, "error": f"Failed to run whisper-cli: {e}"}

        if result.returncode != 0:
            return {
                "success": False,
                "error": f"whisper-cli failed (exit {result.returncode}): {result.stderr}",
            }

        # Parse output
        segments = _parse_whisper_output(result.stdout)
        full_text = " ".join(s["text"] for s in segments)

        return {
            "success": True,
            "text": full_text,
            "segments": segments,
            "language": language,
            "model": model,
            "audio_path": audio_path,
        }
    finally:
        # Runs on every exit path, so the temporary WAV never leaks
        if tmp_wav:
            _safe_unlink(tmp_wav)


def _safe_unlink(path: str) -> None:
    """Best-effort file removal: delete ``path`` and swallow any failure."""
    try:
        os.remove(path)
    except Exception:
        # Cleanup must never mask the caller's real result
        return


# EOF