#!/usr/bin/env python3
# Timestamp: "2026-03-26 (ywatanabe)"
# File: scitex-audio/src/scitex_audio/_stt.py
"""
Speech-to-Text transcription using whisper.cpp.
Pipeline: audio file -> ffmpeg convert to WAV -> whisper-cli transcribe
Backends (discovery order):
1. Custom path via SCITEX_AUDIO_WHISPER_CLI env var (if it points to an existing file)
2. whisper-cli in PATH
3. ~/.emacs.d/.cache/whisper.cpp/build/bin/whisper-cli
Models (discovery order):
1. SCITEX_AUDIO_WHISPER_MODEL env var (if it points to an existing file)
2. ~/.emacs.d/.cache/whisper.cpp/models/ggml-{model}.bin
3. ~/.local/share/whisper-cli/ggml-{model}.bin
4. ~/.local/share/whisper.cpp/models/ggml-{model}.bin
"""
from __future__ import annotations
import os
import re
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Optional
__all__ = ["transcribe", "find_whisper_cli", "find_whisper_model"]
# Model used when the caller does not name one; "tiny" is fast enough for
# interactive use.
DEFAULT_MODEL = "tiny"

# Directories scanned, in priority order, for ``ggml-<model>.bin`` files.
_MODEL_DIRS = [
    Path.home().joinpath(".emacs.d", ".cache", "whisper.cpp", "models"),
    Path.home().joinpath(".local", "share", "whisper-cli"),
    Path.home().joinpath(".local", "share", "whisper.cpp", "models"),
]

# Known install locations of the whisper-cli binary.
_CLI_PATHS = [
    Path.home().joinpath(
        ".emacs.d", ".cache", "whisper.cpp", "build", "bin", "whisper-cli"
    ),
]
[docs]
def find_whisper_cli() -> Optional[str]:
    """Locate the whisper-cli executable.

    Candidates are tried in this order and the first hit wins:

    1. ``SCITEX_AUDIO_WHISPER_CLI`` environment variable, used only when it
       names an existing file.
    2. ``whisper-cli`` resolved through ``PATH``.
    3. The entries of ``_CLI_PATHS``.

    Returns
    -------
    Path to whisper-cli as a string, or None if no candidate exists.
    """
    override = os.environ.get("SCITEX_AUDIO_WHISPER_CLI")
    if override and Path(override).is_file():
        return override

    on_path = shutil.which("whisper-cli")
    if on_path:
        return on_path

    # First known location that actually exists, else None.
    return next((str(path) for path in _CLI_PATHS if path.is_file()), None)
[docs]
def find_whisper_model(model: str = DEFAULT_MODEL) -> Optional[str]:
    """Locate the ggml file for a whisper model.

    The ``SCITEX_AUDIO_WHISPER_MODEL`` environment variable takes precedence
    when it names an existing file (regardless of ``model``). Otherwise each
    directory in ``_MODEL_DIRS`` is checked, in order, for
    ``ggml-{model}.bin``.

    Args:
        model: Model name (tiny, base, small, medium, large-v3-turbo, etc.)

    Returns
    -------
    Path to model file as a string, or None if not found.
    """
    override = os.environ.get("SCITEX_AUDIO_WHISPER_MODEL")
    if override and Path(override).is_file():
        return override

    wanted = f"ggml-{model}.bin"
    candidates = (folder / wanted for folder in _MODEL_DIRS)
    return next((str(hit) for hit in candidates if hit.is_file()), None)
[docs]
def available_models() -> list[str]:
    """List whisper models present in the known model directories.

    Every existing directory in ``_MODEL_DIRS`` is scanned for regular files
    named ``ggml-<name>.bin``; ``<name>`` is reported as the model name.
    Only regular files count, matching what ``find_whisper_model`` accepts.

    Returns
    -------
    Sorted list of unique model names (e.g., ["base", "medium", "tiny"]).
    """
    # The pattern is anchored at "ggml-", so files such as
    # "for-tests-ggml-tiny.bin" never match and need no separate filter.
    pattern = re.compile(r"^ggml-(.+)\.bin$")
    models = set()
    for d in _MODEL_DIRS:
        if not d.is_dir():
            continue
        for f in d.iterdir():
            m = pattern.match(f.name)
            # Skip directories that merely look like model files.
            if m and f.is_file():
                models.add(m.group(1))
    return sorted(models)
def _convert_to_wav(input_path: str, output_path: str) -> None:
"""Convert audio file to WAV format using ffmpeg.
Args:
input_path: Source audio file.
output_path: Destination WAV file.
Raises
------
RuntimeError: If ffmpeg is not available or conversion fails.
"""
ffmpeg = shutil.which("ffmpeg")
if not ffmpeg:
raise RuntimeError("ffmpeg not found. Install with: sudo apt install ffmpeg")
result = subprocess.run(
[ffmpeg, "-y", "-i", input_path, output_path],
capture_output=True,
text=True,
timeout=30,
)
if result.returncode != 0:
raise RuntimeError(f"ffmpeg conversion failed: {result.stderr}")
def _parse_whisper_output(stdout: str) -> list[dict]:
"""Parse whisper-cli output into segments.
Args:
stdout: Raw whisper-cli stdout.
Returns
-------
List of dicts with keys: start, end, text.
"""
segments = []
pattern = re.compile(
r"\[(\d{2}:\d{2}:\d{2}\.\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2}\.\d{3})\]\s*(.*)"
)
for line in stdout.strip().splitlines():
m = pattern.match(line.strip())
if m:
segments.append(
{
"start": m.group(1),
"end": m.group(2),
"text": m.group(3).strip(),
}
)
return segments
[docs]
def transcribe(
    audio_path: str,
    language: Optional[str] = "ja",
    model: str = DEFAULT_MODEL,
    whisper_cli: Optional[str] = None,
    model_path: Optional[str] = None,
) -> dict:
    """Transcribe audio file to text using whisper.cpp.

    Non-WAV input is first converted to a temporary WAV file with ffmpeg;
    the temporary file is always removed before returning. Failures are
    reported through the returned dict rather than raised.

    Args:
        audio_path: Path to audio file (any format ffmpeg supports).
        language: Language code (e.g., "ja", "en"). None (or "") asks
            whisper-cli to auto-detect the language.
        model: Whisper model name (tiny, base, small, medium, large-v3-turbo).
        whisper_cli: Override path to whisper-cli binary.
        model_path: Override path to model file.

    Returns
    -------
    On success, a dict with keys: success (True), text, segments, language,
    model, audio_path. On failure, a dict with keys: success (False), error.
    """
    audio_path = str(Path(audio_path).resolve())

    # Find whisper-cli
    cli = whisper_cli or find_whisper_cli()
    if not cli:
        return {
            "success": False,
            "error": (
                "whisper-cli not found. Install whisper.cpp or set "
                "SCITEX_AUDIO_WHISPER_CLI environment variable."
            ),
        }

    # Find model
    mdl = model_path or find_whisper_model(model)
    if not mdl:
        return {
            "success": False,
            "error": (
                f"Whisper model '{model}' not found. Download with: "
                f"cd ~/.emacs.d/.cache/whisper.cpp/models && "
                f"bash download-ggml-model.sh {model}"
            ),
        }

    # Fail early with a clear message instead of relying on the exit status
    # of ffmpeg / whisper-cli for a missing input.
    if not Path(audio_path).is_file():
        return {"success": False, "error": f"Audio file not found: {audio_path}"}

    tmp_wav = None
    try:
        # Convert to WAV if needed
        wav_path = audio_path
        if not audio_path.lower().endswith(".wav"):
            fd, tmp_wav = tempfile.mkstemp(suffix=".wav", prefix="scitex_stt_")
            os.close(fd)
            wav_path = tmp_wav
            try:
                _convert_to_wav(audio_path, wav_path)
            except subprocess.TimeoutExpired:
                return {"success": False, "error": "ffmpeg conversion timed out"}
            except (RuntimeError, OSError) as e:
                return {"success": False, "error": str(e)}

        # whisper-cli falls back to its own default language when -l is
        # omitted, so auto-detection has to be requested explicitly.
        cmd = [cli, "-m", mdl, "-f", wav_path, "-l", language or "auto"]
        try:
            result = subprocess.run(
                cmd,
                capture_output=True,
                text=True,
                # Decode as UTF-8 regardless of locale; tolerate partial
                # multibyte sequences instead of raising.
                encoding="utf-8",
                errors="replace",
                timeout=120,
            )
        except subprocess.TimeoutExpired:
            return {
                "success": False,
                "error": "Transcription timed out (120s limit)",
            }
        except OSError as e:
            return {"success": False, "error": f"Failed to run whisper-cli: {e}"}

        if result.returncode != 0:
            return {
                "success": False,
                "error": f"whisper-cli failed (exit {result.returncode}): {result.stderr}",
            }

        # Parse output
        segments = _parse_whisper_output(result.stdout)
        full_text = " ".join(s["text"] for s in segments)
        return {
            "success": True,
            "text": full_text,
            "segments": segments,
            "language": language,
            "model": model,
            "audio_path": audio_path,
        }
    finally:
        # Covers every exit path above, including conversion failures.
        if tmp_wav:
            _safe_unlink(tmp_wav)
def _safe_unlink(path: str) -> None:
"""Remove file, ignoring errors."""
try:
os.unlink(path)
except Exception:
pass
# EOF