""" Voice cloning utility for Coqui TTS XTTS v2 with a cached, reusable model service. - Provides a CLI for one-off synthesis - Exposes a clone_voice() API that reuses a loaded model across calls - Exposes warm_model() and is_model_loaded() for backend progress integration """ import argparse import os import sys import threading from typing import Optional try: import torch _HAS_CUDA = torch.cuda.is_available() except Exception: torch = None _HAS_CUDA = False try: from torch.serialization import add_safe_globals except Exception: add_safe_globals = None try: from TTS.config.shared_configs import BaseDatasetConfig except Exception: BaseDatasetConfig = None try: from TTS.tts.configs.xtts_config import XttsConfig except Exception: XttsConfig = None try: from TTS.tts.models.xtts import XttsAudioConfig except Exception: XttsAudioConfig = None from TTS.api import TTS MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2" def _collect_safe_globals(): safe_classes = [] for cls in (BaseDatasetConfig, XttsConfig, XttsAudioConfig): if cls: safe_classes.append(cls) try: from TTS.tts.models.xtts import XttsArgs # type: ignore safe_classes.append(XttsArgs) except Exception: pass return safe_classes class ModelService: """Thread-safe, reusable XTTS model service.""" def __init__(self, device: Optional[str] = None) -> None: self.device = device or ("cuda" if _HAS_CUDA else "cpu") self._tts = None self._load_lock = threading.Lock() def _register_safe_globals(self) -> None: if not add_safe_globals: return safe_classes = _collect_safe_globals() if not safe_classes: return try: add_safe_globals(safe_classes) print(f"[INFO] Registered safe globals: {[c.__name__ for c in safe_classes]}") except Exception as e: print(f"[WARN] Could not register safe globals: {e}") def load(self) -> None: if self._tts is not None: return with self._load_lock: if self._tts is not None: return print(f"[INFO] Loading model '{MODEL_NAME}' on device: {self.device} ...", flush=True) self._register_safe_globals() self._tts = TTS(MODEL_NAME).to(self.device) @property def tts(self): if self._tts is None: self.load() return self._tts def tts_to_file(self, *, text: str, speaker_wav: str, language: str, file_path: str) -> None: if not os.path.isfile(speaker_wav): raise FileNotFoundError(f"Reference voice file not found: {speaker_wav}") os.makedirs(os.path.dirname(file_path) or ".", exist_ok=True) print(f"[INFO] Generating audio => {file_path}", flush=True) self.tts.tts_to_file( text=text, speaker_wav=speaker_wav, language=language, file_path=file_path, ) # Global cache of services per device _SERVICES: dict[str, ModelService] = {} _SERVICES_LOCK = threading.Lock() def get_service(device: Optional[str] = None) -> ModelService: key = (device or ("cuda" if _HAS_CUDA else "cpu")).lower() with _SERVICES_LOCK: svc = _SERVICES.get(key) if svc is None: svc = ModelService(key) svc.load() _SERVICES[key] = svc return svc def is_model_loaded(device: Optional[str] = None) -> bool: """Return True if the model service for the given device is present and loaded.""" key = (device or ("cuda" if _HAS_CUDA else "cpu")).lower() with _SERVICES_LOCK: svc = _SERVICES.get(key) return bool(svc and getattr(svc, "_tts", None) is not None) def warm_model(device: Optional[str] = None) -> None: """Ensure the model for the given device is loaded into memory.""" svc = get_service(device) svc.load() def clone_voice(text: str, speaker_wav: str, language: str, output: str, device: Optional[str] = None) -> None: """Clone a voice using a cached XTTS v2 model and synthesize text to a WAV file. This function is thread-safe and reuses a single model instance per device across repeated calls in the same process (e.g., a Flask app). """ svc = get_service(device) svc.tts_to_file(text=text, speaker_wav=speaker_wav, language=language, file_path=output) print("[SUCCESS] Done.") def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Clone a voice with Coqui TTS XTTS v2 and synthesize text to a WAV file.", ) parser.add_argument("--text", "-t", required=True, help="Text to synthesize.") parser.add_argument("--speaker_wav", "-s", required=True, help="Path to the reference voice WAV file.") parser.add_argument("--language", "-l", default="en", help="Target language code (default: en).") parser.add_argument("--output", "-o", default="output.wav", help="Output WAV file path (default: output.wav).") parser.add_argument( "--device", "-d", choices=["cpu", "cuda"], help="Execution device. Defaults to CUDA if available, otherwise CPU.", ) return parser.parse_args() if __name__ == "__main__": args = parse_args() try: clone_voice( text=args.text, speaker_wav=args.speaker_wav, language=args.language, output=args.output, device=args.device, ) except Exception as e: print(f"[ERROR] {e}", file=sys.stderr) sys.exit(1)