import gradio as gr
import os
import tempfile
import requests
import soundfile as sf
import json
import shutil
from pathlib import Path
import numpy as np
import re
from typing import Generator

# ===== NEUTTS IMPORTS =====
try:
    # Try multiple import approaches for NeuTTS
    try:
        # Approach 1: Direct import from the installed package
        from neutts import NeuTTSAir
    except ImportError:
        try:
            # Approach 2: Import after extending sys.path
            import sys
            sys.path.append('/usr/local/lib/python3.10/site-packages')
            from neutts import NeuTTSAir
        except ImportError:
            # Approach 3: Build the class from its components directly
            import torch
            from phonemizer.backend import EspeakBackend
            import perth
            from neucodec import NeuCodec
            from llama_cpp import Llama

            # Define NeuTTSAir class manually
            class NeuTTSAir:
                def __init__(self, backbone_repo="neuphonic/neutts-air-q4-gguf", backbone_device="cpu",
                             codec_repo="neuphonic/neucodec", codec_device="cpu"):
                    self.sample_rate = 24_000
                    self.max_context = 2048
                    self.hop_length = 480

                    print("🧠 Loading phonemizer...")
                    self.phonemizer = EspeakBackend(language="en-us", preserve_punctuation=True, with_stress=True)

                    self._load_backbone(backbone_repo, backbone_device)
                    self._load_codec(codec_repo, codec_device)

                    self.watermarker = perth.PerthImplicitWatermarker()
                    print("✅ NeuTTS-Air initialized!")

                def _load_backbone(self, backbone_repo, backbone_device):
                    print(f"🔧 Loading Q4 GGUF backbone: {backbone_repo}")
                    self.backbone = Llama.from_pretrained(
                        repo_id=backbone_repo,
                        filename="*.gguf",
                        n_ctx=self.max_context,
                        n_gpu_layers=0,
                        verbose=False,
                        use_mlock=False,
                        n_threads=2,
                        low_vram=True
                    )

                def _load_codec(self, codec_repo, codec_device):
                    print(f"🔧 Loading codec: {codec_repo}")
                    self.codec = NeuCodec.from_pretrained(codec_repo)
                    self.codec.eval().to(codec_device)

                def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
                    """Generate watermarked speech for `text`, conditioned on the reference voice."""
                    output_str = self._infer_gguf(ref_codes, ref_text, text)
                    wav = self._decode(output_str)
                    watermarked_wav = self.watermarker.apply_watermark(wav, sample_rate=24000)
                    return watermarked_wav

                def encode_reference(self, ref_audio_path: str | Path):
                    """Encode a reference WAV into codec tokens used to condition generation."""
                    import librosa
                    wav, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
                    wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)
                    with torch.no_grad():
                        ref_codes = self.codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)
                    return ref_codes.numpy() if isinstance(ref_codes, torch.Tensor) else ref_codes

                def _decode(self, codes: str):
                    # Extract <|speech_N|> token ids from the LLM output and decode them to audio
                    speech_ids = [int(num) for num in re.findall(r"<\|speech_(\d+)\|>", codes)]
                    if len(speech_ids) > 0:
                        with torch.no_grad():
                            codes_tensor = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(self.codec.device)
                            recon = self.codec.decode_code(codes_tensor).cpu().numpy()
                        return recon[0, 0, :]
                    else:
                        raise ValueError("No speech tokens found")

                def _to_phones(self, text: str) -> str:
                    phones = self.phonemizer.phonemize([text])
                    return " ".join(phones[0].split())

                def _infer_gguf(self, ref_codes, ref_text: str, input_text: str) -> str:
                    # Phonemize both the reference text and the target text
                    ref_text_phones = self._to_phones(ref_text)
                    input_text_phones = self._to_phones(input_text)
                    if isinstance(ref_codes, (torch.Tensor, np.ndarray)):
                        ref_codes = ref_codes.tolist()
                    codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
                    prompt = (
                        f"user: Convert the text to speech:"
                        f"<|TEXT_PROMPT_START|>{ref_text_phones} {input_text_phones}<|TEXT_PROMPT_END|>\n"
                        f"assistant:<|SPEECH_GENERATION_START|>{codes_str}"
                    )
                    output = self.backbone(
                        prompt,
                        max_tokens=self.max_context,
                        temperature=1.0,
                        top_k=50,
                        stop=["<|SPEECH_GENERATION_END|>"],
                        echo=False
                    )
                    return output["choices"][0]["text"]

    NEUTTS_AVAILABLE = True
    print("✅ NeuTTS-Air loaded successfully!")
except Exception as e:
    NEUTTS_AVAILABLE = False
    print(f"❌ NeuTTS-Air import failed: {e}")

# ===== CONFIGURATION =====
CONFIG_FILE = "voice_profiles.json"
SAMPLE_DIR = "samples"
os.makedirs(SAMPLE_DIR, exist_ok=True)

# ===== VOICE PROFILE MANAGEMENT =====
class VoiceProfileManager:
    def __init__(self, config_file=CONFIG_FILE):
        self.config_file = config_file
        self.profiles = self.load_profiles()

    def load_profiles(self):
        if os.path.exists(self.config_file):
            with open(self.config_file, 'r') as f:
                return json.load(f)
        # Fall back to the bundled default voices
        return {
            "dave": {
                "audio_path": "samples/dave.wav",
                "text": "Hey there, this is Dave speaking.",
                "created_at": "default"
            },
            "andrea": {
                "audio_path": "samples/andrea.wav",
                "text": "Hello, my name is Andrea.",
                "created_at": "default"
            }
        }

    def save_profiles(self):
        with open(self.config_file, 'w') as f:
            json.dump(self.profiles, f, indent=2)

    def add_profile(self, name, audio_path, text):
        self.profiles[name] = {
            "audio_path": audio_path,
            "text": text,
            "created_at": str(np.datetime64('now'))
        }
        self.save_profiles()
        return f"✅ Voice profile '{name}' saved!"

    def get_profile(self, name):
        return self.profiles.get(name)

    def list_profiles(self):
        return list(self.profiles.keys())

# ===== SAMPLE MANAGEMENT =====
def download_default_samples():
    """Download the default sample voices if they are not already present."""
    samples = {
        "dave": {
            "audio": "https://github.com/neuphonic/neutts-air/raw/main/samples/dave.wav",
            "text": "Hey there, this is Dave speaking."
        },
        "andrea": {
            "audio": "https://github.com/neuphonic/neutts-air/raw/main/samples/andrea.wav",
            "text": "Hello, my name is Andrea."
        }
    }
    for name, urls in samples.items():
        audio_path = f"{SAMPLE_DIR}/{name}.wav"
        text_path = f"{SAMPLE_DIR}/{name}.txt"
        if not os.path.exists(audio_path):
            print(f"📥 Downloading {name} sample...")
            try:
                # Download audio
                response = requests.get(urls["audio"], timeout=60)
                response.raise_for_status()  # Check for download errors
                with open(audio_path, 'wb') as f:
                    f.write(response.content)
                # Write reference text
                with open(text_path, 'w') as f:
                    f.write(urls["text"])
                print(f"✅ Finished downloading {name}.")
            except requests.exceptions.RequestException as e:
                print(f"❌ Failed to download {name}: {e}")
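
# ===== ILLUSTRATIVE USAGE SKETCH =====
# A minimal sketch of how the pieces above fit together, added for clarity: it is
# not part of the app's UI wiring, and the helper name `_demo_synthesis` is
# hypothetical. It downloads the bundled reference voices, encodes one as codec
# tokens, and synthesizes new speech. Nothing here runs at import time.
def _demo_synthesis(text="Hello from NeuTTS-Air.", voice="dave", out_path="demo_output.wav"):
    if not NEUTTS_AVAILABLE:
        raise RuntimeError("NeuTTS-Air is not available in this environment.")
    download_default_samples()                      # fetch dave/andrea samples if missing
    manager = VoiceProfileManager()
    profile = manager.get_profile(voice)            # {"audio_path": ..., "text": ...}
    tts = NeuTTSAir()                               # loads phonemizer, GGUF backbone, and codec (slow)
    ref_codes = tts.encode_reference(profile["audio_path"])
    wav = tts.infer(text, ref_codes, profile["text"])
    sf.write(out_path, wav, tts.sample_rate)        # 24 kHz mono output
    return out_path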