# new-vi-tts / app.py — init source (commit ec7446a), author: Elvin
# System prerequisite: ffmpeg must be installed for audio handling.
# Ubuntu: sudo apt install ffmpeg
# Windows: see https://www.geeksforgeeks.org/how-to-install-ffmpeg-on-windows/
import os
# Pin execution to GPU 0; harmless if running on CPU (just slower).
# NOTE: must be set before any CUDA-using library is imported to take effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Tell it which GPU to use (or ignore if you're CPU-bound and patient!)
from vinorm import TTSnorm # Vietnamese text normalizer (numbers, abbreviations, etc.)
from f5tts_wrapper import F5TTSWrapper # Project-local wrapper around the F5-TTS model
# Base working directory for the repo checkout (Kaggle layout).
default_dir = "/kaggle/working/new-vi-tts"

# --- Config ---
# BUG FIX: these strings previously contained "{default_dir}" verbatim
# (the f-string prefix was missing), so the base directory was never
# interpolated and every path pointed at a literal "{default_dir}" folder.
# Path to the model checkpoint you downloaded from *this* repo.
# MAKE SURE this path points to the actual .pth/.ckpt/.safetensors file!
eraX_ckpt_path = f"{default_dir}/infer/model/model_612000.safetensors"  # <-- CHANGE THIS!
# Path to the reference audio of the voice you want to clone.
ref_audio_path = f"{default_dir}/infer/audio.wav"  # <-- CHANGE THIS!
# Path to the vocab file from this repo.
vocab_file = f"{default_dir}/infer/model/vocab.txt"  # <-- CHANGE THIS!
# Where to save the generated sound.
output_dir = f"{default_dir}/audio"

# --- Texts ---
# Transcript matching the reference audio (helps the model learn the voice).
# Make sure it matches the reference audio exactly!
ref_text = "Họ có quan điểm trái ngược, bạn có thể mỉm cười với họ trong nhà thờ, nhưng ngoài đời thì khó lòng mà làm được. Những suy nghĩ này có quen thuộc với bạn không?"
# --- Let's Go! ---
# Build the TTS engine and register the reference voice once; `tts` is
# then reused by speak() below for every generation call.
print("Initializing the TTS engine... (Might take a sec)")
tts = F5TTSWrapper(
    vocoder_name="vocos",  # Use the Vocos vocoder backend
    ckpt_path=eraX_ckpt_path,
    vocab_file=vocab_file,
    # ALWAYS False: the checkpoint was converted from .pt to safetensors
    # and any EMA weights were already folded in during conversion.
    use_ema=False,
)
# Normalize the reference transcript the same way generated text will be
# normalized, so the model sees consistent input.
ref_text_norm = TTSnorm(ref_text)
# Prepare the output folder (no-op if it already exists).
os.makedirs(output_dir, exist_ok=True)
print("Processing the reference voice...")
# Feed the model the reference voice ONCE.
# Provide ref_text for better quality, or set ref_text="" to use Whisper
# for auto-transcription (if installed).
tts.preprocess_reference(
    ref_audio_path=ref_audio_path,
    ref_text=ref_text_norm,
    clip_short=True,  # Keeps reference audio to a manageable length (~12s)
)
print(f"Reference audio duration used: {tts.get_current_audio_length():.2f} seconds")
# --- Generate New Speech ---
print("Generating new speech with the cloned voice...")
def speak(text, filename):
    """Normalize Vietnamese *text* and synthesize it with the cloned voice.

    Output is written to ``{output_dir}/{filename}_{i}.wav`` (one file per
    sentence chunk; currently the whole text is one chunk).

    Args:
        text: Raw (unnormalized) Vietnamese text to speak.
        filename: Base name (no extension) for the output wav file(s).
    """
    # Normalize the text we want to speak (same pipeline as the reference).
    text_norm = TTSnorm(text)
    # You can generate multiple sentences easily:
    # just add more normalized strings to this list.
    sentences = [text_norm]
    for i, sentence in enumerate(sentences):
        # BUG FIX: the output name previously used a hard-coded
        # "(unknown)" placeholder and ignored the `filename` argument,
        # so every call overwrote the same wav file.
        output_path = os.path.join(output_dir, f"{filename}_{i+1}.wav")
        # THE ACTUAL GENERATION HAPPENS HERE!
        tts.generate(
            text=sentence,
            output_path=output_path,
            nfe_step=20,               # Denoising steps; more = slower, maybe better (default 32)
            cfg_strength=2.0,          # How strongly to stick to the reference voice style (default 2.0)
            speed=1.0,                 # Speaking-rate multiplier (default 1.0)
            cross_fade_duration=0.15,  # Smooths transitions if text is split into chunks (default 0.15)
        )
        print(f"Boom! Audio saved to: {output_path}")
# --- Batch-convert every text file in the books folder ---
# BUG FIXES: the directory string previously contained "{default_dir}"
# verbatim (missing f-prefix); the variable `dir` shadowed the builtin;
# files were opened without a context manager or an explicit encoding
# (risky for Vietnamese text on platform-default encodings); and the
# "All done!" message was printed before any work had happened.
books_dir = f"{default_dir}/books"
for book_name in os.listdir(books_dir):
    # Context manager guarantees the file is closed even if speak() raises.
    with open(os.path.join(books_dir, book_name), "r", encoding="utf-8") as fs:
        text = fs.read()
    # splitext strips only the final extension, so names containing
    # dots (e.g. "vol.1.txt") keep their stem intact, unlike split('.')[0].
    speak(text, os.path.splitext(book_name)[0])
    print('Saved: ' + book_name)
print("\nAll done! Check your output folder.")