vi-tts

Runtime error

App Files Files Community

vi-tts / app.py

elvinan

Update process bar

7ea2417 11 months ago

raw

history blame contribute delete

8.43 kB

	import torch # isort:skip

	torch.manual_seed(42)
	import json
	import re
	import unicodedata
	from types import SimpleNamespace
	import time
	import numpy as np
	import regex
	from scipy.io.wavfile import write
	from models import DurationNet, SynthesizerTrn
	import os
	import re

	from process import print_percent_done

	# Display strings for the app.
	title = "LightSpeed: Vietnamese Male Voice TTS"
	description = "Vietnam Male Voice TTS."

	# Hyper-parameter file, model checkpoints and phone inventory; all paths are
	# relative, so they resolve against the current working directory.
	config_file = "config.json"
	duration_model_path = "vbx_duration_model.pth"
	lightspeed_model_path = "gen_619k.pth"
	phone_set_file = "vbx_phone_set.json"

	# Run on the GPU when torch can see one, otherwise on the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Load the hyper-parameters.  Every JSON object becomes a SimpleNamespace so
	# nested values are read as attributes (e.g. hps.data.sampling_rate).
	with open(config_file, "rb") as f:
	    hps = json.load(f, object_hook=lambda x: SimpleNamespace(**x))

	# load phone set json file
	# The encoding is pinned to UTF-8 so that decoding does not depend on the
	# platform's locale default.
	with open(phone_set_file, "r", encoding="utf-8") as f:
	    phone_set = json.load(f)

	# Index 0 must be the separator entry (its name is wrapped in one delimiter
	# character on each side) because text_to_phone_idx emits 0 for spaces.
	assert phone_set[0][1:-1] == "SEP"
	assert "sil" in phone_set
	# Index of the silence phone, used for punctuation and utterance padding.
	sil_idx = phone_set.index("sil")

	# NOTE(review): space_re, number_re, keep_text_and_num_re and keep_text_re are
	# not referenced anywhere in the visible part of this file.
	space_re = regex.compile(r"\s+")
	number_re = regex.compile("([0-9]+)")
	# Vietnamese readings of the digits 0-9, indexed by digit value.
	digits = ["không", "một", "hai", "ba", "bốn", "năm", "sáu", "bảy", "tám", "chín"]
	# A numeric token: digits, dots and commas, ending with a digit.
	num_re = regex.compile(r"([0-9.,]*[0-9])")
	alphabet = "aàáảãạăằắẳẵặâầấẩẫậeèéẻẽẹêềếểễệiìíỉĩịoòóỏõọôồốổỗộơờớởỡợuùúủũụưừứửữựyỳýỷỹỵbcdđghklmnpqrstvx"
	keep_text_and_num_re = regex.compile(rf"[^\s{alphabet}.,0-9]")
	keep_text_re = regex.compile(rf"[^\s{alphabet}]")


	def read_number(num: str) -> str:
	    """Spell out a numeric token in Vietnamese words.

	    Supported forms are plain digit strings of one to six digits, a decimal
	    comma ("3,14" -> "ba phẩy mười bốn") and dot-grouped thousands or
	    millions ("1.000" -> "một ngàn").  Tokens that match no rule (for
	    example a digit string longer than six characters) are returned
	    unchanged.

	    Args:
	        num: Token made of the characters ``0-9``, ``.`` and ``,``.

	    Returns:
	        The Vietnamese reading, or ``num`` itself when no rule applies.
	    """
	    if len(num) == 1:
	        return digits[int(num)]

	    if num.isdigit() and len(num) <= 6:
	        n = int(num)

	        if len(num) == 2:
	            if n < 10:
	                # Leading zero ("05"): read digit by digit.  Without this
	                # guard the token falls into the 10-19 branch below and is
	                # read as "mười lăm" (fifteen).
	                return digits[0] + " " + digits[n]
	            if n == 10:
	                return "mười"
	            tens, unit = divmod(n, 10)
	            if unit == 0:
	                return digits[tens] + " mươi"
	            # A trailing five is read "lăm" after a tens word.
	            end = "lăm" if unit == 5 else digits[unit]
	            if n < 20:
	                return "mười " + end
	            # A trailing one is read "mốt" from twenty-one upwards.
	            if unit == 1:
	                end = "mốt"
	            return digits[tens] + " mươi " + end

	        if len(num) == 3:
	            hundreds = digits[n // 100] + " trăm"
	            if n % 100 == 0:
	                return hundreds
	            if num[1] == "0":
	                return hundreds + " lẻ " + digits[n % 100]
	            return hundreds + " " + read_number(num[1:])

	        # Four to six digits: thousands part followed by the last three.
	        thousands = read_number(str(n // 1000)) + " ngàn"
	        if num[-3:] == "000":
	            # Round thousands, matching how the dotted form "1.000" is read.
	            return thousands
	        return thousands + " " + read_number(num[-3:])

	    if "," in num:
	        # partition() splits on the first comma only, so a token with
	        # several commas recurses instead of failing tuple unpacking.
	        whole, _, fraction = num.partition(",")
	        return read_number(whole) + " phẩy " + read_number(fraction)

	    if "." in num:
	        parts = num.split(".")
	        if len(parts) == 2:
	            head, tail = parts
	            if tail == "000":
	                return read_number(head) + " ngàn"
	            # Only a full three-digit group "00d" has a single trailing
	            # digit to look up; shorter or longer groups fall through.
	            if len(tail) == 3 and tail.startswith("00"):
	                return read_number(head) + " ngàn lẻ " + digits[int(tail[2])]
	            return read_number(head) + " ngàn " + read_number(tail)
	        if len(parts) == 3:
	            return (
	                read_number(parts[0])
	                + " triệu "
	                + read_number(parts[1])
	                + " ngàn "
	                + read_number(parts[2])
	            )

	    return num


	def text_to_phone_idx(text):
	    """Convert raw text into a list of phone indices.

	    The text is lower-cased and NFKC-normalised, numeric tokens are spelled
	    out with read_number, and every remaining character is mapped to an
	    index: punctuation marks become the silence phone, characters found in
	    phone_set become their position in it, spaces become the separator (0)
	    and anything else is dropped.  A non-empty result always starts and
	    ends with the silence phone.

	    Returns an empty list when no character of the input maps to a phone.
	    """
	    normalized = unicodedata.normalize("NFKC", text.lower())

	    # Put spaces around each pause-producing mark so it becomes its own word.
	    # NOTE(review): this also separates "." and "," from neighbouring
	    # digits, so "1.000" and "3,14" appear to reach read_number as separate
	    # digit groups rather than as one token - confirm whether intended.
	    pad_table = str.maketrans({mark: f" {mark} " for mark in ".,;:!?("})
	    normalized = normalized.translate(pad_table)

	    # Isolate numeric tokens, then replace each one with its spoken form.
	    normalized = num_re.sub(r" \1 ", normalized)
	    spoken_words = [
	        read_number(word) if num_re.fullmatch(word) else word
	        for word in normalized.split()
	    ]

	    # Collapse runs of whitespace and trim both ends.
	    normalized = re.sub(r"\s+", " ", " ".join(spoken_words)).strip()

	    phone_ids = []
	    for ch in normalized:
	        if ch in ":,.!?;(":
	            # punctuation is rendered as the <sil> phone
	            phone_ids.append(sil_idx)
	        elif ch in phone_set:
	            phone_ids.append(phone_set.index(ch))
	        elif ch == " ":
	            # word boundary is rendered as the <sep> phone
	            phone_ids.append(0)

	    if not phone_ids:
	        return phone_ids

	    # Make sure the utterance is wrapped in silence on both sides.
	    if phone_ids[0] != sil_idx:
	        phone_ids = [sil_idx, 0] + phone_ids
	    if phone_ids[-1] != sil_idx:
	        phone_ids = phone_ids + [0, sil_idx]
	    return phone_ids


	def text_to_speech(duration_net, generator, text):
	    """Synthesise one piece of text into a 16-bit PCM waveform.

	    Args:
	        duration_net: Model called as ``duration_net(phone_idx, phone_length)``;
	            element 0 of the last axis of its rank-3 output is used as the
	            per-phone duration.
	        generator: Model whose ``infer(...)`` call returns a sequence; the
	            waveform is taken from ``[0][0, 0]`` of that result.
	        text: Text to speak.

	    Returns:
	        A numpy ``int16`` array of samples.  The array is empty when ``text``
	        contains nothing that maps to a phone.
	    """
	    # Convert Bible address ("3:16" -> "chương 3 câu 16")
	    text = re.sub(r"(\d+):(\d+)", r"chương \1 câu \2", text)

	    # Convert Israel name: split hyphenated names into separate words and
	    # capitalise each part.
	    def capitalize_name(match):
	        return match.group(0).replace("-", " ").title()

	    text = re.sub(r"\b\w+(?:-\w+)+\b", capitalize_name, text)

	    # Split numbers from the text that follows them
	    text = re.sub(r"(\d+)(\D+)", r"\1 \2", text)

	    phone_ids = text_to_phone_idx(text)
	    if not phone_ids:
	        # Nothing pronounceable: return an empty clip rather than pushing a
	        # zero-length sequence through the models, where the max-reduction
	        # below fails on an empty axis.
	        return np.zeros(0, dtype=np.int16)

	    # Batch of one utterance: phone_idx has shape (1, number_of_phones).
	    phone_length = torch.tensor([len(phone_ids)], dtype=torch.long, device=device)
	    phone_idx = torch.tensor([phone_ids], dtype=torch.long, device=device)

	    # predict phoneme duration
	    with torch.inference_mode():
	        # presumably the net predicts seconds and * 1000 converts to
	        # milliseconds (the frame maths below divides by 1000) - TODO confirm
	        phone_duration = duration_net(phone_idx, phone_length)[:, :, 0] * 1000
	    # Silence phones last at least 200; separator phones take no time.
	    phone_duration = torch.where(
	        phone_idx == sil_idx, torch.clamp_min(phone_duration, 200), phone_duration
	    )
	    phone_duration = torch.where(phone_idx == 0, 0, phone_duration)

	    # generate waveform
	    end_time = torch.cumsum(phone_duration, dim=-1)
	    start_time = end_time - phone_duration
	    start_frame = start_time / 1000 * hps.data.sampling_rate / hps.data.hop_length
	    end_frame = end_time / 1000 * hps.data.sampling_rate / hps.data.hop_length
	    spec_length = end_frame.max(dim=-1).values
	    pos = torch.arange(0, spec_length.item(), device=device)
	    # attn[b, t, p] is 1.0 when frame position t lies inside phone p's
	    # [start_frame, end_frame) interval, else 0.0.
	    attn = torch.logical_and(
	        pos[None, :, None] >= start_frame[:, None, :],
	        pos[None, :, None] < end_frame[:, None, :],
	    ).float()
	    with torch.inference_mode():
	        y_hat = generator.infer(
	            phone_idx, phone_length, spec_length, attn, max_len=None, noise_scale=0.667
	        )[0]
	    wave = y_hat[0, 0].data.cpu().numpy()
	    # Scale to the int16 range and clip first: a sample at full scale would
	    # otherwise become 32768 and overflow when cast to int16.
	    return np.clip(wave * (2**15), -(2**15), 2**15 - 1).astype(np.int16)


	def load_models():
	    """Build both networks, load their checkpoints and return them in eval mode.

	    Returns:
	        A ``(duration_net, generator)`` tuple, both moved to ``device``.
	    """
	    # NOTE(review): torch.load unpickles the checkpoint files; only load
	    # checkpoints from a trusted source.

	    # Duration predictor.
	    duration_net = DurationNet(hps.data.vocab_size, 64, 4).to(device)
	    duration_state = torch.load(duration_model_path, map_location=device)
	    duration_net.load_state_dict(duration_state)
	    duration_net = duration_net.eval()

	    # Waveform generator.
	    generator = SynthesizerTrn(
	        hps.data.vocab_size,
	        hps.data.filter_length // 2 + 1,
	        hps.train.segment_size // hps.data.hop_length,
	        **vars(hps.model),
	    ).to(device)
	    # The enc_q submodule is removed before the weights are loaded;
	    # strict=False below tolerates checkpoint keys that no longer match.
	    del generator.enc_q

	    ckpt = torch.load(lightspeed_model_path, map_location=device)
	    # Drop a leading "module." from parameter names, when present.
	    prefix = "module."
	    params = {
	        (name[len(prefix):] if name.startswith(prefix) else name): tensor
	        for name, tensor in ckpt["net_g"].items()
	    }
	    generator.load_state_dict(params, strict=False)
	    # Release the raw checkpoint once its weights have been copied in.
	    del ckpt, params

	    return duration_net, generator.eval()


	# Holds the (duration_net, generator) pair once it has been loaded.
	_LOADED_MODELS = []


	def _get_models():
	    """Return the model pair, loading the checkpoints only on first use."""
	    if not _LOADED_MODELS:
	        _LOADED_MODELS.append(load_models())
	    return _LOADED_MODELS[0]


	def speak(text, filename, output_dir='/kaggle/working/'):
	    """Synthesise ``text`` paragraph by paragraph and save it as a WAV file.

	    Args:
	        text: Full text; each line is treated as one paragraph.
	        filename: Base name for the output file and the progress label.
	        output_dir: Directory the WAV file is written to.

	    Returns:
	        ``(sampling_rate, samples)``.  When the text has no non-empty
	        paragraph, ``samples`` is an empty int16 array and no file is written.
	    """
	    duration_net, generator = _get_models()

	    # Drop special characters (*, #, &, ^, @, [, ], {, }) and skip paragraphs
	    # that are empty afterwards, so the progress total only counts
	    # paragraphs that are actually synthesised.
	    paragraphs = []
	    for raw in text.split("\n"):
	        cleaned = re.sub(r"[*#&^@\[\]{}]", "", raw).strip()
	        if cleaned:
	            paragraphs.append(cleaned)

	    if not paragraphs:
	        # np.concatenate raises on an empty list; return an empty clip.
	        return hps.data.sampling_rate, np.zeros(0, dtype=np.int16)

	    clips = []  # list of audio clips
	    for done, paragraph in enumerate(paragraphs, start=1):
	        clips.append(text_to_speech(duration_net, generator, paragraph))

	        # print process percentage
	        process = round(done / len(paragraphs) * 100)
	        print_percent_done(process, 100, 50, 'Processing ' + filename)

	    y = np.concatenate(clips)
	    # The current timestamp is appended to the name, so repeated runs for the
	    # same book do not overwrite each other.
	    out_path = os.path.join(output_dir, filename + str(time.time()) + '.wav')
	    write(out_path, hps.data.sampling_rate, y)
	    return hps.data.sampling_rate, y

	# Directory holding the text files to convert, one book per file.
	# (Named books_dir rather than dir so the builtin dir() is not shadowed.)
	books_dir = '/kaggle/working/vi-tts/books'

	# sorted() gives a stable processing order; os.listdir order is arbitrary.
	for filename in sorted(os.listdir(books_dir)):
	    book_path = os.path.join(books_dir, filename)
	    if not os.path.isfile(book_path):
	        # Sub-directories cannot be opened as text files.
	        continue
	    # The context manager closes the file even if reading fails; UTF-8 is
	    # explicit so decoding does not depend on the platform's locale.
	    with open(book_path, "r", encoding="utf-8") as fs:
	        text = fs.read()
	    speak(text, filename.split('.')[0])
	    print('Saved: ' + filename)