Spaces:
Running
Running
File size: 3,080 Bytes
2ce1681 20b170f 2ce1681 5724559 2ce1681 f9f3b99 5724559 2ce1681 f9f3b99 5724559 f9f3b99 5724559 f9f3b99 5724559 2ce1681 5724559 1716e9e f9f3b99 20b170f 5724559 2ce1681 20b170f f9f3b99 5724559 f9f3b99 5724559 7dd717f f9f3b99 41afbac f9f3b99 20b170f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
import gradio as gr
import tempfile
from TTS.api import TTS
from huggingface_hub import hf_hub_download
import torch
# True when PyTorch can see a CUDA device; reused below to choose the device
# for the TTS model and for the voice-conversion models.
CUDA = torch.cuda.is_available()

# Hugging Face Hub repository the TTS and vocoder files are downloaded from.
REPO_ID = "collectivat/catotron-ona"

# Short name shown in the UI dropdown -> model identifier handed to
# `api.load_vc_model_by_name`.
VOICE_CONVERSION_MODELS = dict(
    freevc24='voice_conversion_models/multilingual/vctk/freevc24',
    openvoice_v1='voice_conversion_models/multilingual/multi-dataset/openvoice_v1',
    openvoice_v2='voice_conversion_models/multilingual/multi-dataset/openvoice_v2',
)
# Static text shown at the top of the Gradio page.
my_title = "Catotron Text-to-Speech with Voice Conversion"
my_description = "This space allows speaker conversion on Fast Speech based 🐸 [Catotron](https://huggingface.co/collectivat/catotron-ona)."

# Example rows; the columns follow the order of `my_inputs`:
# text, split-sentences flag, speaker audio (None = no voice conversion),
# voice-conversion model key.
my_examples = [
    ["Catotron, síntesi de la parla obert i lliure en català.", True, None, 'freevc24'],
    ["Leonor Ferrer Girabau va ser una delineant, mestra i activista barcelonina, nascuda al carrer actual de la Concòrdia del Poble-sec, que es va convertir en la primera dona a obtenir el títol de delineant a Catalunya i a l'estat.", True, None, 'freevc24'],
    ["S'espera un dia anticiclònic amb temperatures suaus i vent fluix.", False, None, 'freevc24'],
]

# Input components, in the positional order of the `tts` parameters.
my_inputs = [
    gr.Textbox(lines=5, label="Input Text"),
    gr.Checkbox(label="Split Sentences", value=False),
    gr.Audio(type="filepath", label="Speaker audio for voice cloning (optional)"),
    gr.Dropdown(
        label="Voice Conversion Model",
        choices=list(VOICE_CONVERSION_MODELS.keys()),
        # Select a model up front (same default as `tts`), so a request that
        # supplies speaker audio never arrives without a model key.
        value='freevc24',
    ),
]

# The handler returns a file path, hence type="filepath".
my_outputs = gr.Audio(type="filepath", label="Output Audio", autoplay=True)
# Download the four artifacts from the Hub repository, in this order:
# acoustic-model checkpoint, acoustic-model config, vocoder checkpoint and
# vocoder config (roles inferred from the file names).
best_model_path, config_path, vocoder_model, vocoder_config = (
    hf_hub_download(repo_id=REPO_ID, filename=artifact)
    for artifact in (
        "fast-speech_best_model.pth",
        "fast-speech_config.json",
        "ljspeech--hifigan_v2_model_file.pth",
        "ljspeech--hifigan_v2_config.json",
    )
)

# Build the synthesizer from the downloaded files and move it to the GPU when
# one is available, otherwise keep it on the CPU.
api = TTS(
    model_path=best_model_path,
    config_path=config_path,
    vocoder_path=vocoder_model,
    vocoder_config_path=vocoder_config,
).to("cuda" if CUDA else "cpu")

# Load every voice-conversion model once at start-up; per the original note
# this is done to pre-download them before the first request.
for model in VOICE_CONVERSION_MODELS.values():
    api.load_vc_model_by_name(model, gpu=CUDA)
# Characters the model treats as out-of-vocabulary, mapped to in-vocabulary
# punctuation. A single translate() pass is equivalent to the chained
# replace() calls because no replacement text contains a mapped character.
_OOV_REPLACEMENTS = str.maketrans({
    "\n": ". ",
    "(": ",",
    ")": ",",
    ";": ",",
})


def tts(text: str, split_sentences: bool = False, speaker_wav: str = None, voice_cv_model: str = 'freevc24') -> str:
    """Synthesize ``text`` into a temporary WAV file and return its path.

    Args:
        text: Text to synthesize. Newlines, parentheses and semicolons are
            replaced with punctuation before synthesis.
        split_sentences: Forwarded unchanged to the TTS call.
        speaker_wav: Path to a reference recording. When truthy, the selected
            voice-conversion model is loaded and ``api.tts_with_vc_to_file``
            is used; otherwise ``api.tts_to_file`` is used.
        voice_cv_model: Key into ``VOICE_CONVERSION_MODELS``. An empty or
            ``None`` value falls back to ``'freevc24'``.

    Returns:
        Path of the generated ``.wav`` file. The file is created with
        ``delete=False``, so it is not removed automatically.

    Raises:
        KeyError: If ``speaker_wav`` is given and ``voice_cv_model`` is a
            non-empty value that is not a key of ``VOICE_CONVERSION_MODELS``.
    """
    import os

    # replace oov characters
    text = text.translate(_OOV_REPLACEMENTS)

    # Reserve a unique output path, then close the handle before the TTS
    # library writes to the file by name (writing to a still-open
    # NamedTemporaryFile by name is not portable).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        output_path = fp.name

    try:
        if speaker_wav:
            # The UI dropdown may deliver no selection; use the default model
            # instead of failing with KeyError(None).
            model_key = voice_cv_model or 'freevc24'
            api.load_vc_model_by_name(VOICE_CONVERSION_MODELS[model_key], gpu=CUDA)
            api.tts_with_vc_to_file(text, speaker_wav=speaker_wav, file_path=output_path, split_sentences=split_sentences)
        else:
            api.tts_to_file(text, file_path=output_path, split_sentences=split_sentences)
    except Exception:
        # Synthesis failed: do not leave the reserved temp file behind.
        try:
            os.remove(output_path)
        except OSError:
            pass
        raise

    return output_path
# Wire the `tts` handler to the input/output components declared above.
# cache_examples=True asks Gradio to cache the outputs of the example rows.
iface = gr.Interface(
    fn=tts,
    inputs=my_inputs,
    outputs=my_outputs,
    title=my_title,
    description=my_description,
    examples=my_examples,
    cache_examples=True,
)

# Start the web server (the stray trailing "|" after this call was removed;
# it made the statement a syntax error).
iface.launch()