Update app.py

app.py CHANGED
@@ -1,6 +1,5 @@
 import gradio as gr
 import tempfile
-from TTS.utils.synthesizer import Synthesizer
 from TTS.api import TTS
 from huggingface_hub import hf_hub_download
 import torch
@@ -9,70 +8,64 @@ CUDA = torch.cuda.is_available()
 
 REPO_ID = "collectivat/catotron-ona"
 
+VOICE_CONVERSION_MODELS = {
+    'freevc24': 'voice_conversion_models/multilingual/vctk/freevc24',
+    'openvoice_v1': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v1',
+    'openvoice_v2': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v2',
+}
+
 my_title = "Catotron Text-to-Speech"
 my_description = "This model is based on Fast Speech implemented in 🐸 [Coqui.ai](https://coqui.ai/)."
 
-
+my_examples = [
+    ["Catotron, síntesi de la parla obert i lliure en català.", True, None, 'freevc24'],
+    ["Leonor Ferrer Girabau va ser una delineant, mestra i activista barcelonina, nascuda al carrer actual de la Concòrdia del Poble-sec, que es va convertir en la primera dona a obtenir el títol de delineant a Catalunya i a l'estat.", True, None, 'freevc24'],
+    ["S'espera un dia anticiclònic amb temperatures suaus i vent fluix.", False, None, 'freevc24']
+]
+
+my_inputs = [
+    gr.Textbox(lines=5, label="Input Text"),
+    gr.Checkbox(label="Split Sentences", value=False),
+    gr.Audio(type="filepath", label="Speaker audio for voice cloning (optional)"),
+    gr.Dropdown(label="Voice Conversion Model", choices=list(VOICE_CONVERSION_MODELS.keys())),
+]
+
+my_outputs = gr.Audio(type="filepath", label="Output Audio", autoplay=True)
+
 best_model_path = hf_hub_download(repo_id=REPO_ID, filename="fast-speech_best_model.pth")
 config_path = hf_hub_download(repo_id=REPO_ID, filename="fast-speech_config.json")
 vocoder_model = hf_hub_download(repo_id=REPO_ID, filename="ljspeech--hifigan_v2_model_file.pth")
 vocoder_config = hf_hub_download(repo_id=REPO_ID, filename="ljspeech--hifigan_v2_config.json")
 
-
-synthesizer = Synthesizer(
-    tts_checkpoint=best_model_path,
-    tts_config_path=config_path,
-    tts_speakers_file=None,
-    tts_languages_file=None,
-    vocoder_checkpoint=vocoder_model,
-    vocoder_config=vocoder_config,
-    encoder_checkpoint="",
-    encoder_config="",
-    use_cuda=CUDA
-)
+api = TTS(model_path=best_model_path, config_path=config_path, vocoder_path=vocoder_model, vocoder_config_path=vocoder_config).to("cuda" if CUDA else "cpu")
 
-#
-
+# pre-download voice conversion models
+for model in VOICE_CONVERSION_MODELS.values():
+    api.load_vc_model_by_name(model, gpu=CUDA)
 
-def tts(text, split_sentences, speaker_wav):
+def tts(text: str, split_sentences: bool = False, speaker_wav: str = None, voice_cv_model: str = 'freevc24'):
+    # replace oov characters
     text = text.replace("\n", ". ")
     text = text.replace("(", ",")
     text = text.replace(")", ",")
     text = text.replace(";", ",")
 
-    # Generate with Catotron
-    wavs = synthesizer.tts(text, split_sentences=split_sentences)
-
     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
-
-
-
-
-
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp_out:
-            vc_model.voice_conversion_to_file(source_wav=temp_catotron, target_wav=speaker_wav, file_path=fp_out.name)
-            return fp_out.name
-
-    return temp_catotron
+        if speaker_wav:
+            api.load_vc_model_by_name(VOICE_CONVERSION_MODELS[voice_cv_model], gpu=CUDA)
+            api.tts_with_vc_to_file(text, speaker_wav=speaker_wav, file_path=fp.name, split_sentences=split_sentences)
+        else:
+            api.tts_to_file(text, file_path=fp.name, split_sentences=split_sentences)
 
-
-    gr.Markdown(f"# {my_title}")
-    gr.Markdown(my_description)
-
-    with gr.Row():
-        with gr.Column():
-            text_input = gr.Textbox(lines=5, label="Input Text")
-            split_check = gr.Checkbox(label="Split Sentences", value=True)
-            speaker_audio = gr.Audio(label="Speaker audio for voice cloning (optional)", type="filepath")
-            submit_btn = gr.Button("Generate")
-
-        with gr.Column():
-            audio_output = gr.Audio(label="Output Audio", type="filepath")
-
-    submit_btn.click(
-        fn=tts,
-        inputs=[text_input, split_check, speaker_audio],
-        outputs=audio_output
-    )
+    return fp.name
 
+iface = gr.Interface(
+    fn=tts,
+    inputs=my_inputs,
+    outputs=my_outputs,
+    title=my_title,
+    description=my_description,
+    examples=my_examples,
+    cache_examples=True
+)
 iface.launch()
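For context, the sketch below shows how the same Coqui TTS pipeline that the updated app.py builds can be exercised outside Gradio. It assumes the `collectivat/catotron-ona` checkpoints download as in the diff; the output filenames and the `reference.wav` path are illustrative only and are not part of the Space.

# Minimal sketch (not part of the Space): drive the pipeline from the new app.py
# without Gradio. Assumes a Coqui TTS version that exposes tts_to_file /
# tts_with_vc_to_file, as used in the diff above; "catotron_sample.wav",
# "catotron_cloned.wav" and "reference.wav" are illustrative paths.
import torch
from huggingface_hub import hf_hub_download
from TTS.api import TTS

REPO_ID = "collectivat/catotron-ona"
CUDA = torch.cuda.is_available()

# Same four checkpoint files the Space downloads.
model_path = hf_hub_download(repo_id=REPO_ID, filename="fast-speech_best_model.pth")
config_path = hf_hub_download(repo_id=REPO_ID, filename="fast-speech_config.json")
vocoder_path = hf_hub_download(repo_id=REPO_ID, filename="ljspeech--hifigan_v2_model_file.pth")
vocoder_config_path = hf_hub_download(repo_id=REPO_ID, filename="ljspeech--hifigan_v2_config.json")

api = TTS(
    model_path=model_path,
    config_path=config_path,
    vocoder_path=vocoder_path,
    vocoder_config_path=vocoder_config_path,
).to("cuda" if CUDA else "cpu")

text = "Catotron, síntesi de la parla obert i lliure en català."

# Plain synthesis, mirroring the else-branch of tts() in the new app.py.
api.tts_to_file(text, file_path="catotron_sample.wav", split_sentences=True)

# Optional voice cloning, mirroring the speaker_wav branch: load one of the
# voice conversion models listed in the diff, then synthesize with a reference
# recording (uncomment once a reference.wav is available).
# api.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24", gpu=CUDA)
# api.tts_with_vc_to_file(text, speaker_wav="reference.wav",
#                         file_path="catotron_cloned.wav", split_sentences=True)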