Voice conversion try
app.py CHANGED
@@ -1,6 +1,7 @@
 import gradio as gr
 import tempfile
 from TTS.utils.synthesizer import Synthesizer
+from TTS.api import TTS
 from huggingface_hub import hf_hub_download
 import torch

@@ -8,59 +9,68 @@ CUDA = torch.cuda.is_available()

 REPO_ID = "collectivat/catotron-ona"

+VOICE_CONVERSION_MODELS = {
+    'freevc24': 'voice_conversion_models/multilingual/vctk/freevc24',
+    'openvoice_v1': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v1',
+    'openvoice_v2': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v2',
+}
+
+my_title = "Catotron Text-to-Speech amb Conversió de Veu"
+my_description = "This model is based on Fast Speech implemented in 🐸 [Coqui.ai](https://coqui.ai/). Now with voice conversion capabilities!"

 my_examples = [
+    ["Catotron, síntesi de la parla obert i lliure en català."],
+    ["Leonor Ferrer Girabau va ser una delineant, mestra i activista barcelonina, nascuda al carrer actual de la Concòrdia del Poble-sec, que es va convertir en la primera dona a obtenir el títol de delineant a Catalunya i a l'estat."],
+    ["S'espera un dia anticiclònic amb temperatures suaus i vent fluix."]
 ]

 my_inputs = [
+    gr.Textbox(lines=5, label="Input Text"),
+    gr.Checkbox(label="Split Sentences", value=True),
+    gr.Audio(type="filepath", label="Speaker audio for voice cloning (optional)"),
+    gr.Dropdown(label="Voice Conversion Model", choices=list(VOICE_CONVERSION_MODELS.keys()), value='freevc24'),
 ]

-my_outputs = gr.Audio(type="filepath", label="Output Audio")
+my_outputs = gr.Audio(type="filepath", label="Output Audio", autoplay=True)

-synthesizer = Synthesizer(
-    tts_checkpoint=best_model_path,
-    tts_config_path=config_path,
-    tts_speakers_file=None,
-    tts_languages_file=None,
-    vocoder_checkpoint=vocoder_model,
-    vocoder_config=vocoder_config,
-    encoder_checkpoint="",
-    encoder_config="",
-    use_cuda=CUDA
-)
+# Download model files
+best_model_path = hf_hub_download(repo_id=REPO_ID, filename="fast-speech_best_model.pth")
+config_path = hf_hub_download(repo_id=REPO_ID, filename="fast-speech_config.json")
+vocoder_model = hf_hub_download(repo_id=REPO_ID, filename="ljspeech--hifigan_v2_model_file.pth")
+vocoder_config = hf_hub_download(repo_id=REPO_ID, filename="ljspeech--hifigan_v2_config.json")

+# Initialize the TTS API for voice conversion
+tts_api = TTS(model_path=best_model_path, config_path=config_path, vocoder_path=vocoder_model, vocoder_config_path=vocoder_config).to("cuda" if CUDA else "cpu")

+# Pre-download voice conversion models
+for model in VOICE_CONVERSION_MODELS.values():
+    tts_api.load_vc_model_by_name(model, gpu=CUDA)
+
+def tts(text: str, split_sentences: bool = True, speaker_wav: str = None, voice_cv_model: str = 'freevc24'):
     # replace oov characters
     text = text.replace("\n", ". ")
     text = text.replace("(", ",")
     text = text.replace(")", ",")
     text = text.replace(";", ",")

+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
+        if speaker_wav:
+            # Use voice conversion
+            tts_api.load_vc_model_by_name(VOICE_CONVERSION_MODELS[voice_cv_model], gpu=CUDA)
+            tts_api.tts_with_vc_to_file(text, speaker_wav=speaker_wav, file_path=fp.name, split_sentences=split_sentences)
+        else:
+            # Standard TTS without voice conversion
+            tts_api.tts_to_file(text, file_path=fp.name, split_sentences=split_sentences)
+
+    return fp.name

 iface = gr.Interface(
     fn=tts,
     inputs=my_inputs,
     outputs=my_outputs,
     title=my_title,
-    description
-    examples
+    description=my_description,
+    examples=my_examples,
     cache_examples=True
 )
-iface.launch()
+iface.launch()
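A quick local smoke test of the new voice-conversion path might look like the sketch below, run with the definitions from app.py above already loaded. The reference clip path "speaker.wav" is a hypothetical placeholder, not a file shipped with the Space, and the checkpoints and 🐸TTS install are assumed to be the same ones downloaded above.

# Hypothetical smoke test for the tts() helper defined above.

# Plain Catotron synthesis, no voice conversion (speaker_wav left unset)
wav_path = tts("Catotron, síntesi de la parla obert i lliure en català.")
print(wav_path)

# Synthesis followed by FreeVC conversion towards the speaker in the reference clip
cloned_path = tts(
    "S'espera un dia anticiclònic amb temperatures suaus i vent fluix.",
    speaker_wav="speaker.wav",   # placeholder path to a short clip of the target voice
    voice_cv_model="freevc24",
)
print(cloned_path)

Each call returns the path of a temporary .wav file; the NamedTemporaryFile in tts() is created with delete=False so the file survives the with block and can be read afterwards by Gradio or a local audio player.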