alp committed on
Commit
20b170f
·
verified ·
1 Parent(s): 8f87607

Voice conversion try

Browse files
Files changed (1) hide show
  1. app.py +42 -32
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  import tempfile
3
  from TTS.utils.synthesizer import Synthesizer
 
4
  from huggingface_hub import hf_hub_download
5
  import torch
6
 
@@ -8,59 +9,68 @@ CUDA = torch.cuda.is_available()
8
 
9
  REPO_ID = "collectivat/catotron-ona"
10
 
11
# Header text for the Gradio demo page.
my_title = "Catotron Text-to-Speech"
my_description = (
    "This model is based on Fast Speech implemented in 🐸 [Coqui.ai](https://coqui.ai/)."
)

# Catalan sample sentences; each one becomes a single-column example row.
_example_sentences = (
    "Catotron, sintesi de la parla obert i lliure en català.",
    "Leonor Ferrer Girabau va ser una delineant, mestra i activista barcelonina, nascuda al carrer actual de la Concòrdia del Poble-sec, que es va convertir en la primera dona a obtenir el títol de delineant a Catalunya i a lestat.",
    "S'espera un dia anticiclònic amb temperatures suaus i vent fluix.",
)
my_examples = [[sentence] for sentence in _example_sentences]

# A single multi-line text box is the only input component.
my_inputs = [gr.Textbox(lines=5, label="Input Text")]

# The output component receives a file path (type="filepath") to the audio.
my_outputs = gr.Audio(type="filepath", label="Output Audio")
25
 
26
# Shared Synthesizer instance, built on first use (see _get_synthesizer).
_synthesizer = None


def _get_synthesizer():
    """Return the shared Synthesizer, building it on the first call only.

    The model and vocoder files are resolved through ``hf_hub_download`` from
    ``REPO_ID``. Building the synthesizer once avoids reloading the
    checkpoints for every request.
    """
    global _synthesizer
    if _synthesizer is None:
        best_model_path = hf_hub_download(repo_id=REPO_ID, filename="fast-speech_best_model.pth")
        config_path = hf_hub_download(repo_id=REPO_ID, filename="fast-speech_config.json")
        vocoder_model = hf_hub_download(repo_id=REPO_ID, filename="ljspeech--hifigan_v2_model_file.pth")
        vocoder_config = hf_hub_download(repo_id=REPO_ID, filename="ljspeech--hifigan_v2_config.json")

        _synthesizer = Synthesizer(
            tts_checkpoint=best_model_path,
            tts_config_path=config_path,
            tts_speakers_file=None,
            tts_languages_file=None,
            vocoder_checkpoint=vocoder_model,
            vocoder_config=vocoder_config,
            encoder_checkpoint="",
            encoder_config="",
            use_cuda=CUDA
        )
    return _synthesizer


def tts(text: str, split_sentences: bool = True):
    """Synthesize ``text`` and return the path of the resulting WAV file.

    Args:
        text: Text to synthesize.
        split_sentences: Forwarded unchanged to ``Synthesizer.tts``.

    Returns:
        Path of a temporary ``.wav`` file. It is created with
        ``delete=False``, so this function never removes it.
    """
    synthesizer = _get_synthesizer()

    # replace oov characters
    text = text.replace("\n", ". ")
    text = text.replace("(", ",")
    text = text.replace(")", ",")
    text = text.replace(";", ",")

    # create audio file
    wavs = synthesizer.tts(text, split_sentences=split_sentences)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        synthesizer.save_wav(wavs, fp)
    return fp.name
 
 
 
 
 
56
 
# Gather the Interface arguments in one place, then build and serve the demo.
_interface_options = dict(
    fn=tts,
    inputs=my_inputs,
    outputs=my_outputs,
    title=my_title,
    description=my_description,
    examples=my_examples,
    cache_examples=True,
)
iface = gr.Interface(**_interface_options)
iface.launch()
 
1
  import gradio as gr
2
  import tempfile
3
  from TTS.utils.synthesizer import Synthesizer
4
+ from TTS.api import TTS
5
  from huggingface_hub import hf_hub_download
6
  import torch
7
 
 
9
 
10
  REPO_ID = "collectivat/catotron-ona"
11
 
12
# Short UI name -> model identifier passed to load_vc_model_by_name().
VOICE_CONVERSION_MODELS = {
    "freevc24": "voice_conversion_models/multilingual/vctk/freevc24",
    "openvoice_v1": "voice_conversion_models/multilingual/multi-dataset/openvoice_v1",
    "openvoice_v2": "voice_conversion_models/multilingual/multi-dataset/openvoice_v2",
}

# Header text for the Gradio demo page.
my_title = "Catotron Text-to-Speech amb Conversió de Veu"
my_description = (
    "This model is based on Fast Speech implemented in 🐸 [Coqui.ai](https://coqui.ai/). Now with voice conversion capabilities!"
)

# Catalan sample sentences; each one becomes a single-column example row.
_example_sentences = (
    "Catotron, síntesi de la parla obert i lliure en català.",
    "Leonor Ferrer Girabau va ser una delineant, mestra i activista barcelonina, nascuda al carrer actual de la Concòrdia del Poble-sec, que es va convertir en la primera dona a obtenir el títol de delineant a Catalunya i a l'estat.",
    "S'espera un dia anticiclònic amb temperatures suaus i vent fluix.",
)
my_examples = [[sentence] for sentence in _example_sentences]

# Input components, in the positional order of tts()'s parameters.
_text_box = gr.Textbox(lines=5, label="Input Text")
_split_toggle = gr.Checkbox(label="Split Sentences", value=True)
_speaker_audio = gr.Audio(type="filepath", label="Speaker audio for voice cloning (optional)")
_vc_model_picker = gr.Dropdown(
    label="Voice Conversion Model",
    choices=list(VOICE_CONVERSION_MODELS),
    value="freevc24",
)
my_inputs = [_text_box, _split_toggle, _speaker_audio, _vc_model_picker]

# The output component receives a file path (type="filepath") to the audio.
my_outputs = gr.Audio(type="filepath", label="Output Audio", autoplay=True)
35
 
36
# Resolve the acoustic-model and vocoder files from the REPO_ID hub repository.
best_model_path = hf_hub_download(
    repo_id=REPO_ID, filename="fast-speech_best_model.pth"
)
config_path = hf_hub_download(
    repo_id=REPO_ID, filename="fast-speech_config.json"
)
vocoder_model = hf_hub_download(
    repo_id=REPO_ID, filename="ljspeech--hifigan_v2_model_file.pth"
)
vocoder_config = hf_hub_download(
    repo_id=REPO_ID, filename="ljspeech--hifigan_v2_config.json"
)

# Build the TTS API object once at import time and move it to the chosen device.
_device = "cuda" if CUDA else "cpu"
tts_api = TTS(
    model_path=best_model_path,
    config_path=config_path,
    vocoder_path=vocoder_model,
    vocoder_config_path=vocoder_config,
).to(_device)

# Load every voice conversion model once at startup; the original comment
# describes this as a pre-download step.
for _vc_model_name in VOICE_CONVERSION_MODELS.values():
    tts_api.load_vc_model_by_name(_vc_model_name, gpu=CUDA)
48
+
49
def tts(text: str, split_sentences: bool = True, speaker_wav: str = None, voice_cv_model: str = 'freevc24'):
    """Synthesize ``text`` to a WAV file, optionally applying voice conversion.

    Args:
        text: Text to synthesize.
        split_sentences: Forwarded unchanged to the TTS API call.
        speaker_wav: Path to a reference recording. When truthy, synthesis
            goes through ``tts_with_vc_to_file``; otherwise plain
            ``tts_to_file`` is used.
        voice_cv_model: Key into ``VOICE_CONVERSION_MODELS``. Only consulted
            when ``speaker_wav`` is given.

    Returns:
        Path of a temporary ``.wav`` file. It is created with
        ``delete=False``, so this function does not remove it on success.

    Raises:
        ValueError: If ``text`` is empty or contains only whitespace.
        KeyError: If ``speaker_wav`` is given and ``voice_cv_model`` is not a
            key of ``VOICE_CONVERSION_MODELS``.
    """
    import os  # local import: only needed for cleanup on failure

    # Validate before creating any file so a bad request leaves nothing behind.
    if not text or not text.strip():
        raise ValueError("Input text is empty.")

    # Resolve the model up front for the same reason (KeyError on bad key).
    vc_model_name = VOICE_CONVERSION_MODELS[voice_cv_model] if speaker_wav else None

    # replace oov characters
    text = text.replace("\n", ". ")
    text = text.replace("(", ",")
    text = text.replace(")", ",")
    text = text.replace(";", ",")

    # Reserve a unique output path, then close our handle before the TTS
    # library writes to that path by name.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        output_path = fp.name

    try:
        if vc_model_name is not None:
            # Use voice conversion
            tts_api.load_vc_model_by_name(vc_model_name, gpu=CUDA)
            tts_api.tts_with_vc_to_file(text, speaker_wav=speaker_wav, file_path=output_path, split_sentences=split_sentences)
        else:
            # Standard TTS without voice conversion
            tts_api.tts_to_file(text, file_path=output_path, split_sentences=split_sentences)
    except Exception:
        # Synthesis failed: remove the orphan temp file, then re-raise.
        try:
            os.unlink(output_path)
        except OSError:
            pass
        raise

    return output_path
66
 
# Gather the Interface arguments in one place, then build and serve the demo.
_interface_options = dict(
    fn=tts,
    inputs=my_inputs,
    outputs=my_outputs,
    title=my_title,
    description=my_description,
    examples=my_examples,
    cache_examples=True,
)
iface = gr.Interface(**_interface_options)
iface.launch()