File size: 3,080 Bytes
2ce1681
 
20b170f
2ce1681
 
 
 
 
5724559
2ce1681
f9f3b99
 
 
 
 
 
5724559
 
2ce1681
f9f3b99
5724559
 
 
f9f3b99
 
 
5724559
f9f3b99
 
 
 
 
 
 
5724559
 
 
 
2ce1681
5724559
1716e9e
f9f3b99
 
 
20b170f
5724559
 
2ce1681
 
 
 
 
20b170f
f9f3b99
 
5724559
f9f3b99
5724559
7dd717f
f9f3b99
41afbac
f9f3b99
 
 
 
 
 
 
 
 
20b170f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import gradio as gr
import tempfile
from TTS.api import TTS
from huggingface_hub import hf_hub_download
import torch

# True when torch reports a usable CUDA device; used below to pick the device.
CUDA = torch.cuda.is_available()

# Hugging Face Hub repository that hosts the model files.
REPO_ID = "collectivat/catotron-ona"

# Dropdown label -> model name handed to ``load_vc_model_by_name``.
VOICE_CONVERSION_MODELS = {
    "freevc24": "voice_conversion_models/multilingual/vctk/freevc24",
    "openvoice_v1": "voice_conversion_models/multilingual/multi-dataset/openvoice_v1",
    "openvoice_v2": "voice_conversion_models/multilingual/multi-dataset/openvoice_v2",
}

# Texts shown at the top of the web page.
my_title = "Catotron Text-to-Speech with Voice Conversion"
my_description = (
    "This space allows speaker conversion on Fast Speech based "
    "🐸 [Catotron](https://huggingface.co/collectivat/catotron-ona)."
)

# Example rows: [input text, split sentences, speaker audio, conversion model].
# No example supplies speaker audio, and all of them name the same model.
my_examples = [
    [example_text, split_flag, None, "freevc24"]
    for example_text, split_flag in (
        ("Catotron, síntesi de la parla obert i lliure en català.", True),
        ("Leonor Ferrer Girabau va ser una delineant, mestra i activista barcelonina, nascuda al carrer actual de la Concòrdia del Poble-sec, que es va convertir en la primera dona a obtenir el títol de delineant a Catalunya i a l'estat.", True),
        ("S'espera un dia anticiclònic amb temperatures suaus i vent fluix.", False),
    )
]

# Input widgets, listed in the same order as the parameters of ``tts``.
my_inputs = [
    gr.Textbox(lines=5, label="Input Text"),
    gr.Checkbox(label="Split Sentences", value=False),
    gr.Audio(type="filepath", label="Speaker audio for voice cloning (optional)"),
    gr.Dropdown(
        label="Voice Conversion Model",
        choices=list(VOICE_CONVERSION_MODELS),
        # Preselect a model explicitly. Without a value the dropdown can start
        # empty, and the callback then receives None and cannot look the
        # model up when speaker audio is supplied.
        value="freevc24",
    ),
]

# Output widget: the callback returns a file path, which is played automatically.
my_outputs = gr.Audio(type="filepath", label="Output Audio", autoplay=True)

# Fetch the acoustic-model and vocoder files from REPO_ID. Each name is bound
# to whatever ``hf_hub_download`` returns for that file (used below as a path).
best_model_path, config_path, vocoder_model, vocoder_config = [
    hf_hub_download(repo_id=REPO_ID, filename=hub_filename)
    for hub_filename in (
        "fast-speech_best_model.pth",
        "fast-speech_config.json",
        "ljspeech--hifigan_v2_model_file.pth",
        "ljspeech--hifigan_v2_config.json",
    )
]

# Build the synthesizer from the downloaded files, then move it to the GPU
# when CUDA is available and to the CPU otherwise.
api = TTS(
    model_path=best_model_path,
    config_path=config_path,
    vocoder_path=vocoder_model,
    vocoder_config_path=vocoder_config,
)
api = api.to("cuda" if CUDA else "cpu")

# Load every voice conversion model once at startup so that each one is
# already downloaded before the first request (the original author's intent).
for model in VOICE_CONVERSION_MODELS.values():
    api.load_vc_model_by_name(model, gpu=CUDA)

def tts(text: str, split_sentences: bool = False, speaker_wav: str = None, voice_cv_model: str = 'freevc24') -> str:
    """Synthesize ``text`` into a temporary WAV file and return its path.

    Args:
        text: Text to synthesize. Newlines, parentheses and semicolons are
            replaced with plain punctuation before synthesis.
        split_sentences: Forwarded unchanged to the TTS API.
        speaker_wav: Path to a reference recording. When given, the speech is
            produced with voice conversion towards that recording; when empty
            or ``None``, plain synthesis is used.
        voice_cv_model: Key of ``VOICE_CONVERSION_MODELS`` selecting the voice
            conversion model. ``None`` or an empty string falls back to
            ``'freevc24'``. Only used when ``speaker_wav`` is given.

    Returns:
        Path of the generated ``.wav`` file. The file is not deleted here; the
        caller is responsible for it.

    Raises:
        KeyError: If ``voice_cv_model`` is a non-empty name that is not a key
            of ``VOICE_CONVERSION_MODELS``.
    """
    import os

    # Replace characters the original author treats as out-of-vocabulary.
    for char, replacement in (("\n", ". "), ("(", ","), (")", ","), (";", ",")):
        text = text.replace(char, replacement)

    # Reserve a unique output path and close the handle immediately: writing to
    # a NamedTemporaryFile by name while it is still open is not portable.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        output_path = fp.name

    try:
        if speaker_wav:
            # The UI can pass None when nothing is selected in the dropdown, in
            # which case the signature default is not applied; use it here.
            model_key = voice_cv_model or 'freevc24'
            api.load_vc_model_by_name(VOICE_CONVERSION_MODELS[model_key], gpu=CUDA)
            api.tts_with_vc_to_file(text, speaker_wav=speaker_wav, file_path=output_path, split_sentences=split_sentences)
        else:
            api.tts_to_file(text, file_path=output_path, split_sentences=split_sentences)
    except Exception:
        # The placeholder was created with delete=False, so remove it ourselves
        # rather than leaving an orphaned file behind for every failed request.
        if os.path.exists(output_path):
            os.remove(output_path)
        raise

    return output_path

# Assemble the web UI around the ``tts`` callback and start serving it.
iface = gr.Interface(
    # Callback and the widgets feeding / displaying it.
    fn=tts,
    inputs=my_inputs,
    outputs=my_outputs,
    # Page texts.
    title=my_title,
    description=my_description,
    # Sample rows offered to the user; Gradio is asked to cache their outputs.
    examples=my_examples,
    cache_examples=True,
)
iface.launch()