SandraCLV committed on
Commit
440e0b5
1 Parent(s): 69504ad

Update audio_model.py

Files changed (1)
  1. audio_model.py +9 -38
audio_model.py CHANGED
@@ -1,55 +1,26 @@
- from transformers import AutoProcessor, BlipForConditionalGeneration, AutoTokenizer, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
  import librosa
  import numpy as np
  import torch
-
- # CONSTANTS
- speaker_embeddings = {
-     "BDL": "spkemb/cmu_us_bdl_arctic-wav-arctic_a0009.npy",
-     "CLB": "spkemb/cmu_us_clb_arctic-wav-arctic_a0144.npy",
-     "RMS": "spkemb/cmu_us_rms_arctic-wav-arctic_b0353.npy",
-     "SLT": "spkemb/cmu_us_slt_arctic-wav-arctic_a0508.npy",
- }

  # Load the text-to-speech model
  checkpoint = "microsoft/speecht5_tts"
  processor = SpeechT5Processor.from_pretrained(checkpoint)
  model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

  device = "cuda" if torch.cuda.is_available() else "cpu"

  ### TEXT TO AUDIO SPEECH MODEL 2
  # Define the function that converts text to speech
- def text_to_speech(text, speaker):
-     # Generate the audio using the model
-     if len(text.strip()) == 0:
-         return (16000, np.zeros(0).astype(np.int16))
      inputs = processor(text=text, return_tensors="pt")

-     # limit input length
-     input_ids = inputs["input_ids"]
-     input_ids = input_ids[..., :model.config.max_text_positions]
-
-     if speaker == "Surprise Me!":
-         # load one of the provided speaker embeddings at random
-         idx = np.random.randint(len(speaker_embeddings))
-         key = list(speaker_embeddings.keys())[idx]
-         speaker_embedding = np.load(speaker_embeddings[key])
-
-         # randomly shuffle the elements
-         np.random.shuffle(speaker_embedding)
-
-         # randomly flip half the values
-         x = (np.random.rand(512) >= 0.5) * 1.0
-         x[x == 0] = -1.0
-         speaker_embedding *= x
-
-         # speaker_embedding = np.random.rand(512).astype(np.float32) * 0.3 - 0.15
-     speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
-
-     speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
-
-     speech = (speech.numpy() * 32767).astype(np.int16)
-     return (16000, speech)
  ### END TEXT TO AUDIO SPEECH MODEL 2
 
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
  import librosa
  import numpy as np
  import torch
+ from datasets import load_dataset

  # Load the text-to-speech model
  checkpoint = "microsoft/speecht5_tts"
  processor = SpeechT5Processor.from_pretrained(checkpoint)
  model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+ speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)

  device = "cuda" if torch.cuda.is_available() else "cpu"

  ### TEXT TO AUDIO SPEECH MODEL 2
  # Define the function that converts text to speech
+ def synthesize_speech(text):
+     text = cleanup_text(text)

      inputs = processor(text=text, return_tensors="pt")

+     speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+
+     return gr.Audio.update(value=(16000, speech.cpu().numpy()))

  ### END TEXT TO AUDIO SPEECH MODEL 2
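
Note on the updated code: as committed, synthesize_speech calls cleanup_text and returns through gr, but neither is defined or imported in audio_model.py, so both must come from elsewhere in the Space. The commit also replaces the old per-request random speaker embedding ("Surprise Me!") with a single fixed xvector, row 7440 of the Matthijs/cmu-arctic-xvectors validation split, so every call now produces the same voice. A minimal self-contained sketch of the updated module, assuming gradio is available as gr and using a hypothetical cleanup_text that only normalizes whitespace, could look like this:

import re

import gradio as gr  # assumed dependency; the committed file does not import it
import torch
from datasets import load_dataset
from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Fixed speaker embedding, as in the commit.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)


def cleanup_text(text: str) -> str:
    # Hypothetical helper (not part of the commit): collapse whitespace only.
    return re.sub(r"\s+", " ", text).strip()


def synthesize_speech(text):
    text = cleanup_text(text)
    inputs = processor(text=text, return_tensors="pt")
    # SpeechT5 generates 16 kHz mono audio as a float tensor.
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    # Return in the (sample_rate, numpy array) form a Gradio Audio component accepts.
    return gr.Audio.update(value=(16000, speech.cpu().numpy()))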