FrederikRautenberg committed
Commit d161181 · 1 Parent(s): 7efb86f

Rename Sliders and refactor text input

Dataset/dataset.yaml CHANGED
@@ -1,4 +1,10 @@
 dataset:
+  '1422_149735_000006_000000':
+    speaker_id: '1422'
+    example_id: '1422_149735_000006_000000'
+  '1034_121119_000028_000001':
+    speaker_id: '1034'
+    'example_id': '1034_121119_000028_000001'
   '7190_90542_000054_000000':
     speaker_id: '7190'
     example_id: '7190_90542_000054_000000'
@@ -29,9 +35,6 @@ dataset:
   '8758_296465_000020_000000':
     speaker_id: '8758'
     example_id: '8758_296465_000020_000000'
-  '1034_121119_000028_000001':
-    speaker_id: '1034'
-    'example_id': '1034_121119_000028_000001'
   '4957_30119_000070_000001':
     speaker_id: '4957'
     example_id: '4957_30119_000070_000001'
@@ -56,9 +59,6 @@ dataset:
   '5012_80192_000020_000003':
     speaker_id: '5012'
     example_id: '5012_80192_000020_000003'
-  '1422_149735_000006_000000':
-    speaker_id: '1422'
-    example_id: '1422_149735_000006_000000'
   '14_212_000019_000000':
     speaker_id: '14'
     example_id: '14_212_000019_000000'
app.py CHANGED
@@ -25,6 +25,7 @@ cached_loaded_example = None
 cached_labels = None
 cached_d_vector = None
 cached_unmanipulated = None
+cached_transcription = None
 
 # path to stats
 stats_path = Path('./Dataset/Embeddings/')
@@ -48,7 +49,7 @@ hubert_model = HubertExtractor(
     layer=SID_LARGE_LAYER,
     model_name="HUBERT_LARGE",
     backend="torchaudio",
-    device=device,
+    device=device,
     # storage_dir= # target storage dir hubert model
 )
 
@@ -166,8 +167,7 @@ def delete_cache():
 
 
 def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
-    global cached_example_id, cached_loaded_example, cached_labels, cached_d_vector, example_database, cached_unmanipulated
-
+    global cached_example_id, cached_loaded_example, cached_labels, cached_d_vector, example_database, cached_unmanipulated, cached_transcription
     speaker_id = dataset_dict['dataset'][example_id]['speaker_id']
 
     example = {
@@ -189,6 +189,14 @@ def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
             'text': transcription,
             'd_vector': cached_d_vector.detach().numpy(),
         })
+        cached_transcription = transcription
+    if cached_loaded_example != example or transcription != cached_transcription:
+        with torch.no_grad():
+            cached_unmanipulated = tts_model.synthesize_from_example({
+                'text': transcription,
+                'd_vector': cached_d_vector.detach().numpy(),
+            })
+        cached_transcription = transcription
 
     with torch.no_grad():
         wav_manipulated = get_manipulation(
@@ -214,18 +222,20 @@ demo = gr.Interface(
             value=2, type="value"
         ),
         gr.Dropdown(
-            choices=dataset_dict['dataset'].keys(),
-            value='1422_149735_000006_000000', type="value"
+            label="Speaker",
+            choices=[(str(idx), example_id) for idx, example_id in enumerate(dataset_dict['dataset'].keys())],
+            value="1422_149735_000006_000000",
+            type="value"
         ),
         gr.Textbox(
-            value="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+            label="Text Input",
+            value="Department of Communications Engineering Paderborn University.",
             placeholder='Type something'
         ),
-        gr.Slider(label="Manipulation Factor", minimum=-1.0, maximum=2.0, value=1.0, step=0.1),
+        gr.Slider(label="Manipulation Intensity", minimum=-1.0, maximum=2.0, value=1.0, step=0.1),
     ],
-    outputs=[gr.Audio(label="original utterance"), gr.Audio(label="manipulated utterance")],
+    outputs=[gr.Audio(label="original synthesized utterance"), gr.Audio(label="manipulated synthesized utterance")],
 )
-
 if __name__ == "__main__":
     demo.launch(share=True)
 
pvq_manipulation/models/vits.py CHANGED
@@ -246,11 +246,8 @@ class Vits_NT(Vits):
             y_mask=y_mask
         )
 
-        import time
-        start = time.time()
         if not torch.cuda.is_available():
             num_chunks = min(os.cpu_count() or 2, z.shape[-1])
-            print(num_chunks, 'num chunks')
             chunk_size = z.shape[-1] // num_chunks
             z_chunks = torch.split(z, chunk_size, dim=-1)
 
@@ -271,8 +268,6 @@
                 (z * y_mask)[:, :, : self.max_inference_len],
                 g=speaker_embedding_man[:, :, None] if self.config.gan_speaker_conditioning else None
             )
-
-        print(time.time() - start)
         return o
 
     def forward(self, x, x_lengths, y, y_lengths, aux_input, inference=False):