Commit d161181
Parent(s): 7efb86f

Rename Sliders and refactor text input

Files changed:
- Dataset/dataset.yaml +6 -6
- app.py +19 -9
- pvq_manipulation/models/vits.py +0 -5
Dataset/dataset.yaml CHANGED
@@ -1,4 +1,10 @@
 dataset:
+  '1422_149735_000006_000000':
+    speaker_id: '1422'
+    example_id: '1422_149735_000006_000000'
+  '1034_121119_000028_000001':
+    speaker_id: '1034'
+    'example_id': '1034_121119_000028_000001'
   '7190_90542_000054_000000':
     speaker_id: '7190'
     example_id: '7190_90542_000054_000000'
@@ -29,9 +35,6 @@ dataset:
   '8758_296465_000020_000000':
     speaker_id: '8758'
     example_id: '8758_296465_000020_000000'
-  '1034_121119_000028_000001':
-    speaker_id: '1034'
-    'example_id': '1034_121119_000028_000001'
   '4957_30119_000070_000001':
     speaker_id: '4957'
     example_id: '4957_30119_000070_000001'
@@ -56,9 +59,6 @@ dataset:
   '5012_80192_000020_000003':
     speaker_id: '5012'
     example_id: '5012_80192_000020_000003'
-  '1422_149735_000006_000000':
-    speaker_id: '1422'
-    example_id: '1422_149735_000006_000000'
   '14_212_000019_000000':
     speaker_id: '14'
     example_id: '14_212_000019_000000'
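This change reorders the registry rather than adding or removing data: the '1422' and '1034' entries move to the top of the file. Note that the '1034' entry quotes its 'example_id' key; YAML treats a quoted and an unquoted scalar key identically, so only the ordering is significant. A minimal sketch of how app.py appears to consume this file (the loading code is outside this diff; yaml.safe_load is an assumption):

import yaml

# Load the example registry; app.py refers to the parsed result as dataset_dict.
with open('Dataset/dataset.yaml') as f:
    dataset_dict = yaml.safe_load(f)

# Key order is preserved by the parser, so the entries moved to the top of
# the file now appear first when building dropdown choices.
example_ids = list(dataset_dict['dataset'].keys())
speaker_id = dataset_dict['dataset'][example_ids[0]]['speaker_id']  # '1422'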
app.py CHANGED
@@ -25,6 +25,7 @@ cached_loaded_example = None
 cached_labels = None
 cached_d_vector = None
 cached_unmanipulated = None
+cached_transcription = None
 
 # path to stats
 stats_path = Path('./Dataset/Embeddings/')
@@ -48,7 +49,7 @@ hubert_model = HubertExtractor(
     layer=SID_LARGE_LAYER,
     model_name="HUBERT_LARGE",
     backend="torchaudio",
-    device=device,
+    device=device,
     # storage_dir= # target storage dir hubert model
 )
 
@@ -166,8 +167,7 @@ def delete_cache():
 
 
 def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
-    global cached_example_id, cached_loaded_example, cached_labels, cached_d_vector, example_database, cached_unmanipulated
-
+    global cached_example_id, cached_loaded_example, cached_labels, cached_d_vector, example_database, cached_unmanipulated, cached_transcription
     speaker_id = dataset_dict['dataset'][example_id]['speaker_id']
 
     example = {
@@ -189,6 +189,14 @@ def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
             'text': transcription,
             'd_vector': cached_d_vector.detach().numpy(),
         })
+        cached_transcription = transcription
+    if cached_loaded_example != example or transcription != cached_transcription:
+        with torch.no_grad():
+            cached_unmanipulated = tts_model.synthesize_from_example({
+                'text': transcription,
+                'd_vector': cached_d_vector.detach().numpy(),
+            })
+        cached_transcription = transcription
 
     with torch.no_grad():
         wav_manipulated = get_manipulation(
@@ -214,18 +222,20 @@ demo = gr.Interface(
             value=2, type="value"
         ),
         gr.Dropdown(
-
-
+            label="Speaker",
+            choices=[(str(idx), example_id) for idx, example_id in enumerate(dataset_dict['dataset'].keys())],
+            value="1422_149735_000006_000000",
+            type="value"
         ),
         gr.Textbox(
-
+            label="Text Input",
+            value="Department of Communications Engineering Paderborn University.",
             placeholder='Type something'
         ),
-        gr.Slider(label="Manipulation
+        gr.Slider(label="Manipulation Intensity", minimum=-1.0, maximum=2.0, value=1.0, step=0.1),
     ],
-    outputs=[gr.Audio(label="original utterance"), gr.Audio(label="manipulated utterance")],
+    outputs=[gr.Audio(label="original synthesized utterance"), gr.Audio(label="manipulated synthesized utterance")],
 )
-
 if __name__ == "__main__":
     demo.launch(share=True)
 
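The core of the refactor is the new cached_transcription guard: the unmanipulated reference is resynthesized only when the selected example or the typed text changed since the last call, instead of on every invocation. A simplified, self-contained model of that cache pattern (synthesize() stands in for the expensive tts_model.synthesize_from_example call; the real function also rebuilds the cached d-vector when the example changes):

cached_example = None
cached_transcription = None
cached_unmanipulated = None

def synthesize(text):
    # Stand-in for the expensive TTS call.
    return f"wav({text})"

def update(example, transcription):
    global cached_example, cached_transcription, cached_unmanipulated
    # Skip synthesis when neither the example nor the text has changed.
    if cached_example != example or transcription != cached_transcription:
        cached_unmanipulated = synthesize(transcription)
        cached_transcription = transcription
        cached_example = example
    return cached_unmanipulated

update('a', 'hello')    # synthesizes
update('a', 'hello')    # cache hit, no TTS call
update('a', 'goodbye')  # text changed, synthesizes again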
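The interface changes match the commit message: the slider is renamed to "Manipulation Intensity" with a range of -1.0 to 2.0, and the Speaker dropdown and text input gain explicit labels and default values. A minimal sketch of the resulting wiring, runnable on its own (the first dropdown's choices and the body of update_manipulation are not visible in the diff and are stubbed here):

import gradio as gr

def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
    # Stub: the real function returns (original, manipulated) waveforms.
    return None, None

example_ids = ["1422_149735_000006_000000"]  # stand-in for dataset_dict['dataset'].keys()

demo = gr.Interface(
    fn=update_manipulation,
    inputs=[
        gr.Dropdown(choices=[0, 1, 2], value=2, type="value"),  # manipulation selector; choices assumed
        gr.Dropdown(
            label="Speaker",
            choices=[(str(idx), example_id) for idx, example_id in enumerate(example_ids)],
            value="1422_149735_000006_000000",
            type="value",
        ),
        gr.Textbox(
            label="Text Input",
            value="Department of Communications Engineering Paderborn University.",
            placeholder='Type something',
        ),
        gr.Slider(label="Manipulation Intensity", minimum=-1.0, maximum=2.0, value=1.0, step=0.1),
    ],
    outputs=[gr.Audio(label="original synthesized utterance"), gr.Audio(label="manipulated synthesized utterance")],
)

if __name__ == "__main__":
    demo.launch(share=True)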
pvq_manipulation/models/vits.py CHANGED
@@ -246,11 +246,8 @@ class Vits_NT(Vits):
             y_mask=y_mask
         )
 
-        import time
-        start = time.time()
         if not torch.cuda.is_available():
             num_chunks = min(os.cpu_count() or 2, z.shape[-1])
-            print(num_chunks, 'num chunks')
             chunk_size = z.shape[-1] // num_chunks
             z_chunks = torch.split(z, chunk_size, dim=-1)
 
@@ -271,8 +268,6 @@ class Vits_NT(Vits):
             (z * y_mask)[:, :, : self.max_inference_len],
             g=speaker_embedding_man[:, :, None] if self.config.gan_speaker_conditioning else None
         )
-
-        print(time.time() - start)
         return o
 
     def forward(self, x, x_lengths, y, y_lengths, aux_input, inference=False):
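These deletions strip leftover timing instrumentation (import time, start = time.time(), and two print calls) from the inference path; the logic that remains is the CPU fallback, which splits the latent z along its time axis into roughly one chunk per core and decodes the chunks piecewise. A self-contained sketch of that splitting strategy (decode() stands in for self.waveform_decoder; shapes are illustrative):

import os
import torch

def decode(chunk):
    # Stand-in for the VITS waveform decoder applied to one time slice.
    return chunk * 2.0

z = torch.randn(1, 192, 400)  # (batch, channels, frames)
num_chunks = min(os.cpu_count() or 2, z.shape[-1])
chunk_size = z.shape[-1] // num_chunks
# torch.split yields ceil(frames / chunk_size) pieces, so a remainder in the
# frame count produces one extra, shorter chunk.
z_chunks = torch.split(z, chunk_size, dim=-1)
o = torch.cat([decode(c) for c in z_chunks], dim=-1)
assert o.shape == z.shape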