Commit d161181
Parent(s): 7efb86f

Rename Sliders and refactor text input

Files changed:
- Dataset/dataset.yaml +6 -6
- app.py +19 -9
- pvq_manipulation/models/vits.py +0 -5
Dataset/dataset.yaml CHANGED
@@ -1,4 +1,10 @@
 dataset:
+  '1422_149735_000006_000000':
+    speaker_id: '1422'
+    example_id: '1422_149735_000006_000000'
+  '1034_121119_000028_000001':
+    speaker_id: '1034'
+    'example_id': '1034_121119_000028_000001'
   '7190_90542_000054_000000':
     speaker_id: '7190'
     example_id: '7190_90542_000054_000000'
@@ -29,9 +35,6 @@ dataset:
   '8758_296465_000020_000000':
     speaker_id: '8758'
     example_id: '8758_296465_000020_000000'
-  '1034_121119_000028_000001':
-    speaker_id: '1034'
-    'example_id': '1034_121119_000028_000001'
   '4957_30119_000070_000001':
     speaker_id: '4957'
     example_id: '4957_30119_000070_000001'
@@ -56,9 +59,6 @@ dataset:
   '5012_80192_000020_000003':
     speaker_id: '5012'
     example_id: '5012_80192_000020_000003'
-  '1422_149735_000006_000000':
-    speaker_id: '1422'
-    example_id: '1422_149735_000006_000000'
   '14_212_000019_000000':
     speaker_id: '14'
     example_id: '14_212_000019_000000'
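This change reorders the registry rather than adding or removing data: the '1422' and '1034' entries move to the top of the file. Note that the '1034' entry quotes its 'example_id' key; YAML treats a quoted and an unquoted scalar key identically, so only the ordering is significant. A minimal sketch of how app.py appears to consume this file (the loading code is outside this diff; yaml.safe_load is an assumption):

import yaml

# Load the example registry; app.py refers to the parsed result as dataset_dict.
with open('Dataset/dataset.yaml') as f:
    dataset_dict = yaml.safe_load(f)

# Key order is preserved by the parser, so the entries moved to the top of
# the file now appear first when building dropdown choices.
example_ids = list(dataset_dict['dataset'].keys())
speaker_id = dataset_dict['dataset'][example_ids[0]]['speaker_id']  # '1422'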
app.py CHANGED
@@ -25,6 +25,7 @@ cached_loaded_example = None
 cached_labels = None
 cached_d_vector = None
 cached_unmanipulated = None
+cached_transcription = None
 
 # path to stats
 stats_path = Path('./Dataset/Embeddings/')
@@ -48,7 +49,7 @@ hubert_model = HubertExtractor(
     layer=SID_LARGE_LAYER,
     model_name="HUBERT_LARGE",
     backend="torchaudio",
-    device=device,
+    device=device,
     # storage_dir= # target storage dir hubert model
 )
 
@@ -166,8 +167,7 @@ def delete_cache():
 
 
 def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
-    global cached_example_id, cached_loaded_example, cached_labels, cached_d_vector, example_database, cached_unmanipulated
-
+    global cached_example_id, cached_loaded_example, cached_labels, cached_d_vector, example_database, cached_unmanipulated, cached_transcription
     speaker_id = dataset_dict['dataset'][example_id]['speaker_id']
 
     example = {
@@ -189,6 +189,14 @@ def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
             'text': transcription,
             'd_vector': cached_d_vector.detach().numpy(),
         })
+        cached_transcription = transcription
+    if cached_loaded_example != example or transcription != cached_transcription:
+        with torch.no_grad():
+            cached_unmanipulated = tts_model.synthesize_from_example({
+                'text': transcription,
+                'd_vector': cached_d_vector.detach().numpy(),
+            })
+        cached_transcription = transcription
 
     with torch.no_grad():
         wav_manipulated = get_manipulation(
@@ -214,18 +222,20 @@ demo = gr.Interface(
             value=2, type="value"
         ),
         gr.Dropdown(
-
-
+            label="Speaker",
+            choices=[(str(idx), example_id) for idx, example_id in enumerate(dataset_dict['dataset'].keys())],
+            value="1422_149735_000006_000000",
+            type="value"
         ),
         gr.Textbox(
-
+            label="Text Input",
+            value="Department of Communications Engineering Paderborn University.",
             placeholder='Type something'
         ),
-        gr.Slider(label="Manipulation
+        gr.Slider(label="Manipulation Intensity", minimum=-1.0, maximum=2.0, value=1.0, step=0.1),
     ],
-    outputs=[gr.Audio(label="original utterance"), gr.Audio(label="manipulated utterance")],
+    outputs=[gr.Audio(label="original synthesized utterance"), gr.Audio(label="manipulated synthesized utterance")],
 )
-
 if __name__ == "__main__":
     demo.launch(share=True)
 
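The core of the refactor is the new cached_transcription guard: the unmanipulated reference is resynthesized only when the selected example or the typed text changed since the last call, instead of on every invocation. A simplified, self-contained model of that cache pattern (synthesize() stands in for the expensive tts_model.synthesize_from_example call; the real function also rebuilds the cached d-vector when the example changes):

cached_example = None
cached_transcription = None
cached_unmanipulated = None

def synthesize(text):
    # Stand-in for the expensive TTS call.
    return f"wav({text})"

def update(example, transcription):
    global cached_example, cached_transcription, cached_unmanipulated
    # Skip synthesis when neither the example nor the text has changed.
    if cached_example != example or transcription != cached_transcription:
        cached_unmanipulated = synthesize(transcription)
        cached_transcription = transcription
        cached_example = example
    return cached_unmanipulated

update('a', 'hello')    # synthesizes
update('a', 'hello')    # cache hit, no TTS call
update('a', 'goodbye')  # text changed, synthesizes again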
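The interface changes match the commit message: the slider is renamed to "Manipulation Intensity" with a range of -1.0 to 2.0, and the Speaker dropdown and text input gain explicit labels and default values. A minimal sketch of the resulting wiring, runnable on its own (the first dropdown's choices and the body of update_manipulation are not visible in the diff and are stubbed here):

import gradio as gr

def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
    # Stub: the real function returns (original, manipulated) waveforms.
    return None, None

example_ids = ["1422_149735_000006_000000"]  # stand-in for dataset_dict['dataset'].keys()

demo = gr.Interface(
    fn=update_manipulation,
    inputs=[
        gr.Dropdown(choices=[0, 1, 2], value=2, type="value"),  # manipulation selector; choices assumed
        gr.Dropdown(
            label="Speaker",
            choices=[(str(idx), example_id) for idx, example_id in enumerate(example_ids)],
            value="1422_149735_000006_000000",
            type="value",
        ),
        gr.Textbox(
            label="Text Input",
            value="Department of Communications Engineering Paderborn University.",
            placeholder='Type something',
        ),
        gr.Slider(label="Manipulation Intensity", minimum=-1.0, maximum=2.0, value=1.0, step=0.1),
    ],
    outputs=[gr.Audio(label="original synthesized utterance"), gr.Audio(label="manipulated synthesized utterance")],
)

if __name__ == "__main__":
    demo.launch(share=True)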
pvq_manipulation/models/vits.py CHANGED
@@ -246,11 +246,8 @@ class Vits_NT(Vits):
             y_mask=y_mask
         )
 
-        import time
-        start = time.time()
         if not torch.cuda.is_available():
             num_chunks = min(os.cpu_count() or 2, z.shape[-1])
-            print(num_chunks, 'num chunks')
             chunk_size = z.shape[-1] // num_chunks
             z_chunks = torch.split(z, chunk_size, dim=-1)
 
@@ -271,8 +268,6 @@ class Vits_NT(Vits):
             (z * y_mask)[:, :, : self.max_inference_len],
             g=speaker_embedding_man[:, :, None] if self.config.gan_speaker_conditioning else None
         )
-
-        print(time.time() - start)
         return o
 
     def forward(self, x, x_lengths, y, y_lengths, aux_input, inference=False):
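These deletions strip leftover timing instrumentation (import time, start = time.time(), and two print calls) from the inference path; the logic that remains is the CPU fallback, which splits the latent z along its time axis into roughly one chunk per core and decodes the chunks piecewise. A self-contained sketch of that splitting strategy (decode() stands in for self.waveform_decoder; shapes are illustrative):

import os
import torch

def decode(chunk):
    # Stand-in for the VITS waveform decoder applied to one time slice.
    return chunk * 2.0

z = torch.randn(1, 192, 400)  # (batch, channels, frames)
num_chunks = min(os.cpu_count() or 2, z.shape[-1])
chunk_size = z.shape[-1] // num_chunks
# torch.split yields ceil(frames / chunk_size) pieces, so a remainder in the
# frame count produces one extra, shorter chunk.
z_chunks = torch.split(z, chunk_size, dim=-1)
o = torch.cat([decode(c) for c in z_chunks], dim=-1)
assert o.shape == z.shape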