FrederikRautenberg commited on
Commit
4732065
·
1 Parent(s): 80d202c

Add creak manipulation

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. Dataset/Audio_files/1034_121119_000028_000001.wav +3 -0
  3. Dataset/Audio_files/1088_129236_000006_000007.wav +3 -0
  4. Dataset/Audio_files/1422_149735_000006_000000.wav +3 -0
  5. Dataset/Audio_files/14_212_000019_000000.wav +3 -0
  6. Dataset/Audio_files/1535_141644_000004_000001.wav +3 -0
  7. Dataset/Audio_files/1731_142320_000122_000005.wav +3 -0
  8. Dataset/Audio_files/3009_10327_000027_000005.wav +3 -0
  9. Dataset/Audio_files/329_861_000024_000003.wav +3 -0
  10. Dataset/Audio_files/4830_25904_000008_000001.wav +3 -0
  11. Dataset/Audio_files/4957_30119_000070_000001.wav +3 -0
  12. Dataset/Audio_files/5012_80192_000020_000003.wav +3 -0
  13. Dataset/Audio_files/5802_76044_000038_000000.wav +3 -0
  14. Dataset/Audio_files/6544_71420_000024_000001.wav +3 -0
  15. Dataset/Audio_files/6918_47541_000006_000008.wav +3 -0
  16. Dataset/Audio_files/7011_66622_000032_000002.wav +3 -0
  17. Dataset/Audio_files/7059_77897_000017_000001.wav +3 -0
  18. Dataset/Audio_files/7190_90542_000054_000000.wav +3 -0
  19. Dataset/Audio_files/7226_86965_000020_000001.wav +3 -0
  20. Dataset/Audio_files/7245_104888_000016_000000.wav +3 -0
  21. Dataset/Audio_files/83_9960_000017_000003.wav +3 -0
  22. Dataset/Audio_files/8758_296465_000020_000000.wav +3 -0
  23. Dataset/Audio_files/8820_294120_000011_000001.wav +3 -0
  24. Dataset/Embeddings/1034/1034_121119_000028_000001.pth +3 -0
  25. Dataset/Embeddings/1088/1088_129236_000006_000007.pth +3 -0
  26. Dataset/Embeddings/14/14_212_000019_000000.pth +3 -0
  27. Dataset/Embeddings/1422/1422_149735_000006_000000.pth +3 -0
  28. Dataset/Embeddings/1535/1535_141644_000004_000001.pth +3 -0
  29. Dataset/Embeddings/1731/1731_142320_000122_000005.pth +3 -0
  30. Dataset/Embeddings/3009/3009_10327_000027_000005.pth +3 -0
  31. Dataset/Embeddings/329/329_861_000024_000003.pth +3 -0
  32. Dataset/Embeddings/4830/4830_25904_000008_000001.pth +3 -0
  33. Dataset/Embeddings/4957/4957_30119_000070_000001.pth +3 -0
  34. Dataset/Embeddings/5012/5012_80192_000020_000003.pth +3 -0
  35. Dataset/Embeddings/5802/5802_76044_000038_000000.pth +3 -0
  36. Dataset/Embeddings/6544/6544_71420_000024_000001.pth +3 -0
  37. Dataset/Embeddings/6918/6918_47541_000006_000008.pth +3 -0
  38. Dataset/Embeddings/7011/7011_66622_000032_000002.pth +3 -0
  39. Dataset/Embeddings/7059/7059_77897_000017_000001.pth +3 -0
  40. Dataset/Embeddings/7190/7190_90542_000054_000000.pth +3 -0
  41. Dataset/Embeddings/7226/7226_86965_000020_000001.pth +3 -0
  42. Dataset/Embeddings/7245/7245_104888_000016_000000.pth +3 -0
  43. Dataset/Embeddings/83/83_9960_000017_000003.pth +3 -0
  44. Dataset/Embeddings/8758/8758_296465_000020_000000.pth +3 -0
  45. Dataset/Embeddings/8820/8820_294120_000011_000001.pth +3 -0
  46. Dataset/Embeddings/mean.json +258 -0
  47. Dataset/Embeddings/std.json +258 -0
  48. Dataset/dataset.yaml +67 -0
  49. app.py +123 -98
  50. models/norm_flow/config.json +14 -11
.gitattributes CHANGED
@@ -44,3 +44,5 @@ models/pvq_extractor/Resonance.onnx filter=lfs diff=lfs merge=lfs -text
44
  models/pvq_extractor/Weight.onnx filter=lfs diff=lfs merge=lfs -text
45
  models/norm_flow/model.pt filter=lfs diff=lfs merge=lfs -text
46
  audio/1034_121119_000028_000001.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
44
  models/pvq_extractor/Weight.onnx filter=lfs diff=lfs merge=lfs -text
45
  models/norm_flow/model.pt filter=lfs diff=lfs merge=lfs -text
46
  audio/1034_121119_000028_000001.wav filter=lfs diff=lfs merge=lfs -text
47
+ Dataset/Audio_files/*.wav filter=lfs diff=lfs merge=lfs -text
48
+ Dataset/Embeddings/**/*.pth filter=lfs diff=lfs merge=lfs -text
Dataset/Audio_files/1034_121119_000028_000001.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc00c4e893ccf708cae4366e36ede93b4e158f516323a0724fc6e9f956c76aff
3
+ size 385964
Dataset/Audio_files/1088_129236_000006_000007.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27b2e7191ba1cfad41bc1ab1bd09ec1af87062e48abbab1ef01809c76ed738da
3
+ size 311084
Dataset/Audio_files/1422_149735_000006_000000.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f59d7f7a3c7364d7ac254bd94d3384e9b8e173634eb8b7492ec751d8584f8bb5
3
+ size 345644
Dataset/Audio_files/14_212_000019_000000.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1271f49cf4855d1b7d9b87e99a0c79e5505acbfba94cd8f594c1df2a29d96027
3
+ size 633652
Dataset/Audio_files/1535_141644_000004_000001.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cc6f0bc3b9ebecbc1dab5a430c37140337a6bbeaf6f75103d74b2b4e75b4f06
3
+ size 295724
Dataset/Audio_files/1731_142320_000122_000005.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3d3e88bc53ee1cad73100ea4ad6ccc6d9bcbc36145962d400122b658e27b7e8
3
+ size 316844
Dataset/Audio_files/3009_10327_000027_000005.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbe46722ef2d331a5bc1c552cd6ad3c8a69022a3c70b1c03b609856dc073ca32
3
+ size 309164
Dataset/Audio_files/329_861_000024_000003.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7dab66421315b9f22bbbaf909e69184c01eaba29e536c2b449c8a7310f2edce7
3
+ size 261164
Dataset/Audio_files/4830_25904_000008_000001.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd41aaf86c9d6e394d9afcca5e3128aa6a52fd2948e3bcf6aa03e5c18f2c7eec
3
+ size 483884
Dataset/Audio_files/4957_30119_000070_000001.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f483b0a7003610ba8451db035f3347b156bb348c7aa356b7403f8ca86b98ab28
3
+ size 503084
Dataset/Audio_files/5012_80192_000020_000003.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91aa9243a6697d65e6f5464b40e9b420b5e5cdef83b64a5556baef1ac548f11e
3
+ size 409004
Dataset/Audio_files/5802_76044_000038_000000.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22964325ee8f751dddd136b3219191443270529d95ee27b45c4a789501286492
3
+ size 460844
Dataset/Audio_files/6544_71420_000024_000001.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1289b6714acb263b8bb36d6acfbb4efded0a5c67cc9b6a6246340dd3493c6c2b
3
+ size 209324
Dataset/Audio_files/6918_47541_000006_000008.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dcd7955f5469755038482a58f0929012526f98130513acd9d0cd1bc208bbfe8
3
+ size 898612
Dataset/Audio_files/7011_66622_000032_000002.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:111c88a591efa42a608d1609214e6ef56a64f3bd79a88b57efecba2ca2f7ed4c
3
+ size 309164
Dataset/Audio_files/7059_77897_000017_000001.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17f51a3d2133e81607e36403b6bcb8bd7ec9e03c1bcfbbc80b4123c1b31d6618
3
+ size 243884
Dataset/Audio_files/7190_90542_000054_000000.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34bdcfbdf51f3475465804b9dbf27f8e647ccc1af17573b0a923f44881217093
3
+ size 222764
Dataset/Audio_files/7226_86965_000020_000001.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b824a36ceaeec45724088957b1e543ee3b477ca1ee55e4c55e96ac8c2b018fb5
3
+ size 622132
Dataset/Audio_files/7245_104888_000016_000000.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92d74fe5965fff3182cb1f273c80ea051033c7aa0dbbb44ae48ccded15210216
3
+ size 341804
Dataset/Audio_files/83_9960_000017_000003.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a1cdc21d779c1d108af86ec6a93558a501322a67c221c25e2dd32d93e0c356a
3
+ size 192044
Dataset/Audio_files/8758_296465_000020_000000.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8adafe1ab7b3e86c82454c06863dd616c5b52f91ebb8690fcc64ec7abb2821dc
3
+ size 520364
Dataset/Audio_files/8820_294120_000011_000001.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b78d6b2aed1a99e8f3750bd54c50e8ed2e08dba114792fa604101faf27894708
3
+ size 213164
Dataset/Embeddings/1034/1034_121119_000028_000001.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4a7bdd020bf0da6fb08d272448c8b61c6f065e529084ce1cf9c39c1636e017c
3
+ size 2358
Dataset/Embeddings/1088/1088_129236_000006_000007.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1544023ea8afb9b0c71fa31e1e16d2ec510cf9d8637a64648941448c9e5e18ae
3
+ size 2358
Dataset/Embeddings/14/14_212_000019_000000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f248135ffdacc81ef4b5071f564448d49c2341b5c5c14bf4257af633f9318fd
3
+ size 2269
Dataset/Embeddings/1422/1422_149735_000006_000000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fee0b79857cf8ce499a658dfeb5137d5b4fa7e849dd8118c32028391b88b3d08
3
+ size 2358
Dataset/Embeddings/1535/1535_141644_000004_000001.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d71a10862f81b5a3f0877f6eb26cd4bc733e0cc9868acc65a65bb23ffe304b9
3
+ size 2358
Dataset/Embeddings/1731/1731_142320_000122_000005.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79f2de5cb40487fe19b5099b57fe0a41f0436f554019c619bcb4cd9d6c64bf36
3
+ size 2358
Dataset/Embeddings/3009/3009_10327_000027_000005.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f24187ad06ecbe02df165538c6881192cfd055b5a3cc5ab1348d2c05d6567421
3
+ size 2353
Dataset/Embeddings/329/329_861_000024_000003.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc001f48b3f67d25192967e0a297dc1787144e36222e5b83a71ae6f5b89be9b3
3
+ size 2274
Dataset/Embeddings/4830/4830_25904_000008_000001.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:170bdd680d0735a19c5e88e01fc8bf84dac623d7c73eebeff6e99974b8e9d081
3
+ size 2353
Dataset/Embeddings/4957/4957_30119_000070_000001.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c922316a446bcc28db8a43f768ade2b2113ce0f6fab24b60b396f67264ce07c8
3
+ size 2353
Dataset/Embeddings/5012/5012_80192_000020_000003.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba1c17f5100b1e0147e9c96d864cc054e8840a15cd46307e191fbe88a728b1b0
3
+ size 2353
Dataset/Embeddings/5802/5802_76044_000038_000000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7656515f537fa6de193f40d78c9747cfb1268266d3dd88a22a41ce2c3a28514a
3
+ size 2353
Dataset/Embeddings/6544/6544_71420_000024_000001.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f70c9bd92dea6ddfd495c7ab32cae30494eaf3b42f6d6533ff9f55de80593f05
3
+ size 2353
Dataset/Embeddings/6918/6918_47541_000006_000008.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed49a76c226606f98ce4c2db2aac937354e40cc8fb789e29e93aa87f64bc01d1
3
+ size 2353
Dataset/Embeddings/7011/7011_66622_000032_000002.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43e63641af7d4322b89489acb9c10cfc7e71961bd6479c55c17135b3ecfa5605
3
+ size 2353
Dataset/Embeddings/7059/7059_77897_000017_000001.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:688e50692694cccbe5f61c8780e0980509118f4061a44180ec8dffff2d963921
3
+ size 2353
Dataset/Embeddings/7190/7190_90542_000054_000000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f1925fcb8ce5ffa8b9223de17ea8d98c0abb24409852208f03c607374c9f60a
3
+ size 2353
Dataset/Embeddings/7226/7226_86965_000020_000001.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f88a82eee39139ab65f3b201f2657b795ad66d70ccd637f903d537df2acaca0
3
+ size 2353
Dataset/Embeddings/7245/7245_104888_000016_000000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db50270eb45aab4344720a1da44d3c9d91ace10e69514287b3174ba9c2ca208a
3
+ size 2358
Dataset/Embeddings/83/83_9960_000017_000003.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:958832b7e4e77f6eb8343b91091c8603b683b25c03f242e6de4b09952a0fba6d
3
+ size 2274
Dataset/Embeddings/8758/8758_296465_000020_000000.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:465be53ae1d0a44ccdb90e0fcaccf09a0ae91041f984ef18f606df0169ea8f3e
3
+ size 2358
Dataset/Embeddings/8820/8820_294120_000011_000001.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b79ee0d4796df0776bc0ddfc8683f2a025c4829893f28b3cff6b4a2d5405d968
3
+ size 2358
Dataset/Embeddings/mean.json ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ 0.21412190794944763,
3
+ 0.18206638097763062,
4
+ 0.11840786784887314,
5
+ 0.09126990288496017,
6
+ 0.04086871072649956,
7
+ -0.149668350815773,
8
+ 0.2645065188407898,
9
+ 0.27953410148620605,
10
+ 0.6700411438941956,
11
+ -0.06264923512935638,
12
+ 0.2915269732475281,
13
+ 0.12102372199296951,
14
+ -0.5578641891479492,
15
+ -0.12462181597948074,
16
+ 0.6190101504325867,
17
+ -0.5761605501174927,
18
+ -0.084229975938797,
19
+ -0.0006869725184515119,
20
+ 0.49899742007255554,
21
+ -0.21737882494926453,
22
+ -0.05707789212465286,
23
+ -0.18819154798984528,
24
+ -0.5531325340270996,
25
+ 0.22641371190547943,
26
+ 0.07952054589986801,
27
+ 0.09851367026567459,
28
+ 0.03574512526392937,
29
+ -0.13013364374637604,
30
+ -0.35363155603408813,
31
+ 0.49086689949035645,
32
+ 0.08895495533943176,
33
+ 0.36905843019485474,
34
+ -0.10707297921180725,
35
+ -0.11953406780958176,
36
+ 0.043051160871982574,
37
+ 0.09323996305465698,
38
+ -0.16280269622802734,
39
+ -0.13945965468883514,
40
+ 0.2095673531293869,
41
+ 0.09729334712028503,
42
+ 0.040950167924165726,
43
+ -0.37764972448349,
44
+ -0.018613651394844055,
45
+ -0.581308901309967,
46
+ -0.4080854058265686,
47
+ -0.42118221521377563,
48
+ 1.0161728858947754,
49
+ -0.19709929823875427,
50
+ -0.024254681542515755,
51
+ 0.04121233895421028,
52
+ -0.15502692759037018,
53
+ 0.7614311575889587,
54
+ -0.6833258271217346,
55
+ 0.33979618549346924,
56
+ 0.49055442214012146,
57
+ 0.011953921988606453,
58
+ 0.4490082263946533,
59
+ 0.2667522728443146,
60
+ -0.6408993005752563,
61
+ -0.17682728171348572,
62
+ 0.12336420267820358,
63
+ 0.1474267542362213,
64
+ -0.11565382778644562,
65
+ 0.6467825174331665,
66
+ 0.10751526057720184,
67
+ -0.14141449332237244,
68
+ 0.6352338194847107,
69
+ -0.04154682531952858,
70
+ 0.12760530412197113,
71
+ -0.6243913769721985,
72
+ 0.08836925774812698,
73
+ 0.28105032444000244,
74
+ -0.15209053456783295,
75
+ -0.0037005548365414143,
76
+ 0.3098902106285095,
77
+ 0.150644913315773,
78
+ 0.07396118342876434,
79
+ -0.049714382737874985,
80
+ -0.5445783138275146,
81
+ -0.033714842051267624,
82
+ 0.1200188472867012,
83
+ -0.2312866747379303,
84
+ 0.20238173007965088,
85
+ -0.5392364263534546,
86
+ -0.40682801604270935,
87
+ -0.16234233975410461,
88
+ -0.6470288634300232,
89
+ -0.1738162636756897,
90
+ 0.25936004519462585,
91
+ -0.15742169320583344,
92
+ 0.24468930065631866,
93
+ 0.13714095950126648,
94
+ 0.1449803113937378,
95
+ 0.16882915794849396,
96
+ 0.19944046437740326,
97
+ -0.29332247376441956,
98
+ 0.0026240404695272446,
99
+ 0.03341501206159592,
100
+ 0.01569036766886711,
101
+ -0.4688950777053833,
102
+ 0.09352052956819534,
103
+ 0.13269393146038055,
104
+ 0.06116529926657677,
105
+ -0.06562789529561996,
106
+ -0.23961076140403748,
107
+ -0.22402845323085785,
108
+ 0.47103151679039,
109
+ 0.0728374496102333,
110
+ -0.561316192150116,
111
+ 0.46127453446388245,
112
+ 0.15431830286979675,
113
+ 0.08550310134887695,
114
+ -0.03363621234893799,
115
+ 0.04015417397022247,
116
+ -0.014262784272432327,
117
+ 0.08499719202518463,
118
+ -0.39322608709335327,
119
+ 0.27674373984336853,
120
+ 0.24571490287780762,
121
+ -0.2642858326435089,
122
+ -0.7408877015113831,
123
+ 0.21007885038852692,
124
+ 0.5898057222366333,
125
+ 0.14988923072814941,
126
+ -0.07782910019159317,
127
+ 0.4078785479068756,
128
+ 0.3004123270511627,
129
+ 0.6256987452507019,
130
+ -0.21651767194271088,
131
+ -0.17712117731571198,
132
+ -0.2749980688095093,
133
+ 0.4826784133911133,
134
+ 0.3035520911216736,
135
+ 0.23235619068145752,
136
+ -0.061135340481996536,
137
+ 0.49035653471946716,
138
+ -0.16356635093688965,
139
+ -0.35920438170433044,
140
+ 0.023298246785998344,
141
+ 0.015880409628152847,
142
+ -0.015357445925474167,
143
+ -0.3540240228176117,
144
+ 0.44811102747917175,
145
+ -0.05202110856771469,
146
+ -0.19488674402236938,
147
+ 0.4875786602497101,
148
+ -0.03857485204935074,
149
+ 0.463600754737854,
150
+ -0.07009128481149673,
151
+ 0.29871219396591187,
152
+ -0.35601672530174255,
153
+ 0.5102726817131042,
154
+ 0.3902379274368286,
155
+ 0.3692609369754791,
156
+ -0.35389819741249084,
157
+ 0.07650414854288101,
158
+ -0.63330078125,
159
+ 0.5580229759216309,
160
+ 0.10672216862440109,
161
+ 0.10609150677919388,
162
+ 0.45468848943710327,
163
+ 0.15291742980480194,
164
+ 0.36706316471099854,
165
+ -0.2831500768661499,
166
+ -0.14291781187057495,
167
+ -0.17804013192653656,
168
+ -0.5424429178237915,
169
+ -0.15468499064445496,
170
+ 0.07343851029872894,
171
+ 0.5380398631095886,
172
+ 0.44494226574897766,
173
+ 0.9300274848937988,
174
+ -0.0274032074958086,
175
+ 0.3488404154777527,
176
+ -0.23694315552711487,
177
+ -0.2424279898405075,
178
+ -0.04125871881842613,
179
+ 0.06136211380362511,
180
+ -0.5118930339813232,
181
+ -0.15055209398269653,
182
+ 0.45361533761024475,
183
+ 0.12657225131988525,
184
+ 0.34210655093193054,
185
+ 0.313772052526474,
186
+ -0.3521589934825897,
187
+ 0.05892332270741463,
188
+ -0.11534406244754791,
189
+ 0.514985203742981,
190
+ 0.054903097450733185,
191
+ 0.18034562468528748,
192
+ 0.26060545444488525,
193
+ -0.29317837953567505,
194
+ 0.1423174887895584,
195
+ 0.25360995531082153,
196
+ -0.47162681818008423,
197
+ 0.5438259243965149,
198
+ 0.02562086470425129,
199
+ 0.020302919670939445,
200
+ 0.3039097189903259,
201
+ 0.19996808469295502,
202
+ 0.3423006236553192,
203
+ 0.4524010717868805,
204
+ -0.3152591586112976,
205
+ -0.60369873046875,
206
+ 0.16421166062355042,
207
+ -0.055804263800382614,
208
+ -0.35883089900016785,
209
+ 0.32918551564216614,
210
+ -0.4741072952747345,
211
+ 0.05971089377999306,
212
+ -0.062083590775728226,
213
+ 0.05729498714208603,
214
+ -0.6715519428253174,
215
+ 0.2646842896938324,
216
+ 0.14343565702438354,
217
+ 0.2957288324832916,
218
+ 0.37478363513946533,
219
+ -0.684753954410553,
220
+ -0.14382798969745636,
221
+ -0.3416562080383301,
222
+ 0.6120049953460693,
223
+ 0.24825794994831085,
224
+ 0.049689218401908875,
225
+ 0.08789665251970291,
226
+ -0.518900454044342,
227
+ -0.2226269692182541,
228
+ 0.17690403759479523,
229
+ 0.011226996779441833,
230
+ 0.05879935249686241,
231
+ 0.03022083267569542,
232
+ 0.11887083947658539,
233
+ 0.7854664325714111,
234
+ -0.2452417016029358,
235
+ 0.6136188507080078,
236
+ 0.5491909384727478,
237
+ -0.07412725687026978,
238
+ -0.3089025616645813,
239
+ 0.16618099808692932,
240
+ -0.03215228021144867,
241
+ 0.13637210428714752,
242
+ 0.10921650379896164,
243
+ -0.14989499747753143,
244
+ 0.6000584959983826,
245
+ 0.19014132022857666,
246
+ -0.007800411432981491,
247
+ -0.06849341839551926,
248
+ -0.19043166935443878,
249
+ -0.012874589301645756,
250
+ -0.8398106694221497,
251
+ -0.002614892553538084,
252
+ -0.26642924547195435,
253
+ 0.25869783759117126,
254
+ -0.46403658390045166,
255
+ 0.18120701611042023,
256
+ 0.08567068725824356,
257
+ 0.08117248862981796
258
+ ]
Dataset/Embeddings/std.json ADDED
@@ -0,0 +1,258 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ 0.8075656890869141,
3
+ 0.8826062679290771,
4
+ 0.8430591821670532,
5
+ 0.8703321814537048,
6
+ 0.877600371837616,
7
+ 0.8111068606376648,
8
+ 0.8719013929367065,
9
+ 0.9000007510185242,
10
+ 0.9740477800369263,
11
+ 0.8267052173614502,
12
+ 0.8011612296104431,
13
+ 0.9747788906097412,
14
+ 0.8026949763298035,
15
+ 0.8818342089653015,
16
+ 0.8605656623840332,
17
+ 0.8279756903648376,
18
+ 0.772606611251831,
19
+ 0.8957112431526184,
20
+ 0.8716765642166138,
21
+ 0.7797929644584656,
22
+ 0.8252673149108887,
23
+ 0.781441330909729,
24
+ 0.8043056130409241,
25
+ 0.877123236656189,
26
+ 0.9237406849861145,
27
+ 0.7914682030677795,
28
+ 0.9089431166648865,
29
+ 0.8154596090316772,
30
+ 0.8381725549697876,
31
+ 0.8573335409164429,
32
+ 0.7951206564903259,
33
+ 0.8356125354766846,
34
+ 0.8639358282089233,
35
+ 0.8588302135467529,
36
+ 0.8966045379638672,
37
+ 0.836276113986969,
38
+ 0.8558772206306458,
39
+ 0.8904256820678711,
40
+ 0.8009889721870422,
41
+ 0.9030625820159912,
42
+ 0.8489034175872803,
43
+ 0.7720499038696289,
44
+ 0.780423641204834,
45
+ 0.7854387760162354,
46
+ 0.8878417611122131,
47
+ 0.8503796458244324,
48
+ 0.8932433128356934,
49
+ 0.9315906763076782,
50
+ 0.8437496423721313,
51
+ 0.8389645218849182,
52
+ 0.8701387643814087,
53
+ 0.9080750942230225,
54
+ 1.0714792013168335,
55
+ 0.8976108431816101,
56
+ 0.8437362909317017,
57
+ 0.8633260726928711,
58
+ 0.8580045700073242,
59
+ 0.8063361644744873,
60
+ 0.8105617761611938,
61
+ 0.8995920419692993,
62
+ 0.8316185474395752,
63
+ 0.9079830050468445,
64
+ 0.8115889430046082,
65
+ 0.8792805671691895,
66
+ 0.8858475685119629,
67
+ 0.7682526111602783,
68
+ 0.8312106728553772,
69
+ 0.8296751379966736,
70
+ 0.9122119545936584,
71
+ 0.9119444489479065,
72
+ 0.8761489391326904,
73
+ 0.8376705646514893,
74
+ 0.9226043820381165,
75
+ 0.8830709457397461,
76
+ 0.819685161113739,
77
+ 0.9397792816162109,
78
+ 0.833674967288971,
79
+ 0.8619604110717773,
80
+ 0.8484258651733398,
81
+ 0.943915605545044,
82
+ 0.8020740151405334,
83
+ 0.8027610182762146,
84
+ 0.9116966724395752,
85
+ 0.8570717573165894,
86
+ 0.7944185733795166,
87
+ 0.8977150917053223,
88
+ 0.9434093236923218,
89
+ 0.9964787364006042,
90
+ 0.8149264454841614,
91
+ 0.8179062604904175,
92
+ 0.832256555557251,
93
+ 0.866649329662323,
94
+ 0.8442603349685669,
95
+ 0.9397143125534058,
96
+ 0.8501031398773193,
97
+ 0.9365203380584717,
98
+ 0.8380716443061829,
99
+ 0.8887302279472351,
100
+ 0.8084500432014465,
101
+ 0.7769243121147156,
102
+ 0.8449881076812744,
103
+ 0.9015783667564392,
104
+ 0.9295680522918701,
105
+ 0.8259174227714539,
106
+ 0.8573725819587708,
107
+ 0.8600193858146667,
108
+ 0.8780449032783508,
109
+ 0.8595342040061951,
110
+ 0.7720226049423218,
111
+ 0.816754937171936,
112
+ 0.8180097937583923,
113
+ 0.8093970417976379,
114
+ 0.9032255411148071,
115
+ 0.8697183728218079,
116
+ 0.888511061668396,
117
+ 0.7960647940635681,
118
+ 0.8589795827865601,
119
+ 0.8813145160675049,
120
+ 0.8638142347335815,
121
+ 0.9093354344367981,
122
+ 0.8201130628585815,
123
+ 0.8607465028762817,
124
+ 0.9925655722618103,
125
+ 0.9680612683296204,
126
+ 0.8303309679031372,
127
+ 0.8515812158584595,
128
+ 0.8854086399078369,
129
+ 0.8599415421485901,
130
+ 0.8196620941162109,
131
+ 0.9137897491455078,
132
+ 0.8218133449554443,
133
+ 0.8703830242156982,
134
+ 0.845089852809906,
135
+ 0.8652607202529907,
136
+ 0.877587080001831,
137
+ 0.834847629070282,
138
+ 0.7999405860900879,
139
+ 0.867475152015686,
140
+ 0.9779040217399597,
141
+ 0.8888542652130127,
142
+ 0.8318555951118469,
143
+ 0.8721846342086792,
144
+ 0.8582359552383423,
145
+ 0.8781721591949463,
146
+ 0.7750568389892578,
147
+ 0.9456684589385986,
148
+ 0.8390375971794128,
149
+ 0.8528217077255249,
150
+ 0.9676473736763,
151
+ 0.9669485092163086,
152
+ 0.8177183866500854,
153
+ 0.8109471201896667,
154
+ 0.8565740585327148,
155
+ 1.012668490409851,
156
+ 0.8075276017189026,
157
+ 0.8120420575141907,
158
+ 0.8192445039749146,
159
+ 0.9088258743286133,
160
+ 0.806582510471344,
161
+ 0.8778362274169922,
162
+ 0.9832965135574341,
163
+ 0.8517345190048218,
164
+ 0.8954508900642395,
165
+ 0.8626090288162231,
166
+ 0.8306634426116943,
167
+ 0.7902420163154602,
168
+ 0.8680355548858643,
169
+ 0.8405691385269165,
170
+ 0.8080191612243652,
171
+ 0.8716298937797546,
172
+ 0.8520878553390503,
173
+ 0.8133600354194641,
174
+ 0.9267045855522156,
175
+ 0.8689888715744019,
176
+ 0.8166713118553162,
177
+ 0.8387840390205383,
178
+ 0.835797131061554,
179
+ 0.8922353386878967,
180
+ 0.8736470937728882,
181
+ 0.9051007032394409,
182
+ 0.8347994685173035,
183
+ 0.8269197344779968,
184
+ 0.7968848943710327,
185
+ 0.8677981495857239,
186
+ 0.8539698719978333,
187
+ 0.9122839570045471,
188
+ 0.907562255859375,
189
+ 0.908149242401123,
190
+ 0.8897758722305298,
191
+ 0.8776298761367798,
192
+ 0.8702916502952576,
193
+ 0.7712435722351074,
194
+ 0.8737289905548096,
195
+ 1.003007411956787,
196
+ 0.9195813536643982,
197
+ 0.9373644590377808,
198
+ 0.8549340963363647,
199
+ 0.8885018229484558,
200
+ 0.8555989265441895,
201
+ 0.8315033316612244,
202
+ 0.8457157611846924,
203
+ 0.8452540636062622,
204
+ 0.9597710967063904,
205
+ 0.8279005885124207,
206
+ 0.9954813122749329,
207
+ 0.8817158937454224,
208
+ 0.8564739227294922,
209
+ 0.8737724423408508,
210
+ 0.8833761215209961,
211
+ 0.9069574475288391,
212
+ 0.8549059629440308,
213
+ 0.8478658199310303,
214
+ 0.8306840062141418,
215
+ 0.8308926820755005,
216
+ 0.8582388162612915,
217
+ 0.7912089228630066,
218
+ 0.843919038772583,
219
+ 0.8585576415061951,
220
+ 0.850679337978363,
221
+ 0.921983003616333,
222
+ 0.8164607882499695,
223
+ 0.8369028568267822,
224
+ 0.7947129607200623,
225
+ 0.8371235132217407,
226
+ 0.8269281387329102,
227
+ 0.8633431196212769,
228
+ 0.9147580862045288,
229
+ 0.9019842743873596,
230
+ 0.8293289542198181,
231
+ 0.8421900868415833,
232
+ 0.8144598603248596,
233
+ 0.9013247489929199,
234
+ 0.7653704285621643,
235
+ 0.8295224905014038,
236
+ 0.9549149870872498,
237
+ 0.8671613931655884,
238
+ 0.8507492542266846,
239
+ 0.8559182286262512,
240
+ 0.839141309261322,
241
+ 0.918213427066803,
242
+ 0.9064037203788757,
243
+ 0.8579128980636597,
244
+ 0.8337833881378174,
245
+ 0.9374175071716309,
246
+ 0.9142330884933472,
247
+ 0.7878691554069519,
248
+ 0.8651018142700195,
249
+ 0.8595719933509827,
250
+ 0.8955603837966919,
251
+ 0.9085484743118286,
252
+ 0.8001472353935242,
253
+ 0.7812052369117737,
254
+ 0.8475046157836914,
255
+ 0.8226194381713867,
256
+ 0.8940064311027527,
257
+ 0.9277697801589966
258
+ ]
Dataset/dataset.yaml ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset:
2
+ '7190_90542_000054_000000':
3
+ speaker_id: '7190'
4
+ example_id: '7190_90542_000054_000000'
5
+ '4830_25904_000008_000001':
6
+ speaker_id: '4830'
7
+ example_id: '4830_25904_000008_000001'
8
+ '8820_294120_000011_000001':
9
+ speaker_id: '8820'
10
+ example_id: '8820_294120_000011_000001'
11
+ '3009_10327_000027_000005':
12
+ speaker_id: '3009'
13
+ example_id: '3009_10327_000027_000005'
14
+ '7226_86965_000020_000001':
15
+ speaker_id: '7226'
16
+ example_id: '7226_86965_000020_000001'
17
+ '329_861_000024_000003':
18
+ speaker_id: '329'
19
+ example_id: '329_861_000024_000003'
20
+ '5802_76044_000038_000000':
21
+ speaker_id: '5802'
22
+ example_id: '5802_76044_000038_000000'
23
+ '1535_141644_000004_000001':
24
+ speaker_id: '1535'
25
+ example_id: '1535_141644_000004_000001'
26
+ '7011_66622_000032_000002':
27
+ speaker_id: '7011'
28
+ example_id: '7011_66622_000032_000002'
29
+ '8758_296465_000020_000000':
30
+ speaker_id: '8758'
31
+ example_id: '8758_296465_000020_000000'
32
+ '1034_121119_000028_000001':
33
+ speaker_id: '1034'
34
+ 'example_id': '1034_121119_000028_000001'
35
+ '4957_30119_000070_000001':
36
+ speaker_id: '4957'
37
+ example_id: '4957_30119_000070_000001'
38
+ '83_9960_000017_000003':
39
+ speaker_id: '83'
40
+ example_id: '83_9960_000017_000003'
41
+ '7059_77897_000017_000001':
42
+ speaker_id: '7059'
43
+ example_id: '7059_77897_000017_000001'
44
+ '1731_142320_000122_000005':
45
+ speaker_id: '1731'
46
+ example_id: '1731_142320_000122_000005'
47
+ '6918_47541_000006_000008':
48
+ speaker_id: '6918'
49
+ example_id: '6918_47541_000006_000008'
50
+ '6544_71420_000024_000001':
51
+ speaker_id: '6544'
52
+ example_id: '6544_71420_000024_000001'
53
+ '7245_104888_000016_000000':
54
+ speaker_id: '7245'
55
+ example_id: '7245_104888_000016_000000'
56
+ '5012_80192_000020_000003':
57
+ speaker_id: '5012'
58
+ example_id: '5012_80192_000020_000003'
59
+ '1422_149735_000006_000000':
60
+ speaker_id: '1422'
61
+ example_id: '1422_149735_000006_000000'
62
+ '14_212_000019_000000':
63
+ speaker_id: '14'
64
+ example_id: '14_212_000019_000000'
65
+ '1088_129236_000006_000007':
66
+ speaker_id: '1088'
67
+ example_id: '1088_129236_000006_000007'
app.py CHANGED
@@ -1,31 +1,39 @@
1
  import numpy as np
2
  from pathlib import Path
3
- import padertorch as pt
4
  import paderbox as pb
5
- import time
6
  import torch
7
- import torchaudio
8
  from onnxruntime import InferenceSession
9
  from pvq_manipulation.models.vits import Vits_NT
10
  from pvq_manipulation.models.ffjord import FFJORD
11
- from IPython.display import display, Audio, clear_output
12
  from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER
13
  import librosa
14
  from pvq_manipulation.helper.vad import EnergyVAD
15
  import gradio as gr
 
16
 
17
- device = 'cpu' #'cuda' if torch.cuda.is_available() else 'cpu'
 
18
 
19
- # load tts model
20
- storage_dir_tts = Path("./models/tts_model/")
21
- tts_model = Vits_NT.load_model(storage_dir_tts, "model.pt")
 
 
 
 
 
 
 
22
 
23
  # load normalizing flow
24
  storage_dir_normalizing_flow = Path("./models/norm_flow")
25
- speaker_conditioning = pb.io.load(storage_dir_normalizing_flow / "speaker_conditioning.json")
26
-
27
  normalizing_flow = FFJORD.load_model(storage_dir_normalizing_flow, checkpoint="model.pt", device=device)
28
 
 
 
 
 
29
  # load hubert features model
30
  hubert_model = HubertExtractor(
31
  layer=SID_LARGE_LAYER,
@@ -35,140 +43,157 @@ hubert_model = HubertExtractor(
35
  # storage_dir= # target storage dir hubert model
36
  )
37
 
38
- # example synthesis
39
- # speaker_id = 1034
40
- # example_id = "1034_121119_000028_000001"
41
-
42
- # wav_1 = tts_model.synthesize_from_example({
43
- # 'text' : "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
44
- # 'd_vector_storage_root': f"./Saved_models/Dataset/Embeddings/{speaker_id}/{example_id}.pth"
45
- # })
46
- # display(Audio(wav_1, rate=24_000, normalize=True))
47
 
48
- # manipulation block
49
  def get_manipulation(
50
- d_vector,
51
  labels,
52
- flow,
53
  tts_model,
 
 
54
  manipulation_idx=0,
55
  manipulation_fkt=1,
56
  ):
57
  labels_manipulated = labels.clone()
58
- labels_manipulated[:,manipulation_idx] += manipulation_fkt
59
-
60
- output_forward = flow.forward((d_vector.float(), labels))[0]
 
 
 
 
 
 
 
 
 
 
61
  sampled_class_manipulated = flow.sample((output_forward, labels_manipulated))[0]
62
 
 
 
 
63
  wav = tts_model.synthesize_from_example({
64
- 'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
65
  'd_vector': d_vector.detach().numpy(),
66
  'd_vector_man': sampled_class_manipulated.detach().numpy(),
67
- })
 
68
  return wav
69
 
70
- def extract_speaker_embedding(example):
71
- observation, sr = pb.io.load_audio(example['audio_path']['observation'], return_sample_rate=True)
72
- observation = librosa.resample(observation, orig_sr=sr, target_sr=16_000)
73
-
74
- vad = EnergyVAD(sample_rate=16_000)
75
- if observation.ndim == 1:
76
- observation = observation[None, :]
77
-
78
- observation = vad({'audio_data': observation})['audio_data']
79
-
80
- with torch.no_grad():
81
- example = tts_model.speaker_manager.prepare_example({'audio_data': {'observation': observation}, **example})
82
- example = pt.data.utils.collate_fn([example])
83
- example['features'] = torch.tensor(np.array(example['features']))
84
- d_vector = tts_model.speaker_manager.forward(example)[0]
85
- return d_vector
86
-
87
- # load speaker labels
88
- def load_speaker_labels(example, speaker_conditioning, reg_stor_dir=Path('./models/pvq_extractor/')):
89
- audio, _ = torchaudio.load(example['audio_path']['observation'])
90
- audio = audio.to(device)
91
- num_samples = torch.tensor([audio.shape[-1]], device=device)
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  providers = ["CPUExecutionProvider"]
94
 
95
  with torch.no_grad():
96
  features, seq_len = hubert_model(
97
- audio,
98
- 24_000,
99
  sequence_lengths=num_samples,
100
  )
101
  features = np.mean(features.squeeze(0).detach().cpu().numpy(), axis=-1)
102
-
103
  pvqd_predictions = {}
104
- for pvq in ['Breathiness', 'Loudness', 'Pitch', 'Resonance', 'Roughness', 'Strain', 'Weight']:
105
  with open(reg_stor_dir / f"{pvq}.onnx", "rb") as fid:
106
  onnx = fid.read()
107
  sess = InferenceSession(onnx, providers=providers)
108
  pred = sess.run(None, {"X": features[None]})[0].squeeze(1)
109
  pvqd_predictions[pvq] = pred.tolist()[0]
110
- labels = []
111
- for key in speaker_conditioning:
112
- labels.append(pvqd_predictions[key]/100)
113
- return torch.tensor(labels)
114
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
- example = {
117
- 'audio_path': {'observation': "audio/1034_121119_000028_000001.wav"},
118
- 'speaker_id': 1034,
119
- 'example_id': "1034_121119_000028_000001",
120
- }
121
 
122
- labels = load_speaker_labels(example, speaker_conditioning)
123
- label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']
124
 
125
- # print('Estimated PVQ strengths of input speaker:')
126
- # max_len = max(len(name) for name in label_options)
127
- # for label_name, pvq in zip(label_options, labels):
128
- # print(f'{label_name:<{max_len}} : {pvq:6.2f}')
129
 
 
 
 
 
 
 
 
130
 
131
- def update_manipulation(manipulation_idx, manipulation_fkt):
 
 
 
 
 
 
 
 
132
 
133
- d_vector = extract_speaker_embedding(example)
134
- labels = load_speaker_labels(example, speaker_conditioning)
135
-
136
  wav_manipulated = get_manipulation(
137
- # example=example,
138
- d_vector=d_vector,
139
- labels=labels[None, :],
140
  flow=normalizing_flow,
141
  tts_model=tts_model,
142
  manipulation_idx=manipulation_idx,
143
  manipulation_fkt=manipulation_fkt,
 
144
  )
145
-
146
- wav_unmanipulated = tts_model.synthesize_from_example({
147
- 'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
148
- 'd_vector': d_vector.detach().numpy(),
149
- })
150
- sr = 24_000
151
- return (sr, wav_unmanipulated), (sr, wav_manipulated)
152
-
153
- # with audio_output:
154
- # clear_output(wait=True)
155
- # print('Manipulated Speaker')
156
- # display(Audio(wav_manipulated, rate=24_000, normalize=True))
157
- # print('Unmanipulated Synthese')
158
- # display(Audio(wav_unmanipulated, rate=24_000, normalize=True))
159
- # print('Original Speaker')
160
- # display(Audio(example['audio_path']['observation'], rate=24_000, normalize=True))
161
-
162
- # print(f"Manipulated {label_options[manipulation_idx]} with strength {manipulation_fkt}")
163
-
164
-
165
- dropdown_options = [(label, i) for i, label in enumerate(label_options)]
166
  demo = gr.Interface(
167
  title="Perceptual Voice Quality (PVQ) Manipulation",
168
  fn=update_manipulation,
169
  inputs=[
170
- gr.Dropdown(label="PVQ Feature", choices=dropdown_options, value=2, type="index"),
171
- gr.Slider(label="Manipulation Factor", minimum=-2.0, maximum=2.0, value=1.0, step=0.1),
 
 
 
 
 
 
 
 
 
 
 
 
172
  ],
173
  outputs=[gr.Audio(label="original utterance"), gr.Audio(label="manipulated utterance")],
174
  )
 
1
  import numpy as np
2
  from pathlib import Path
 
3
  import paderbox as pb
 
4
  import torch
 
5
  from onnxruntime import InferenceSession
6
  from pvq_manipulation.models.vits import Vits_NT
7
  from pvq_manipulation.models.ffjord import FFJORD
 
8
  from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER
9
  import librosa
10
  from pvq_manipulation.helper.vad import EnergyVAD
11
  import gradio as gr
12
+ from pvq_manipulation.helper.creapy_wrapper import process_file
13
 
14
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
15
+ pvq_labels = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']
16
 
17
+ dataset_dict = pb.io.load_yaml('./Dataset/dataset.yaml')
18
+
19
+ cached_example_id = None
20
+ cached_loaded_example = None
21
+ cached_labels = None
22
+ cached_d_vector = None
23
+ cached_unmanipulated = None
24
+
25
+ # path to stats
26
+ stats_path = Path('./Dataset/Embeddings/')
27
 
28
  # load normalizing flow
29
  storage_dir_normalizing_flow = Path("./models/norm_flow")
30
+ config_norm_flow = pb.io.load_yaml(storage_dir_normalizing_flow / "config.json")
 
31
  normalizing_flow = FFJORD.load_model(storage_dir_normalizing_flow, checkpoint="model.pt", device=device)
32
 
33
+ # load tts model
34
+ storage_dir_tts = Path("./models/tts_model/")
35
+ tts_model = Vits_NT.load_model(storage_dir_tts, "model.pt")
36
+
37
  # load hubert features model
38
  hubert_model = HubertExtractor(
39
  layer=SID_LARGE_LAYER,
 
43
  # storage_dir= # target storage dir hubert model
44
  )
45
 
 
 
 
 
 
 
 
 
 
46
 
 
47
def get_manipulation(
    example,
    labels,
    flow,
    tts_model,
    d_vector,
    config_norm_flow,
    manipulation_idx=0,
    manipulation_fkt=1,
):
    """Synthesize the example's text with one conditioning label shifted.

    The speaker embedding is optionally standardized with the stored
    mean/std, passed through ``flow.forward`` together with the original
    labels, and mapped back with ``flow.sample`` using the shifted labels.
    Both the original and the manipulated embedding are handed to the TTS
    model.

    Args:
        example: Mapping providing ``'transcription'`` and
            ``'d_vector_storage_root'``.
        labels: 2-D label tensor (indexed as ``[:, manipulation_idx]``).
            It is cloned, never modified in place.
        flow: Object with ``forward`` and ``sample``; each receives an
            ``(embedding, labels)`` tuple and returns a sequence whose first
            item is used.
        tts_model: Object exposing ``synthesize_from_example``.
        d_vector: Speaker embedding tensor.
        config_norm_flow: Mapping with the key ``'flag_remove_mean'``.
        manipulation_idx: Column of ``labels`` that is shifted.
        manipulation_fkt: Amount added to that column.

    Returns:
        The return value of ``tts_model.synthesize_from_example``.
    """
    labels_manipulated = labels.clone()
    labels_manipulated[:, manipulation_idx] += manipulation_fkt

    # The labels are created on the module-level `device`; run all flow math
    # there so CPU-loaded embeddings/statistics do not clash with GPU labels.
    flow_device = labels.device
    speaker_embedding_norm = d_vector.to(flow_device).float()

    remove_mean = config_norm_flow['flag_remove_mean']
    if remove_mean:
        global_mean = torch.tensor(
            pb.io.load(stats_path / "mean.json"),
            dtype=torch.float32,
            device=flow_device,
        )
        global_std = torch.tensor(
            pb.io.load(stats_path / "std.json"),
            dtype=torch.float32,
            device=flow_device,
        )
        speaker_embedding_norm = (speaker_embedding_norm - global_mean) / global_std

    output_forward = flow.forward((speaker_embedding_norm, labels))[0]
    sampled_class_manipulated = flow.sample((output_forward, labels_manipulated))[0]

    if remove_mean:
        # Undo the standardization so the TTS model sees the original scale.
        sampled_class_manipulated = sampled_class_manipulated * global_std + global_mean

    # `.cpu()` is required before `.numpy()` whenever a tensor lives on a GPU.
    wav = tts_model.synthesize_from_example({
        'text': example['transcription'],
        'd_vector': d_vector.detach().cpu().numpy(),
        'd_vector_man': sampled_class_manipulated.detach().cpu().numpy(),
        'd_vector_storage_root': example['d_vector_storage_root'],
    })
    return wav
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
def get_creak_label(example):
    """Return the mean creak prediction of the 16 kHz audio, scaled by 100.

    Args:
        example: Mapping whose ``['loaded_audio_data']['16_000']`` entry is
            passed to ``process_file``.

    Returns:
        ``mean(y_pred[included_indices]) * 100``, or ``0.0`` when the
        selection is empty.
    """
    audio_data = example['loaded_audio_data']['16_000']
    # The first return value of process_file is not needed here.
    _, y_pred, included_indices = process_file(audio_data)
    selected = y_pred[included_indices]
    if np.size(selected) == 0:
        # np.mean of an empty selection is NaN, which would poison the
        # conditioning vector. NOTE(review): 0.0 is used as the neutral
        # fallback -- confirm this is the desired value.
        return 0.0
    return np.mean(selected) * 100
90
+
91
+
92
def load_speaker_labels(example, reg_stor_dir=Path('./models/pvq_extractor/')):
    """Predict the conditioning labels for one example.

    HuBERT features of the 16 kHz audio are averaged over the last axis and
    fed to one ONNX regressor per entry of ``pvq_labels``; the creak label
    from ``get_creak_label`` is appended last.

    Args:
        example: Mapping with ``['loaded_audio_data']['16_000']``.
        reg_stor_dir: Directory containing ``<label>.onnx`` for every entry
            of ``pvq_labels``.

    Returns:
        Float tensor on ``device`` holding every prediction divided by 100,
        ordered as ``pvq_labels`` followed by ``"Creak_mean"``.
    """
    waveform = torch.tensor(
        example['loaded_audio_data']['16_000'], dtype=torch.float
    )[None, :]
    lengths = torch.tensor([waveform.shape[-1]])
    if torch.cuda.is_available():
        waveform, lengths = waveform.cuda(), lengths.cuda()

    with torch.no_grad():
        hidden, _ = hubert_model(waveform, 16_000, sequence_lengths=lengths)
        pooled = np.mean(hidden.squeeze(0).detach().cpu().numpy(), axis=-1)

    onnx_providers = ["CPUExecutionProvider"]
    pvqd_predictions = {}
    for label_name in pvq_labels:
        with open(reg_stor_dir / f"{label_name}.onnx", "rb") as model_file:
            model_bytes = model_file.read()
        session = InferenceSession(model_bytes, providers=onnx_providers)
        regression = session.run(None, {"X": pooled[None]})[0]
        pvqd_predictions[label_name] = regression.squeeze(1).tolist()[0]

    pvqd_predictions['Creak_mean'] = get_creak_label(example)

    label_order = pvq_labels + ["Creak_mean"]
    scaled = [pvqd_predictions[name] / 100 for name in label_order]
    return torch.tensor(scaled, device=device).float()
119
+
120
+
121
def _resample_and_vad(audio, orig_sr, target_sr):
    """Resample ``audio`` to ``target_sr`` and run it through ``EnergyVAD``."""
    resampled = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)
    if resampled.ndim == 1:
        # The VAD is called with a leading axis in front of the samples.
        resampled = resampled[None, :]
    vad = EnergyVAD(sample_rate=target_sr)
    return vad({'audio_data': resampled})['audio_data']


def load_audio_files(example):
    """Load the observation and attach 16 kHz and 24 kHz VAD-processed copies.

    Both versions are derived from the freshly loaded signal at its native
    sample rate. Previously the 24 kHz version was resampled from the
    already-16 kHz, VAD-processed signal while still declaring the file's
    native rate as ``orig_sr``, which yields audio at the wrong rate.

    Args:
        example: Mapping with ``['audio_path']['observation']``; it is
            updated in place.

    Returns:
        The same ``example`` with ``example['loaded_audio_data']`` holding
        the keys ``'16_000'`` and ``'24_000'``.
    """
    observation_loaded, sr = pb.io.load_audio(
        example['audio_path']['observation'], return_sample_rate=True
    )
    example['loaded_audio_data'] = {
        '16_000': _resample_and_vad(observation_loaded, sr, 16_000),
        '24_000': _resample_and_vad(observation_loaded, sr, 24_000),
    }
    return example
141
 
 
 
 
 
 
142
 
143
# (example_id, transcription) of the synthesis currently held in
# `cached_unmanipulated`; None while nothing has been synthesized.
_cached_synthesis_key = None


def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
    """Gradio callback: synthesize the unmanipulated and the manipulated voice.

    Audio, embedding and labels are cached per ``example_id``. The
    unmanipulated synthesis is cached per ``(example_id, transcription)`` so
    that editing the text for the same example no longer returns stale audio.

    Args:
        manipulation_idx: Index of the label to shift (the dropdown value).
        example_id: Key into ``dataset_dict['dataset']``.
        transcription: Text to synthesize.
        manipulation_fkt: Amount added to the selected label.

    Returns:
        Two ``(sample_rate, waveform)`` tuples at 24 kHz: the unmanipulated
        and the manipulated synthesis.
    """
    global cached_example_id, cached_loaded_example, cached_labels
    global cached_d_vector, cached_unmanipulated, _cached_synthesis_key

    speaker_id = dataset_dict['dataset'][example_id]['speaker_id']

    example = {
        'audio_path': {'observation': f"./Dataset/Audio_files/{example_id}.wav"},
        # NOTE(review): this root differs from the "./Dataset/Embeddings/"
        # path the embedding is loaded from below -- confirm it is intended.
        'd_vector_storage_root': f"./Saved_models/Dataset/Embeddings/{speaker_id}/{example_id}.pth",
        'speaker_id': speaker_id,
        'example_id': example_id,
        'transcription': transcription
    }

    if cached_example_id != example_id:
        # Invalidate first so a failure part-way through cannot leave a mix
        # of old and new cache entries marked as valid.
        cached_example_id = None
        cached_loaded_example = load_audio_files(example)
        # map_location keeps loading working on hosts without a GPU.
        cached_d_vector = torch.load(
            f"./Dataset/Embeddings/{speaker_id}/{example_id}.pth",
            map_location='cpu',
        )
        cached_labels = load_speaker_labels(example)
        cached_example_id = example_id

    synthesis_key = (example_id, transcription)
    if _cached_synthesis_key != synthesis_key:
        cached_unmanipulated = tts_model.synthesize_from_example({
            'text': transcription,
            'd_vector': cached_d_vector.detach().cpu().numpy(),
        })
        _cached_synthesis_key = synthesis_key

    wav_manipulated = get_manipulation(
        example=example,
        d_vector=cached_d_vector,
        labels=cached_labels[None, :],
        flow=normalizing_flow,
        tts_model=tts_model,
        manipulation_idx=manipulation_idx,
        manipulation_fkt=manipulation_fkt,
        config_norm_flow=config_norm_flow,
    )
    return (24_000, cached_unmanipulated), (24_000, wav_manipulated)
177
+
178
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
# Input widgets, in the positional order expected by `update_manipulation`:
# (manipulation_idx, example_id, transcription, manipulation_fkt).
_demo_inputs = [
    gr.Dropdown(
        label="PVQ Feature",
        # Each choice is (display name, value passed to the callback).
        choices=[
            ('Weight', 0),
            ('Resonance', 1),
            ('Breathiness', 2),
            ('Roughness', 3),
            ('Creak', 7),
        ],
        value=2,
        type="value",
    ),
    gr.Dropdown(
        choices=dataset_dict['dataset'].keys(),
        value='1422_149735_000006_000000',
        type="value",
    ),
    gr.Textbox(
        value="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
        placeholder='Type something',
    ),
    gr.Slider(label="Manipulation Factor", minimum=-1.0, maximum=2.0, value=1.0, step=0.1),
]

# Output widgets: unmanipulated synthesis first, manipulated synthesis second.
_demo_outputs = [
    gr.Audio(label="original utterance"),
    gr.Audio(label="manipulated utterance"),
]

demo = gr.Interface(
    title="Perceptual Voice Quality (PVQ) Manipulation",
    fn=update_manipulation,
    inputs=_demo_inputs,
    outputs=_demo_outputs,
)
models/norm_flow/config.json CHANGED
@@ -1,12 +1,15 @@
1
  {
2
- "factory": "pvq_manipulation.models.ffjord.FFJORD",
3
- "normalize": true,
4
- "ode_function": {
5
- "condition_dim": 7,
6
- "factory": "pvq_manipulation.models.ode_functions.CNFNN",
7
- "hidden_channels": [
8
- 512
9
- ],
10
- "input_dim": 256
11
- }
12
- }
 
 
 
 
1
  {
2
+ "model":{
3
+ "factory": "pvq_manipulation.models.ffjord.FFJORD",
4
+ "normalize": true,
5
+ "ode_function": {
6
+ "condition_dim": 8,
7
+ "factory": "pvq_manipulation.models.ode_functions.CNFNN",
8
+ "hidden_channels": [
9
+ 512
10
+ ],
11
+ "input_dim": 256
12
+ }
13
+ },
14
+ "flag_remove_mean": true
15
+ }