Spaces:

fgnt-upb
/

pvq_manipulation

Sleeping

App Files Files Community

FrederikRautenberg committed on Jun 2

Commit

4732065

1 Parent(s): 80d202c

Add creak manipulation

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
Dataset/Audio_files/1034_121119_000028_000001.wav +3 -0
Dataset/Audio_files/1088_129236_000006_000007.wav +3 -0
Dataset/Audio_files/1422_149735_000006_000000.wav +3 -0
Dataset/Audio_files/14_212_000019_000000.wav +3 -0
Dataset/Audio_files/1535_141644_000004_000001.wav +3 -0
Dataset/Audio_files/1731_142320_000122_000005.wav +3 -0
Dataset/Audio_files/3009_10327_000027_000005.wav +3 -0
Dataset/Audio_files/329_861_000024_000003.wav +3 -0
Dataset/Audio_files/4830_25904_000008_000001.wav +3 -0
Dataset/Audio_files/4957_30119_000070_000001.wav +3 -0
Dataset/Audio_files/5012_80192_000020_000003.wav +3 -0
Dataset/Audio_files/5802_76044_000038_000000.wav +3 -0
Dataset/Audio_files/6544_71420_000024_000001.wav +3 -0
Dataset/Audio_files/6918_47541_000006_000008.wav +3 -0
Dataset/Audio_files/7011_66622_000032_000002.wav +3 -0
Dataset/Audio_files/7059_77897_000017_000001.wav +3 -0
Dataset/Audio_files/7190_90542_000054_000000.wav +3 -0
Dataset/Audio_files/7226_86965_000020_000001.wav +3 -0
Dataset/Audio_files/7245_104888_000016_000000.wav +3 -0
Dataset/Audio_files/83_9960_000017_000003.wav +3 -0
Dataset/Audio_files/8758_296465_000020_000000.wav +3 -0
Dataset/Audio_files/8820_294120_000011_000001.wav +3 -0
Dataset/Embeddings/1034/1034_121119_000028_000001.pth +3 -0
Dataset/Embeddings/1088/1088_129236_000006_000007.pth +3 -0
Dataset/Embeddings/14/14_212_000019_000000.pth +3 -0
Dataset/Embeddings/1422/1422_149735_000006_000000.pth +3 -0
Dataset/Embeddings/1535/1535_141644_000004_000001.pth +3 -0
Dataset/Embeddings/1731/1731_142320_000122_000005.pth +3 -0
Dataset/Embeddings/3009/3009_10327_000027_000005.pth +3 -0
Dataset/Embeddings/329/329_861_000024_000003.pth +3 -0
Dataset/Embeddings/4830/4830_25904_000008_000001.pth +3 -0
Dataset/Embeddings/4957/4957_30119_000070_000001.pth +3 -0
Dataset/Embeddings/5012/5012_80192_000020_000003.pth +3 -0
Dataset/Embeddings/5802/5802_76044_000038_000000.pth +3 -0
Dataset/Embeddings/6544/6544_71420_000024_000001.pth +3 -0
Dataset/Embeddings/6918/6918_47541_000006_000008.pth +3 -0
Dataset/Embeddings/7011/7011_66622_000032_000002.pth +3 -0
Dataset/Embeddings/7059/7059_77897_000017_000001.pth +3 -0
Dataset/Embeddings/7190/7190_90542_000054_000000.pth +3 -0
Dataset/Embeddings/7226/7226_86965_000020_000001.pth +3 -0
Dataset/Embeddings/7245/7245_104888_000016_000000.pth +3 -0
Dataset/Embeddings/83/83_9960_000017_000003.pth +3 -0
Dataset/Embeddings/8758/8758_296465_000020_000000.pth +3 -0
Dataset/Embeddings/8820/8820_294120_000011_000001.pth +3 -0
Dataset/Embeddings/mean.json +258 -0
Dataset/Embeddings/std.json +258 -0
Dataset/dataset.yaml +67 -0
app.py +123 -98
models/norm_flow/config.json +14 -11

.gitattributes CHANGED Viewed

@@ -44,3 +44,5 @@ models/pvq_extractor/Resonance.onnx filter=lfs diff=lfs merge=lfs -text
 models/pvq_extractor/Weight.onnx filter=lfs diff=lfs merge=lfs -text
 models/norm_flow/model.pt filter=lfs diff=lfs merge=lfs -text
 audio/1034_121119_000028_000001.wav filter=lfs diff=lfs merge=lfs -text

 models/pvq_extractor/Weight.onnx filter=lfs diff=lfs merge=lfs -text
 models/norm_flow/model.pt filter=lfs diff=lfs merge=lfs -text
 audio/1034_121119_000028_000001.wav filter=lfs diff=lfs merge=lfs -text
+Dataset/Audio_files/*.wav filter=lfs diff=lfs merge=lfs -text
+Dataset/Embeddings/**/*.pth filter=lfs diff=lfs merge=lfs -text

Dataset/Audio_files/1034_121119_000028_000001.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dc00c4e893ccf708cae4366e36ede93b4e158f516323a0724fc6e9f956c76aff
+size 385964

Dataset/Audio_files/1088_129236_000006_000007.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:27b2e7191ba1cfad41bc1ab1bd09ec1af87062e48abbab1ef01809c76ed738da
+size 311084

Dataset/Audio_files/1422_149735_000006_000000.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f59d7f7a3c7364d7ac254bd94d3384e9b8e173634eb8b7492ec751d8584f8bb5
+size 345644

Dataset/Audio_files/14_212_000019_000000.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1271f49cf4855d1b7d9b87e99a0c79e5505acbfba94cd8f594c1df2a29d96027
+size 633652

Dataset/Audio_files/1535_141644_000004_000001.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4cc6f0bc3b9ebecbc1dab5a430c37140337a6bbeaf6f75103d74b2b4e75b4f06
+size 295724

Dataset/Audio_files/1731_142320_000122_000005.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b3d3e88bc53ee1cad73100ea4ad6ccc6d9bcbc36145962d400122b658e27b7e8
+size 316844

Dataset/Audio_files/3009_10327_000027_000005.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bbe46722ef2d331a5bc1c552cd6ad3c8a69022a3c70b1c03b609856dc073ca32
+size 309164

Dataset/Audio_files/329_861_000024_000003.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7dab66421315b9f22bbbaf909e69184c01eaba29e536c2b449c8a7310f2edce7
+size 261164

Dataset/Audio_files/4830_25904_000008_000001.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cd41aaf86c9d6e394d9afcca5e3128aa6a52fd2948e3bcf6aa03e5c18f2c7eec
+size 483884

Dataset/Audio_files/4957_30119_000070_000001.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f483b0a7003610ba8451db035f3347b156bb348c7aa356b7403f8ca86b98ab28
+size 503084

Dataset/Audio_files/5012_80192_000020_000003.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91aa9243a6697d65e6f5464b40e9b420b5e5cdef83b64a5556baef1ac548f11e
+size 409004

Dataset/Audio_files/5802_76044_000038_000000.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:22964325ee8f751dddd136b3219191443270529d95ee27b45c4a789501286492
+size 460844

Dataset/Audio_files/6544_71420_000024_000001.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1289b6714acb263b8bb36d6acfbb4efded0a5c67cc9b6a6246340dd3493c6c2b
+size 209324

Dataset/Audio_files/6918_47541_000006_000008.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8dcd7955f5469755038482a58f0929012526f98130513acd9d0cd1bc208bbfe8
+size 898612

Dataset/Audio_files/7011_66622_000032_000002.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:111c88a591efa42a608d1609214e6ef56a64f3bd79a88b57efecba2ca2f7ed4c
+size 309164

Dataset/Audio_files/7059_77897_000017_000001.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:17f51a3d2133e81607e36403b6bcb8bd7ec9e03c1bcfbbc80b4123c1b31d6618
+size 243884

Dataset/Audio_files/7190_90542_000054_000000.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:34bdcfbdf51f3475465804b9dbf27f8e647ccc1af17573b0a923f44881217093
+size 222764

Dataset/Audio_files/7226_86965_000020_000001.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b824a36ceaeec45724088957b1e543ee3b477ca1ee55e4c55e96ac8c2b018fb5
+size 622132

Dataset/Audio_files/7245_104888_000016_000000.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:92d74fe5965fff3182cb1f273c80ea051033c7aa0dbbb44ae48ccded15210216
+size 341804

Dataset/Audio_files/83_9960_000017_000003.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0a1cdc21d779c1d108af86ec6a93558a501322a67c221c25e2dd32d93e0c356a
+size 192044

Dataset/Audio_files/8758_296465_000020_000000.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8adafe1ab7b3e86c82454c06863dd616c5b52f91ebb8690fcc64ec7abb2821dc
+size 520364

Dataset/Audio_files/8820_294120_000011_000001.wav ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b78d6b2aed1a99e8f3750bd54c50e8ed2e08dba114792fa604101faf27894708
+size 213164

Dataset/Embeddings/1034/1034_121119_000028_000001.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f4a7bdd020bf0da6fb08d272448c8b61c6f065e529084ce1cf9c39c1636e017c
+size 2358

Dataset/Embeddings/1088/1088_129236_000006_000007.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1544023ea8afb9b0c71fa31e1e16d2ec510cf9d8637a64648941448c9e5e18ae
+size 2358

Dataset/Embeddings/14/14_212_000019_000000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f248135ffdacc81ef4b5071f564448d49c2341b5c5c14bf4257af633f9318fd
+size 2269

Dataset/Embeddings/1422/1422_149735_000006_000000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fee0b79857cf8ce499a658dfeb5137d5b4fa7e849dd8118c32028391b88b3d08
+size 2358

Dataset/Embeddings/1535/1535_141644_000004_000001.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d71a10862f81b5a3f0877f6eb26cd4bc733e0cc9868acc65a65bb23ffe304b9
+size 2358

Dataset/Embeddings/1731/1731_142320_000122_000005.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79f2de5cb40487fe19b5099b57fe0a41f0436f554019c619bcb4cd9d6c64bf36
+size 2358

Dataset/Embeddings/3009/3009_10327_000027_000005.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f24187ad06ecbe02df165538c6881192cfd055b5a3cc5ab1348d2c05d6567421
+size 2353

Dataset/Embeddings/329/329_861_000024_000003.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fc001f48b3f67d25192967e0a297dc1787144e36222e5b83a71ae6f5b89be9b3
+size 2274

Dataset/Embeddings/4830/4830_25904_000008_000001.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:170bdd680d0735a19c5e88e01fc8bf84dac623d7c73eebeff6e99974b8e9d081
+size 2353

Dataset/Embeddings/4957/4957_30119_000070_000001.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c922316a446bcc28db8a43f768ade2b2113ce0f6fab24b60b396f67264ce07c8
+size 2353

Dataset/Embeddings/5012/5012_80192_000020_000003.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba1c17f5100b1e0147e9c96d864cc054e8840a15cd46307e191fbe88a728b1b0
+size 2353

Dataset/Embeddings/5802/5802_76044_000038_000000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7656515f537fa6de193f40d78c9747cfb1268266d3dd88a22a41ce2c3a28514a
+size 2353

Dataset/Embeddings/6544/6544_71420_000024_000001.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f70c9bd92dea6ddfd495c7ab32cae30494eaf3b42f6d6533ff9f55de80593f05
+size 2353

Dataset/Embeddings/6918/6918_47541_000006_000008.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ed49a76c226606f98ce4c2db2aac937354e40cc8fb789e29e93aa87f64bc01d1
+size 2353

Dataset/Embeddings/7011/7011_66622_000032_000002.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43e63641af7d4322b89489acb9c10cfc7e71961bd6479c55c17135b3ecfa5605
+size 2353

Dataset/Embeddings/7059/7059_77897_000017_000001.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:688e50692694cccbe5f61c8780e0980509118f4061a44180ec8dffff2d963921
+size 2353

Dataset/Embeddings/7190/7190_90542_000054_000000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9f1925fcb8ce5ffa8b9223de17ea8d98c0abb24409852208f03c607374c9f60a
+size 2353

Dataset/Embeddings/7226/7226_86965_000020_000001.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f88a82eee39139ab65f3b201f2657b795ad66d70ccd637f903d537df2acaca0
+size 2353

Dataset/Embeddings/7245/7245_104888_000016_000000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:db50270eb45aab4344720a1da44d3c9d91ace10e69514287b3174ba9c2ca208a
+size 2358

Dataset/Embeddings/83/83_9960_000017_000003.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:958832b7e4e77f6eb8343b91091c8603b683b25c03f242e6de4b09952a0fba6d
+size 2274

Dataset/Embeddings/8758/8758_296465_000020_000000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:465be53ae1d0a44ccdb90e0fcaccf09a0ae91041f984ef18f606df0169ea8f3e
+size 2358

Dataset/Embeddings/8820/8820_294120_000011_000001.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b79ee0d4796df0776bc0ddfc8683f2a025c4829893f28b3cff6b4a2d5405d968
+size 2358

Dataset/Embeddings/mean.json ADDED Viewed

	@@ -0,0 +1,258 @@

+[
+  0.21412190794944763,
+  0.18206638097763062,
+  0.11840786784887314,
+  0.09126990288496017,
+  0.04086871072649956,
+  -0.149668350815773,
+  0.2645065188407898,
+  0.27953410148620605,
+  0.6700411438941956,
+  -0.06264923512935638,
+  0.2915269732475281,
+  0.12102372199296951,
+  -0.5578641891479492,
+  -0.12462181597948074,
+  0.6190101504325867,
+  -0.5761605501174927,
+  -0.084229975938797,
+  -0.0006869725184515119,
+  0.49899742007255554,
+  -0.21737882494926453,
+  -0.05707789212465286,
+  -0.18819154798984528,
+  -0.5531325340270996,
+  0.22641371190547943,
+  0.07952054589986801,
+  0.09851367026567459,
+  0.03574512526392937,
+  -0.13013364374637604,
+  -0.35363155603408813,
+  0.49086689949035645,
+  0.08895495533943176,
+  0.36905843019485474,
+  -0.10707297921180725,
+  -0.11953406780958176,
+  0.043051160871982574,
+  0.09323996305465698,
+  -0.16280269622802734,
+  -0.13945965468883514,
+  0.2095673531293869,
+  0.09729334712028503,
+  0.040950167924165726,
+  -0.37764972448349,
+  -0.018613651394844055,
+  -0.581308901309967,
+  -0.4080854058265686,
+  -0.42118221521377563,
+  1.0161728858947754,
+  -0.19709929823875427,
+  -0.024254681542515755,
+  0.04121233895421028,
+  -0.15502692759037018,
+  0.7614311575889587,
+  -0.6833258271217346,
+  0.33979618549346924,
+  0.49055442214012146,
+  0.011953921988606453,
+  0.4490082263946533,
+  0.2667522728443146,
+  -0.6408993005752563,
+  -0.17682728171348572,
+  0.12336420267820358,
+  0.1474267542362213,
+  -0.11565382778644562,
+  0.6467825174331665,
+  0.10751526057720184,
+  -0.14141449332237244,
+  0.6352338194847107,
+  -0.04154682531952858,
+  0.12760530412197113,
+  -0.6243913769721985,
+  0.08836925774812698,
+  0.28105032444000244,
+  -0.15209053456783295,
+  -0.0037005548365414143,
+  0.3098902106285095,
+  0.150644913315773,
+  0.07396118342876434,
+  -0.049714382737874985,
+  -0.5445783138275146,
+  -0.033714842051267624,
+  0.1200188472867012,
+  -0.2312866747379303,
+  0.20238173007965088,
+  -0.5392364263534546,
+  -0.40682801604270935,
+  -0.16234233975410461,
+  -0.6470288634300232,
+  -0.1738162636756897,
+  0.25936004519462585,
+  -0.15742169320583344,
+  0.24468930065631866,
+  0.13714095950126648,
+  0.1449803113937378,
+  0.16882915794849396,
+  0.19944046437740326,
+  -0.29332247376441956,
+  0.0026240404695272446,
+  0.03341501206159592,
+  0.01569036766886711,
+  -0.4688950777053833,
+  0.09352052956819534,
+  0.13269393146038055,
+  0.06116529926657677,
+  -0.06562789529561996,
+  -0.23961076140403748,
+  -0.22402845323085785,
+  0.47103151679039,
+  0.0728374496102333,
+  -0.561316192150116,
+  0.46127453446388245,
+  0.15431830286979675,
+  0.08550310134887695,
+  -0.03363621234893799,
+  0.04015417397022247,
+  -0.014262784272432327,
+  0.08499719202518463,
+  -0.39322608709335327,
+  0.27674373984336853,
+  0.24571490287780762,
+  -0.2642858326435089,
+  -0.7408877015113831,
+  0.21007885038852692,
+  0.5898057222366333,
+  0.14988923072814941,
+  -0.07782910019159317,
+  0.4078785479068756,
+  0.3004123270511627,
+  0.6256987452507019,
+  -0.21651767194271088,
+  -0.17712117731571198,
+  -0.2749980688095093,
+  0.4826784133911133,
+  0.3035520911216736,
+  0.23235619068145752,
+  -0.061135340481996536,
+  0.49035653471946716,
+  -0.16356635093688965,
+  -0.35920438170433044,
+  0.023298246785998344,
+  0.015880409628152847,
+  -0.015357445925474167,
+  -0.3540240228176117,
+  0.44811102747917175,
+  -0.05202110856771469,
+  -0.19488674402236938,
+  0.4875786602497101,
+  -0.03857485204935074,
+  0.463600754737854,
+  -0.07009128481149673,
+  0.29871219396591187,
+  -0.35601672530174255,
+  0.5102726817131042,
+  0.3902379274368286,
+  0.3692609369754791,
+  -0.35389819741249084,
+  0.07650414854288101,
+  -0.63330078125,
+  0.5580229759216309,
+  0.10672216862440109,
+  0.10609150677919388,
+  0.45468848943710327,
+  0.15291742980480194,
+  0.36706316471099854,
+  -0.2831500768661499,
+  -0.14291781187057495,
+  -0.17804013192653656,
+  -0.5424429178237915,
+  -0.15468499064445496,
+  0.07343851029872894,
+  0.5380398631095886,
+  0.44494226574897766,
+  0.9300274848937988,
+  -0.0274032074958086,
+  0.3488404154777527,
+  -0.23694315552711487,
+  -0.2424279898405075,
+  -0.04125871881842613,
+  0.06136211380362511,
+  -0.5118930339813232,
+  -0.15055209398269653,
+  0.45361533761024475,
+  0.12657225131988525,
+  0.34210655093193054,
+  0.313772052526474,
+  -0.3521589934825897,
+  0.05892332270741463,
+  -0.11534406244754791,
+  0.514985203742981,
+  0.054903097450733185,
+  0.18034562468528748,
+  0.26060545444488525,
+  -0.29317837953567505,
+  0.1423174887895584,
+  0.25360995531082153,
+  -0.47162681818008423,
+  0.5438259243965149,
+  0.02562086470425129,
+  0.020302919670939445,
+  0.3039097189903259,
+  0.19996808469295502,
+  0.3423006236553192,
+  0.4524010717868805,
+  -0.3152591586112976,
+  -0.60369873046875,
+  0.16421166062355042,
+  -0.055804263800382614,
+  -0.35883089900016785,
+  0.32918551564216614,
+  -0.4741072952747345,
+  0.05971089377999306,
+  -0.062083590775728226,
+  0.05729498714208603,
+  -0.6715519428253174,
+  0.2646842896938324,
+  0.14343565702438354,
+  0.2957288324832916,
+  0.37478363513946533,
+  -0.684753954410553,
+  -0.14382798969745636,
+  -0.3416562080383301,
+  0.6120049953460693,
+  0.24825794994831085,
+  0.049689218401908875,
+  0.08789665251970291,
+  -0.518900454044342,
+  -0.2226269692182541,
+  0.17690403759479523,
+  0.011226996779441833,
+  0.05879935249686241,
+  0.03022083267569542,
+  0.11887083947658539,
+  0.7854664325714111,
+  -0.2452417016029358,
+  0.6136188507080078,
+  0.5491909384727478,
+  -0.07412725687026978,
+  -0.3089025616645813,
+  0.16618099808692932,
+  -0.03215228021144867,
+  0.13637210428714752,
+  0.10921650379896164,
+  -0.14989499747753143,
+  0.6000584959983826,
+  0.19014132022857666,
+  -0.007800411432981491,
+  -0.06849341839551926,
+  -0.19043166935443878,
+  -0.012874589301645756,
+  -0.8398106694221497,
+  -0.002614892553538084,
+  -0.26642924547195435,
+  0.25869783759117126,
+  -0.46403658390045166,
+  0.18120701611042023,
+  0.08567068725824356,
+  0.08117248862981796
+]

Dataset/Embeddings/std.json ADDED Viewed

	@@ -0,0 +1,258 @@

+[
+  0.8075656890869141,
+  0.8826062679290771,
+  0.8430591821670532,
+  0.8703321814537048,
+  0.877600371837616,
+  0.8111068606376648,
+  0.8719013929367065,
+  0.9000007510185242,
+  0.9740477800369263,
+  0.8267052173614502,
+  0.8011612296104431,
+  0.9747788906097412,
+  0.8026949763298035,
+  0.8818342089653015,
+  0.8605656623840332,
+  0.8279756903648376,
+  0.772606611251831,
+  0.8957112431526184,
+  0.8716765642166138,
+  0.7797929644584656,
+  0.8252673149108887,
+  0.781441330909729,
+  0.8043056130409241,
+  0.877123236656189,
+  0.9237406849861145,
+  0.7914682030677795,
+  0.9089431166648865,
+  0.8154596090316772,
+  0.8381725549697876,
+  0.8573335409164429,
+  0.7951206564903259,
+  0.8356125354766846,
+  0.8639358282089233,
+  0.8588302135467529,
+  0.8966045379638672,
+  0.836276113986969,
+  0.8558772206306458,
+  0.8904256820678711,
+  0.8009889721870422,
+  0.9030625820159912,
+  0.8489034175872803,
+  0.7720499038696289,
+  0.780423641204834,
+  0.7854387760162354,
+  0.8878417611122131,
+  0.8503796458244324,
+  0.8932433128356934,
+  0.9315906763076782,
+  0.8437496423721313,
+  0.8389645218849182,
+  0.8701387643814087,
+  0.9080750942230225,
+  1.0714792013168335,
+  0.8976108431816101,
+  0.8437362909317017,
+  0.8633260726928711,
+  0.8580045700073242,
+  0.8063361644744873,
+  0.8105617761611938,
+  0.8995920419692993,
+  0.8316185474395752,
+  0.9079830050468445,
+  0.8115889430046082,
+  0.8792805671691895,
+  0.8858475685119629,
+  0.7682526111602783,
+  0.8312106728553772,
+  0.8296751379966736,
+  0.9122119545936584,
+  0.9119444489479065,
+  0.8761489391326904,
+  0.8376705646514893,
+  0.9226043820381165,
+  0.8830709457397461,
+  0.819685161113739,
+  0.9397792816162109,
+  0.833674967288971,
+  0.8619604110717773,
+  0.8484258651733398,
+  0.943915605545044,
+  0.8020740151405334,
+  0.8027610182762146,
+  0.9116966724395752,
+  0.8570717573165894,
+  0.7944185733795166,
+  0.8977150917053223,
+  0.9434093236923218,
+  0.9964787364006042,
+  0.8149264454841614,
+  0.8179062604904175,
+  0.832256555557251,
+  0.866649329662323,
+  0.8442603349685669,
+  0.9397143125534058,
+  0.8501031398773193,
+  0.9365203380584717,
+  0.8380716443061829,
+  0.8887302279472351,
+  0.8084500432014465,
+  0.7769243121147156,
+  0.8449881076812744,
+  0.9015783667564392,
+  0.9295680522918701,
+  0.8259174227714539,
+  0.8573725819587708,
+  0.8600193858146667,
+  0.8780449032783508,
+  0.8595342040061951,
+  0.7720226049423218,
+  0.816754937171936,
+  0.8180097937583923,
+  0.8093970417976379,
+  0.9032255411148071,
+  0.8697183728218079,
+  0.888511061668396,
+  0.7960647940635681,
+  0.8589795827865601,
+  0.8813145160675049,
+  0.8638142347335815,
+  0.9093354344367981,
+  0.8201130628585815,
+  0.8607465028762817,
+  0.9925655722618103,
+  0.9680612683296204,
+  0.8303309679031372,
+  0.8515812158584595,
+  0.8854086399078369,
+  0.8599415421485901,
+  0.8196620941162109,
+  0.9137897491455078,
+  0.8218133449554443,
+  0.8703830242156982,
+  0.845089852809906,
+  0.8652607202529907,
+  0.877587080001831,
+  0.834847629070282,
+  0.7999405860900879,
+  0.867475152015686,
+  0.9779040217399597,
+  0.8888542652130127,
+  0.8318555951118469,
+  0.8721846342086792,
+  0.8582359552383423,
+  0.8781721591949463,
+  0.7750568389892578,
+  0.9456684589385986,
+  0.8390375971794128,
+  0.8528217077255249,
+  0.9676473736763,
+  0.9669485092163086,
+  0.8177183866500854,
+  0.8109471201896667,
+  0.8565740585327148,
+  1.012668490409851,
+  0.8075276017189026,
+  0.8120420575141907,
+  0.8192445039749146,
+  0.9088258743286133,
+  0.806582510471344,
+  0.8778362274169922,
+  0.9832965135574341,
+  0.8517345190048218,
+  0.8954508900642395,
+  0.8626090288162231,
+  0.8306634426116943,
+  0.7902420163154602,
+  0.8680355548858643,
+  0.8405691385269165,
+  0.8080191612243652,
+  0.8716298937797546,
+  0.8520878553390503,
+  0.8133600354194641,
+  0.9267045855522156,
+  0.8689888715744019,
+  0.8166713118553162,
+  0.8387840390205383,
+  0.835797131061554,
+  0.8922353386878967,
+  0.8736470937728882,
+  0.9051007032394409,
+  0.8347994685173035,
+  0.8269197344779968,
+  0.7968848943710327,
+  0.8677981495857239,
+  0.8539698719978333,
+  0.9122839570045471,
+  0.907562255859375,
+  0.908149242401123,
+  0.8897758722305298,
+  0.8776298761367798,
+  0.8702916502952576,
+  0.7712435722351074,
+  0.8737289905548096,
+  1.003007411956787,
+  0.9195813536643982,
+  0.9373644590377808,
+  0.8549340963363647,
+  0.8885018229484558,
+  0.8555989265441895,
+  0.8315033316612244,
+  0.8457157611846924,
+  0.8452540636062622,
+  0.9597710967063904,
+  0.8279005885124207,
+  0.9954813122749329,
+  0.8817158937454224,
+  0.8564739227294922,
+  0.8737724423408508,
+  0.8833761215209961,
+  0.9069574475288391,
+  0.8549059629440308,
+  0.8478658199310303,
+  0.8306840062141418,
+  0.8308926820755005,
+  0.8582388162612915,
+  0.7912089228630066,
+  0.843919038772583,
+  0.8585576415061951,
+  0.850679337978363,
+  0.921983003616333,
+  0.8164607882499695,
+  0.8369028568267822,
+  0.7947129607200623,
+  0.8371235132217407,
+  0.8269281387329102,
+  0.8633431196212769,
+  0.9147580862045288,
+  0.9019842743873596,
+  0.8293289542198181,
+  0.8421900868415833,
+  0.8144598603248596,
+  0.9013247489929199,
+  0.7653704285621643,
+  0.8295224905014038,
+  0.9549149870872498,
+  0.8671613931655884,
+  0.8507492542266846,
+  0.8559182286262512,
+  0.839141309261322,
+  0.918213427066803,
+  0.9064037203788757,
+  0.8579128980636597,
+  0.8337833881378174,
+  0.9374175071716309,
+  0.9142330884933472,
+  0.7878691554069519,
+  0.8651018142700195,
+  0.8595719933509827,
+  0.8955603837966919,
+  0.9085484743118286,
+  0.8001472353935242,
+  0.7812052369117737,
+  0.8475046157836914,
+  0.8226194381713867,
+  0.8940064311027527,
+  0.9277697801589966
+]

Dataset/dataset.yaml ADDED Viewed

	@@ -0,0 +1,67 @@

+dataset:
+  '7190_90542_000054_000000':
+    speaker_id: '7190'
+    example_id: '7190_90542_000054_000000'
+  '4830_25904_000008_000001':
+    speaker_id: '4830'
+    example_id: '4830_25904_000008_000001'
+  '8820_294120_000011_000001':
+    speaker_id: '8820'
+    example_id: '8820_294120_000011_000001'
+  '3009_10327_000027_000005':
+    speaker_id: '3009'
+    example_id: '3009_10327_000027_000005'
+  '7226_86965_000020_000001':
+    speaker_id: '7226'
+    example_id: '7226_86965_000020_000001'
+  '329_861_000024_000003':
+    speaker_id: '329'
+    example_id: '329_861_000024_000003'
+  '5802_76044_000038_000000':
+    speaker_id: '5802'
+    example_id: '5802_76044_000038_000000'
+  '1535_141644_000004_000001':
+    speaker_id: '1535'
+    example_id: '1535_141644_000004_000001'
+  '7011_66622_000032_000002':
+    speaker_id: '7011'
+    example_id: '7011_66622_000032_000002'
+  '8758_296465_000020_000000':
+    speaker_id: '8758'
+    example_id: '8758_296465_000020_000000'
+  '1034_121119_000028_000001':
+    speaker_id: '1034'
+    example_id: '1034_121119_000028_000001'
+  '4957_30119_000070_000001':
+    speaker_id: '4957'
+    example_id: '4957_30119_000070_000001'
+  '83_9960_000017_000003':
+    speaker_id: '83'
+    example_id: '83_9960_000017_000003'
+  '7059_77897_000017_000001':
+    speaker_id: '7059'
+    example_id: '7059_77897_000017_000001'
+  '1731_142320_000122_000005':
+    speaker_id: '1731'
+    example_id: '1731_142320_000122_000005'
+  '6918_47541_000006_000008':
+    speaker_id: '6918'
+    example_id: '6918_47541_000006_000008'
+  '6544_71420_000024_000001':
+    speaker_id: '6544'
+    example_id: '6544_71420_000024_000001'
+  '7245_104888_000016_000000':
+    speaker_id: '7245'
+    example_id: '7245_104888_000016_000000'
+  '5012_80192_000020_000003':
+    speaker_id: '5012'
+    example_id: '5012_80192_000020_000003'
+  '1422_149735_000006_000000':
+    speaker_id: '1422'
+    example_id: '1422_149735_000006_000000'
+  '14_212_000019_000000':
+    speaker_id: '14'
+    example_id: '14_212_000019_000000'
+  '1088_129236_000006_000007':
+    speaker_id: '1088'
+    example_id: '1088_129236_000006_000007'

app.py CHANGED Viewed

@@ -1,31 +1,39 @@
 import numpy as np
 from pathlib import Path
-import padertorch as pt
 import paderbox as pb
-import time
 import torch
-import torchaudio
 from onnxruntime import InferenceSession
 from pvq_manipulation.models.vits import Vits_NT
 from pvq_manipulation.models.ffjord import FFJORD
-from IPython.display import display, Audio, clear_output
 from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER
 import librosa
 from pvq_manipulation.helper.vad import EnergyVAD
 import gradio as gr
-device = 'cpu'  #'cuda' if torch.cuda.is_available() else 'cpu'
-# load tts model
-storage_dir_tts = Path("./models/tts_model/")
-tts_model = Vits_NT.load_model(storage_dir_tts, "model.pt")
 # load normalizing flow
 storage_dir_normalizing_flow = Path("./models/norm_flow")
-speaker_conditioning = pb.io.load(storage_dir_normalizing_flow / "speaker_conditioning.json")
 normalizing_flow = FFJORD.load_model(storage_dir_normalizing_flow, checkpoint="model.pt", device=device)
 # load hubert features model
 hubert_model = HubertExtractor(
     layer=SID_LARGE_LAYER,
@@ -35,140 +43,157 @@ hubert_model = HubertExtractor(
     # storage_dir= # target storage dir hubert model
 )
-# example synthesis
-# speaker_id = 1034
-# example_id = "1034_121119_000028_000001"
-# wav_1 = tts_model.synthesize_from_example({
-#     'text' : "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
-#     'd_vector_storage_root': f"./Saved_models/Dataset/Embeddings/{speaker_id}/{example_id}.pth"
-# })
-# display(Audio(wav_1, rate=24_000, normalize=True))
-# manipulation block
 def get_manipulation(
-    d_vector,
     labels,
-    flow,
     tts_model,
     manipulation_idx=0,
     manipulation_fkt=1,
 ):
     labels_manipulated = labels.clone()
-    labels_manipulated[:,manipulation_idx] += manipulation_fkt
-    output_forward = flow.forward((d_vector.float(), labels))[0]
     sampled_class_manipulated = flow.sample((output_forward, labels_manipulated))[0]
     wav = tts_model.synthesize_from_example({
-        'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
         'd_vector': d_vector.detach().numpy(),
         'd_vector_man': sampled_class_manipulated.detach().numpy(),
-    })
     return wav
-def extract_speaker_embedding(example):
-    observation, sr = pb.io.load_audio(example['audio_path']['observation'], return_sample_rate=True)
-    observation = librosa.resample(observation, orig_sr=sr, target_sr=16_000)
-    vad = EnergyVAD(sample_rate=16_000)
-    if observation.ndim == 1:
-        observation = observation[None, :]
-    observation = vad({'audio_data': observation})['audio_data']
-    with torch.no_grad():
-        example = tts_model.speaker_manager.prepare_example({'audio_data': {'observation': observation}, **example})
-        example = pt.data.utils.collate_fn([example])
-        example['features'] = torch.tensor(np.array(example['features']))
-        d_vector = tts_model.speaker_manager.forward(example)[0]
-    return d_vector
-# load speaker labels
-def load_speaker_labels(example, speaker_conditioning, reg_stor_dir=Path('./models/pvq_extractor/')):
-    audio, _ = torchaudio.load(example['audio_path']['observation'])
-    audio = audio.to(device)
-    num_samples = torch.tensor([audio.shape[-1]], device=device)
     providers = ["CPUExecutionProvider"]
     with torch.no_grad():
         features, seq_len = hubert_model(
-            audio,
-            24_000,
             sequence_lengths=num_samples,
         )
         features = np.mean(features.squeeze(0).detach().cpu().numpy(), axis=-1)
         pvqd_predictions = {}
-        for pvq in ['Breathiness', 'Loudness', 'Pitch', 'Resonance', 'Roughness', 'Strain', 'Weight']:
             with open(reg_stor_dir / f"{pvq}.onnx", "rb") as fid:
                 onnx = fid.read()
             sess = InferenceSession(onnx, providers=providers)
             pred = sess.run(None, {"X": features[None]})[0].squeeze(1)
             pvqd_predictions[pvq] = pred.tolist()[0]
-    labels = []
-    for key in speaker_conditioning:
-        labels.append(pvqd_predictions[key]/100)
-    return torch.tensor(labels)
-example = {
-    'audio_path': {'observation': "audio/1034_121119_000028_000001.wav"},
-    'speaker_id': 1034,
-    'example_id': "1034_121119_000028_000001",
-}
-labels = load_speaker_labels(example, speaker_conditioning)
-label_options = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']
-# print('Estimated PVQ strengths of input speaker:')
-# max_len = max(len(name) for name in label_options)
-# for label_name, pvq in zip(label_options, labels):
-    # print(f'{label_name:<{max_len}} : {pvq:6.2f}')
-def update_manipulation(manipulation_idx, manipulation_fkt):
-    d_vector = extract_speaker_embedding(example)
-    labels = load_speaker_labels(example, speaker_conditioning)
     wav_manipulated = get_manipulation(
-        # example=example,
-        d_vector=d_vector,
-        labels=labels[None, :],
         flow=normalizing_flow,
         tts_model=tts_model,
         manipulation_idx=manipulation_idx,
         manipulation_fkt=manipulation_fkt,
     )
-    wav_unmanipulated = tts_model.synthesize_from_example({
-        'text': "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
-        'd_vector': d_vector.detach().numpy(),
-    })
-    sr = 24_000
-    return (sr, wav_unmanipulated), (sr, wav_manipulated)
-    # with audio_output:
-    #     clear_output(wait=True)
-    #     print('Manipulated Speaker')
-    #     display(Audio(wav_manipulated, rate=24_000, normalize=True))
-    #     print('Unmanipulated Synthese')
-    #     display(Audio(wav_unmanipulated, rate=24_000, normalize=True))
-    #     print('Original Speaker')
-    #     display(Audio(example['audio_path']['observation'], rate=24_000, normalize=True))
-    # print(f"Manipulated {label_options[manipulation_idx]} with strength {manipulation_fkt}")
-dropdown_options = [(label, i) for i, label in enumerate(label_options)]
 demo = gr.Interface(
     title="Perceptual Voice Quality (PVQ) Manipulation",
     fn=update_manipulation,
     inputs=[
-        gr.Dropdown(label="PVQ Feature", choices=dropdown_options, value=2, type="index"),
-        gr.Slider(label="Manipulation Factor", minimum=-2.0, maximum=2.0, value=1.0, step=0.1),
     ],
     outputs=[gr.Audio(label="original utterance"), gr.Audio(label="manipulated utterance")],
 )

 import numpy as np
 from pathlib import Path
 import paderbox as pb
 import torch
 from onnxruntime import InferenceSession
 from pvq_manipulation.models.vits import Vits_NT
 from pvq_manipulation.models.ffjord import FFJORD
 from pvq_manipulation.models.hubert import HubertExtractor, SID_LARGE_LAYER
 import librosa
 from pvq_manipulation.helper.vad import EnergyVAD
 import gradio as gr
+from pvq_manipulation.helper.creapy_wrapper import process_file
+# Run on the GPU when torch can see one, otherwise fall back to the CPU.
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+# Order matters: load_speaker_labels builds the label vector in exactly this
+# order and appends 'Creak_mean' as the final entry.
+pvq_labels = ['Weight', 'Resonance', 'Breathiness', 'Roughness', 'Loudness', 'Strain', 'Pitch']
+# Maps example ids to their metadata; update_manipulation reads
+# dataset_dict['dataset'][example_id]['speaker_id'].
+dataset_dict = pb.io.load_yaml('./Dataset/dataset.yaml')
+# Single-entry cache that update_manipulation fills for the most recently
+# selected example, so repeated requests skip audio loading and label extraction.
+cached_example_id = None
+cached_loaded_example = None
+cached_labels = None
+cached_d_vector = None
+cached_unmanipulated = None
+# path to stats
+# get_manipulation reads "mean.json" and "std.json" from this directory.
+stats_path = Path('./Dataset/Embeddings/')
 # load normalizing flow
 storage_dir_normalizing_flow = Path("./models/norm_flow")
+# NOTE(review): config.json is parsed with the YAML loader; get_manipulation
+# reads its 'flag_remove_mean' entry.
+config_norm_flow = pb.io.load_yaml(storage_dir_normalizing_flow / "config.json")
 normalizing_flow = FFJORD.load_model(storage_dir_normalizing_flow, checkpoint="model.pt", device=device)
+# load tts model
+# NOTE(review): no device is passed here, unlike the flow above - confirm
+# where Vits_NT.load_model places the model.
+storage_dir_tts = Path("./models/tts_model/")
+tts_model = Vits_NT.load_model(storage_dir_tts, "model.pt")
 # load hubert features model
 hubert_model = HubertExtractor(
     layer=SID_LARGE_LAYER,
     # storage_dir= # target storage dir hubert model
 )
 def get_manipulation(
+    example,
     labels,
+    flow,
     tts_model,
+    d_vector,
+    config_norm_flow,
     manipulation_idx=0,
     manipulation_fkt=1,
 ):
     labels_manipulated = labels.clone()
+    labels_manipulated[:, manipulation_idx] += manipulation_fkt
+    if config_norm_flow['flag_remove_mean']:
+        global_mean = pb.io.load(stats_path / "mean.json")
+        global_mean = torch.tensor(global_mean, dtype=torch.float32)
+        speaker_embedding_norm = (d_vector - global_mean)
+        global_std = pb.io.load(stats_path / "std.json")
+        global_std = torch.tensor(global_std, dtype=torch.float32)
+        speaker_embedding_norm = speaker_embedding_norm / global_std
+    else:
+        speaker_embedding_norm = d_vector
+    output_forward = flow.forward((speaker_embedding_norm.float(), labels))[0]
     sampled_class_manipulated = flow.sample((output_forward, labels_manipulated))[0]
+    if config_norm_flow['flag_remove_mean']:
+        sampled_class_manipulated = (sampled_class_manipulated * global_std + global_mean)
     wav = tts_model.synthesize_from_example({
+        'text': example['transcription'],
         'd_vector': d_vector.detach().numpy(),
         'd_vector_man': sampled_class_manipulated.detach().numpy(),
+        'd_vector_storage_root': example['d_vector_storage_root'],
+    })
     return wav
+def get_creak_label(example):
+    audio_data = example['loaded_audio_data']['16_000']
+    test, y_pred, included_indices = process_file(audio_data)
+    mean_creak = np.mean(y_pred[included_indices])
+    return mean_creak * 100
+def load_speaker_labels(example, reg_stor_dir=Path('./models/pvq_extractor/')):
+    audio_data = torch.tensor(example['loaded_audio_data']['16_000'], dtype=torch.float)[None, :]
+    num_samples = torch.tensor([audio_data.shape[-1]])
+    if torch.cuda.is_available():
+        audio_data = audio_data.cuda()
+        num_samples = num_samples.cuda()
     providers = ["CPUExecutionProvider"]
     with torch.no_grad():
         features, seq_len = hubert_model(
+            audio_data,
+            16_000,
             sequence_lengths=num_samples,
         )
         features = np.mean(features.squeeze(0).detach().cpu().numpy(), axis=-1)
         pvqd_predictions = {}
+        for pvq in pvq_labels:
             with open(reg_stor_dir / f"{pvq}.onnx", "rb") as fid:
                 onnx = fid.read()
             sess = InferenceSession(onnx, providers=providers)
             pred = sess.run(None, {"X": features[None]})[0].squeeze(1)
             pvqd_predictions[pvq] = pred.tolist()[0]
+    pvqd_predictions['Creak_mean'] = get_creak_label(example)
+    labels = [pvqd_predictions[key] / 100 for key in pvq_labels + ["Creak_mean"]]
+    return torch.tensor(labels, device=device).float()
+def load_audio_files(example):
+    observation_loaded, sr = pb.io.load_audio(example['audio_path']['observation'], return_sample_rate=True)
+    example['loaded_audio_data'] = {}
+    observation = librosa.resample(observation_loaded, orig_sr=sr, target_sr=16_000)
+    vad = EnergyVAD(sample_rate=16_000)
+    if observation.ndim == 1:
+        observation = observation[None, :]
+    observation = vad({'audio_data': observation})['audio_data']
+    example['loaded_audio_data']['16_000'] = observation
+    observation = librosa.resample(observation, orig_sr=sr, target_sr=24_000)
+    vad = EnergyVAD(sample_rate=24_000)
+    if observation.ndim == 1:
+        observation = observation[None, :]
+    observation = vad({'audio_data': observation})['audio_data']
+    example['loaded_audio_data']['24_000'] = observation
+    return example
+def update_manipulation(manipulation_idx, example_id, transcription, manipulation_fkt):
+    global cached_example_id, cached_loaded_example, cached_labels, cached_d_vector, example_database, cached_unmanipulated
+    speaker_id = dataset_dict['dataset'][example_id]['speaker_id']
+    example = {
+        'audio_path': {'observation': f"./Dataset/Audio_files/{example_id}.wav"},
+        'd_vector_storage_root': f"./Saved_models/Dataset/Embeddings/{speaker_id}/{example_id}.pth",
+        'speaker_id': speaker_id,
+        'example_id': example_id,
+        'transcription': transcription
+    }
+    if cached_example_id != example_id:
+        cached_loaded_example = load_audio_files(example)
+        cached_d_vector = torch.load(f"./Dataset/Embeddings/{speaker_id}/{example_id}.pth")
+        cached_labels = load_speaker_labels(example)
+        cached_example_id = example_id
+        cached_unmanipulated = tts_model.synthesize_from_example({
+            'text': transcription,
+            'd_vector': cached_d_vector.detach().numpy(),
+        })
     wav_manipulated = get_manipulation(
+        example=example,
+        d_vector=cached_d_vector,
+        labels=cached_labels[None, :],
         flow=normalizing_flow,
         tts_model=tts_model,
         manipulation_idx=manipulation_idx,
         manipulation_fkt=manipulation_fkt,
+        config_norm_flow=config_norm_flow,
     )
+    return (24_000, cached_unmanipulated), (24_000, wav_manipulated)
 demo = gr.Interface(
     title="Perceptual Voice Quality (PVQ) Manipulation",
     fn=update_manipulation,
     inputs=[
+        gr.Dropdown(
+            label="PVQ Feature",
+            choices=[('Weight', 0), ('Resonance', 1), ('Breathiness', 2), ('Roughness', 3), ('Creak', 7)],
+            value=2, type="value"
+        ),
+        gr.Dropdown(
+            choices=dataset_dict['dataset'].keys(),
+            value='1422_149735_000006_000000', type="value"
+        ),
+        gr.Textbox(
+            value="It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
+            placeholder='Type something'
+        ),
+        gr.Slider(label="Manipulation Factor", minimum=-1.0, maximum=2.0, value=1.0, step=0.1),
     ],
     outputs=[gr.Audio(label="original utterance"), gr.Audio(label="manipulated utterance")],
 )

models/norm_flow/config.json CHANGED Viewed

@@ -1,12 +1,15 @@
 {
-  "factory": "pvq_manipulation.models.ffjord.FFJORD",
-  "normalize": true,
-  "ode_function": {
-    "condition_dim": 7,
-    "factory": "pvq_manipulation.models.ode_functions.CNFNN",
-    "hidden_channels": [
-      512
-    ],
-    "input_dim": 256
-  }
-}

 {
+  "model":{
+    "factory": "pvq_manipulation.models.ffjord.FFJORD",
+    "normalize": true,
+    "ode_function": {
+      "condition_dim": 8,
+      "factory": "pvq_manipulation.models.ode_functions.CNFNN",
+      "hidden_channels": [
+        512
+      ],
+      "input_dim": 256
+    }
+  },
+  "flag_remove_mean": true
+}