SandraCLV commited on
Commit
5f00263
·
1 Parent(s): 2dfdea5

Upload model.py

Browse files
Files changed (1) hide show
  1. model.py +286 -0
model.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022-2023 Xiaomi Corp. (authors: Fangjun Kuang)
2
+ #
3
+ # See LICENSE for clarification regarding multiple authors
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ from functools import lru_cache
18
+
19
+ import sherpa_onnx
20
+ from huggingface_hub import hf_hub_download
21
+
22
+
23
def get_file(
    repo_id: str,
    filename: str,
    subfolder: str = ".",
) -> str:
    """Download *filename* from the given Hugging Face repo and return its local path.

    Args:
      repo_id: Hugging Face repository id, e.g. "csukuangfj/vits-ljs".
      filename: File to fetch from the repo.
      subfolder: Subfolder within the repo; "." means the repo root.

    Returns:
      Absolute path of the downloaded (or cached) file.
    """
    return hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        subfolder=subfolder,
    )
34
+
35
+
36
@lru_cache(maxsize=10)
def _get_vits_vctk(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Build (and cache) the offline VITS-VCTK TTS engine.

    Args:
      repo_id: Must be "csukuangfj/vits-vctk".
      speed: Speech speed factor; mapped to VITS length_scale = 1/speed,
        so values > 1 speak faster.

    Returns:
      A configured ``sherpa_onnx.OfflineTts`` instance.

    Raises:
      ValueError: If *repo_id* is not the expected VCTK repo.
    """
    # Explicit raise instead of `assert` so validation survives `python -O`,
    # and for consistency with the ValueError raised by get_pretrained_model.
    if repo_id != "csukuangfj/vits-vctk":
        raise ValueError(f"Expected csukuangfj/vits-vctk, given: {repo_id}")

    model = get_file(
        repo_id=repo_id,
        filename="vits-vctk.onnx",
        subfolder=".",
    )

    lexicon = get_file(
        repo_id=repo_id,
        filename="lexicon.txt",
        subfolder=".",
    )

    tokens = get_file(
        repo_id=repo_id,
        filename="tokens.txt",
        subfolder=".",
    )

    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                model=model,
                lexicon=lexicon,
                tokens=tokens,
                length_scale=1.0 / speed,
            ),
            provider="cpu",
            debug=True,
            num_threads=2,
        )
    )
    tts = sherpa_onnx.OfflineTts(tts_config)

    return tts
74
+
75
+
76
@lru_cache(maxsize=10)
def _get_vits_ljs(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Build (and cache) the offline VITS-LJSpeech TTS engine.

    Args:
      repo_id: Must be "csukuangfj/vits-ljs".
      speed: Speech speed factor; mapped to VITS length_scale = 1/speed,
        so values > 1 speak faster.

    Returns:
      A configured ``sherpa_onnx.OfflineTts`` instance.

    Raises:
      ValueError: If *repo_id* is not the expected LJSpeech repo.
    """
    # Explicit raise instead of `assert` so validation survives `python -O`,
    # and for consistency with the ValueError raised by get_pretrained_model.
    if repo_id != "csukuangfj/vits-ljs":
        raise ValueError(f"Expected csukuangfj/vits-ljs, given: {repo_id}")

    model = get_file(
        repo_id=repo_id,
        filename="vits-ljs.onnx",
        subfolder=".",
    )

    lexicon = get_file(
        repo_id=repo_id,
        filename="lexicon.txt",
        subfolder=".",
    )

    tokens = get_file(
        repo_id=repo_id,
        filename="tokens.txt",
        subfolder=".",
    )

    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                model=model,
                lexicon=lexicon,
                tokens=tokens,
                length_scale=1.0 / speed,
            ),
            provider="cpu",
            debug=True,
            num_threads=2,
        )
    )
    tts = sherpa_onnx.OfflineTts(tts_config)

    return tts
114
+
115
+
116
@lru_cache(maxsize=10)
def _get_vits_piper(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Build (and cache) an offline TTS engine for a piper-converted VITS repo.

    Args:
      repo_id: Repo whose name starts with "vits-piper-", e.g.
        "csukuangfj/vits-piper-en_US-amy-low".
      speed: Speech speed factor; mapped to VITS length_scale = 1/speed.

    Returns:
      A configured ``sherpa_onnx.OfflineTts`` instance.
    """
    # The model file is named after the repo minus the "vits-piper-" prefix,
    # e.g. "csukuangfj/vits-piper-en_US-amy-low" -> "en_US-amy-low.onnx".
    prefix_len = len("vits-piper-")
    repo_name = repo_id.split("/")[1]
    model_name = repo_name[prefix_len:]

    vits_config = sherpa_onnx.OfflineTtsVitsModelConfig(
        model=get_file(
            repo_id=repo_id,
            filename=f"{model_name}.onnx",
            subfolder=".",
        ),
        lexicon=get_file(
            repo_id=repo_id,
            filename="lexicon.txt",
            subfolder=".",
        ),
        tokens=get_file(
            repo_id=repo_id,
            filename="tokens.txt",
            subfolder=".",
        ),
        length_scale=1.0 / speed,
    )

    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=vits_config,
            provider="cpu",
            debug=True,
            num_threads=2,
        )
    )

    return sherpa_onnx.OfflineTts(tts_config)
155
+
156
+
157
@lru_cache(maxsize=10)
def _get_vits_zh_aishell3(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Build (and cache) the offline VITS Chinese AISHELL-3 TTS engine.

    Args:
      repo_id: Must be "csukuangfj/vits-zh-aishell3".
      speed: Speech speed factor; mapped to VITS length_scale = 1/speed,
        so values > 1 speak faster.

    Returns:
      A configured ``sherpa_onnx.OfflineTts`` instance.

    Raises:
      ValueError: If *repo_id* is not the expected AISHELL-3 repo.
    """
    # Explicit raise instead of `assert` so validation survives `python -O`,
    # and for consistency with the ValueError raised by get_pretrained_model.
    if repo_id != "csukuangfj/vits-zh-aishell3":
        raise ValueError(f"Expected csukuangfj/vits-zh-aishell3, given: {repo_id}")

    model = get_file(
        repo_id=repo_id,
        filename="vits-aishell3.onnx",
        subfolder=".",
    )

    lexicon = get_file(
        repo_id=repo_id,
        filename="lexicon.txt",
        subfolder=".",
    )

    tokens = get_file(
        repo_id=repo_id,
        filename="tokens.txt",
        subfolder=".",
    )

    tts_config = sherpa_onnx.OfflineTtsConfig(
        model=sherpa_onnx.OfflineTtsModelConfig(
            vits=sherpa_onnx.OfflineTtsVitsModelConfig(
                model=model,
                lexicon=lexicon,
                tokens=tokens,
                length_scale=1.0 / speed,
            ),
            provider="cpu",
            debug=True,
            num_threads=2,
        )
    )
    tts = sherpa_onnx.OfflineTts(tts_config)

    return tts
195
+
196
+
197
@lru_cache(maxsize=10)
def get_pretrained_model(repo_id: str, speed: float) -> sherpa_onnx.OfflineTts:
    """Look up *repo_id* in the per-language model registries and build its engine.

    Args:
      repo_id: One of the repos listed in the language model dicts below.
      speed: Speech speed factor, forwarded to the model factory.

    Returns:
      A configured ``sherpa_onnx.OfflineTts`` instance.

    Raises:
      ValueError: If *repo_id* is not found in any registry.
    """
    registries = (
        chinese_models,
        english_models,
        german_models,
        spanish_models,
        french_models,
    )
    for registry in registries:
        factory = registry.get(repo_id)
        if factory is not None:
            return factory(repo_id, speed)
    raise ValueError(f"Unsupported repo_id: {repo_id}")
211
+
212
+
213
# Registries mapping a Hugging Face repo_id to the factory that builds its
# TTS engine.  get_pretrained_model dispatches through these dicts, and the
# keys double as the user-visible model choices per language.

chinese_models = {
    "csukuangfj/vits-zh-aishell3": _get_vits_zh_aishell3,
    # "csukuangfj/vits-piper-zh_CN-huayan-x_low": _get_vits_piper,
    # "csukuangfj/vits-piper-zh_CN-huayan-medium": _get_vits_piper,
}

english_models = {
    "csukuangfj/vits-vctk": _get_vits_vctk,  # 109 speakers
    "csukuangfj/vits-ljs": _get_vits_ljs,
    # piper, US
    "csukuangfj/vits-piper-en_US-amy-low": _get_vits_piper,
    "csukuangfj/vits-piper-en_US-amy-medium": _get_vits_piper,
    "csukuangfj/vits-piper-en_US-arctic-medium": _get_vits_piper,  # 18 speakers
    "csukuangfj/vits-piper-en_US-danny-low": _get_vits_piper,
    "csukuangfj/vits-piper-en_US-hfc_male-medium": _get_vits_piper,
    "csukuangfj/vits-piper-en_US-joe-medium": _get_vits_piper,
    "csukuangfj/vits-piper-en_US-kathleen-low": _get_vits_piper,
    "csukuangfj/vits-piper-en_US-kusal-medium": _get_vits_piper,
    "csukuangfj/vits-piper-en_US-l2arctic-medium": _get_vits_piper,  # 24 speakers
    "csukuangfj/vits-piper-en_US-lessac-low": _get_vits_piper,
    "csukuangfj/vits-piper-en_US-lessac-medium": _get_vits_piper,
    "csukuangfj/vits-piper-en_US-lessac-high": _get_vits_piper,
    "csukuangfj/vits-piper-en_US-libritts-high": _get_vits_piper,  # 904 speakers
    "csukuangfj/vits-piper-en_US-libritts_r-medium": _get_vits_piper,  # 904 speakers
    "csukuangfj/vits-piper-en_US-ryan-low": _get_vits_piper,
    "csukuangfj/vits-piper-en_US-ryan-medium": _get_vits_piper,
    "csukuangfj/vits-piper-en_US-ryan-high": _get_vits_piper,
    # piper, GB
    "csukuangfj/vits-piper-en_GB-alan-low": _get_vits_piper,
    "csukuangfj/vits-piper-en_GB-alan-medium": _get_vits_piper,
    "csukuangfj/vits-piper-en_GB-alba-medium": _get_vits_piper,
    "csukuangfj/vits-piper-en_GB-jenny_dioco-medium": _get_vits_piper,
    "csukuangfj/vits-piper-en_GB-northern_english_male-medium": _get_vits_piper,
    "csukuangfj/vits-piper-en_GB-semaine-medium": _get_vits_piper,
    "csukuangfj/vits-piper-en_GB-southern_english_female-low": _get_vits_piper,
    "csukuangfj/vits-piper-en_GB-vctk-medium": _get_vits_piper,
}

german_models = {
    "csukuangfj/vits-piper-de_DE-eva_k-x_low": _get_vits_piper,
    "csukuangfj/vits-piper-de_DE-karlsson-low": _get_vits_piper,
    "csukuangfj/vits-piper-de_DE-kerstin-low": _get_vits_piper,
    "csukuangfj/vits-piper-de_DE-pavoque-low": _get_vits_piper,
    "csukuangfj/vits-piper-de_DE-ramona-low": _get_vits_piper,
    "csukuangfj/vits-piper-de_DE-thorsten-low": _get_vits_piper,
    "csukuangfj/vits-piper-de_DE-thorsten-medium": _get_vits_piper,
    "csukuangfj/vits-piper-de_DE-thorsten-high": _get_vits_piper,
    "csukuangfj/vits-piper-de_DE-thorsten_emotional-medium": _get_vits_piper,  # 8 speakers
}

spanish_models = {
    "csukuangfj/vits-piper-es_ES-carlfm-x_low": _get_vits_piper,
    "csukuangfj/vits-piper-es_ES-davefx-medium": _get_vits_piper,
    "csukuangfj/vits-piper-es_ES-mls_10246-low": _get_vits_piper,
    "csukuangfj/vits-piper-es_ES-mls_9972-low": _get_vits_piper,
    "csukuangfj/vits-piper-es_ES-sharvard-medium": _get_vits_piper,  # 2 speakers
    "csukuangfj/vits-piper-es_MX-ald-medium": _get_vits_piper,
}

french_models = {
    # "csukuangfj/vits-piper-fr_FR-gilles-low": _get_vits_piper,
    # "csukuangfj/vits-piper-fr_FR-mls_1840-low": _get_vits_piper,
    "csukuangfj/vits-piper-fr_FR-upmc-medium": _get_vits_piper,  # 2 speakers, 0-female, 1-male
    "csukuangfj/vits-piper-fr_FR-siwis-low": _get_vits_piper,  # female
    "csukuangfj/vits-piper-fr_FR-siwis-medium": _get_vits_piper,
}

# Language name -> list of repo_ids, e.g. for populating a UI dropdown.
language_to_models = {
    "English": list(english_models.keys()),
    "Chinese": list(chinese_models.keys()),
    "German": list(german_models.keys()),
    "Spanish": list(spanish_models.keys()),
    "French": list(french_models.keys()),
}