Spaces:
Running
on
Zero
Running
on
Zero
Add more voice clone voices; Update model names; Update playground
Browse files
- .gitattributes +8 -1
- app.py +120 -22
- higgs_audio/serve/serve_engine.py +50 -0
- voice_examples/{wizard.wav → belinda.wav} +2 -2
- voice_examples/broom_salesman.wav +3 -0
- voice_examples/chadwick.wav +3 -0
- voice_examples/config.json +31 -3
- voice_examples/en_man.wav +3 -0
- voice_examples/en_woman.wav +3 -0
- voice_examples/mabel.wav +3 -0
- voice_examples/vex.wav +3 -0
- voice_examples/zh_man_sichuan.wav +3 -0
.gitattributes
CHANGED
@@ -33,4 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
-
voice_examples/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
voice_examples/en_woman.wav filter=lfs diff=lfs merge=lfs -text
|
37 |
+
voice_examples/mabel.wav filter=lfs diff=lfs merge=lfs -text
|
38 |
+
voice_examples/vex.wav filter=lfs diff=lfs merge=lfs -text
|
39 |
+
voice_examples/zh_man_sichuan.wav filter=lfs diff=lfs merge=lfs -text
|
40 |
+
voice_examples/belinda.wav filter=lfs diff=lfs merge=lfs -text
|
41 |
+
voice_examples/broom_salesman.wav filter=lfs diff=lfs merge=lfs -text
|
42 |
+
voice_examples/chadwick.wav filter=lfs diff=lfs merge=lfs -text
|
43 |
+
voice_examples/en_man.wav filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -25,8 +25,8 @@ from higgs_audio.data_types import ChatMLSample, AudioContent, Message
|
|
25 |
engine = None
|
26 |
|
27 |
# Default model configuration
|
28 |
-
DEFAULT_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-
|
29 |
-
DEFAULT_AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer
|
30 |
SAMPLE_RATE = 24000
|
31 |
|
32 |
DEFAULT_SYSTEM_PROMPT = (
|
@@ -43,46 +43,54 @@ PREDEFINED_EXAMPLES = {
|
|
43 |
"voice-clone": {
|
44 |
"system_prompt": "",
|
45 |
"input_text": "Hey there! I'm your friendly voice twin in the making. Pick a voice preset below or upload your own audio - let's clone some vocals and bring your voice to life! ",
|
46 |
-
"description": "Voice clone
|
47 |
},
|
48 |
-
"
|
49 |
"system_prompt": DEFAULT_SYSTEM_PROMPT,
|
50 |
-
"input_text": "
|
51 |
-
"description": "
|
52 |
},
|
53 |
-
"multispeaker-
|
54 |
-
"system_prompt": "
|
|
|
|
|
55 |
"<|scene_desc_start|>\n"
|
56 |
-
"SPEAKER0:
|
57 |
-
"SPEAKER1: masculine
|
58 |
-
"In this scene, a group of adventurers is debating whether to investigate a potentially dangerous situation.\n"
|
59 |
"<|scene_desc_end|>",
|
60 |
-
"input_text": "
|
61 |
-
"
|
|
|
|
|
|
|
62 |
},
|
63 |
-
"single-speaker-
|
64 |
"system_prompt": "Generate audio following instruction.\n\n"
|
65 |
"<|scene_desc_start|>\n"
|
66 |
-
"SPEAKER0: British accent
|
67 |
"<|scene_desc_end|>",
|
68 |
"input_text": "Hey, everyone! Welcome back to Tech Talk Tuesdays.\n"
|
69 |
"It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n"
|
70 |
"And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
|
71 |
"\n"
|
72 |
"So here's the big question: Do you want to understand how deep learning works?\n",
|
73 |
-
"description": "Single speaker
|
74 |
},
|
75 |
"single-speaker-zh": {
|
76 |
"system_prompt": "Generate audio following instruction.\n\n"
|
77 |
"<|scene_desc_start|>\n"
|
78 |
-
"
|
79 |
-
"\nSPEAKER0: feminine\n"
|
80 |
"<|scene_desc_end|>",
|
81 |
"input_text": "大家好, 欢迎收听本期的跟李沐学AI. 今天沐哥在忙着洗数据, 所以由我, 希格斯主播代替他讲这期视频.\n"
|
82 |
"今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
|
83 |
"那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
|
84 |
"或者说, 你能察觉到我其实是个机器人吗?",
|
85 |
-
"description": "Single speaker
|
|
|
|
|
|
|
|
|
|
|
86 |
},
|
87 |
}
|
88 |
|
@@ -130,6 +138,62 @@ def get_voice_present(voice_preset):
|
|
130 |
return voice_path, text
|
131 |
|
132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
@spaces.GPU
|
134 |
def initialize_engine(model_path, audio_tokenizer_path) -> bool:
|
135 |
"""Initialize the HiggsAudioServeEngine."""
|
@@ -200,6 +264,7 @@ def prepare_chatml_sample(
|
|
200 |
messages.append(Message(role="assistant", content=[audio_content]))
|
201 |
|
202 |
# Add the main user message
|
|
|
203 |
messages.append(Message(role="user", content=text))
|
204 |
|
205 |
return ChatMLSample(messages=messages)
|
@@ -217,6 +282,8 @@ def text_to_speech(
|
|
217 |
top_k=50,
|
218 |
system_prompt=DEFAULT_SYSTEM_PROMPT,
|
219 |
stop_strings=None,
|
|
|
|
|
220 |
):
|
221 |
"""Convert text to speech using HiggsAudioServeEngine."""
|
222 |
global engine
|
@@ -237,7 +304,8 @@ def text_to_speech(
|
|
237 |
request_id = f"tts-playground-{str(uuid.uuid4())}"
|
238 |
logger.info(
|
239 |
f"{request_id}: Generating speech for text: {text[:100]}..., \n"
|
240 |
-
f"with parameters: temperature={temperature}, top_p={top_p}, top_k={top_k}, stop_list={stop_list}"
|
|
|
241 |
)
|
242 |
start_time = time.time()
|
243 |
|
@@ -249,6 +317,8 @@ def text_to_speech(
|
|
249 |
top_k=top_k if top_k > 0 else None,
|
250 |
top_p=top_p,
|
251 |
stop_strings=stop_list,
|
|
|
|
|
252 |
)
|
253 |
|
254 |
generation_time = time.time() - start_time
|
@@ -312,7 +382,7 @@ def create_ui():
|
|
312 |
}
|
313 |
"""
|
314 |
|
315 |
-
default_template = "
|
316 |
|
317 |
"""Create the Gradio UI."""
|
318 |
with gr.Blocks(theme=my_theme, css=custom_css) as demo:
|
@@ -329,6 +399,12 @@ def create_ui():
|
|
329 |
info="Select a predefined example for system and input messages. Voice preset will be set to EMPTY when a example is selected.",
|
330 |
)
|
331 |
|
|
|
|
|
|
|
|
|
|
|
|
|
332 |
system_prompt = gr.TextArea(
|
333 |
label="System Prompt",
|
334 |
placeholder="Enter system prompt to guide the model...",
|
@@ -378,6 +454,22 @@ def create_ui():
|
|
378 |
)
|
379 |
top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top P")
|
380 |
top_k = gr.Slider(minimum=-1, maximum=100, value=50, step=1, label="Top K")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
381 |
# Add stop strings component
|
382 |
stop_strings = gr.Dataframe(
|
383 |
label="Stop Strings",
|
@@ -437,10 +529,12 @@ def create_ui():
|
|
437 |
template = PREDEFINED_EXAMPLES[template_name]
|
438 |
# Enable voice preset and custom reference only for voice-clone template
|
439 |
is_voice_clone = template_name == "voice-clone"
|
440 |
-
voice_preset_value = "
|
|
|
441 |
return (
|
442 |
template["system_prompt"], # system_prompt
|
443 |
template["input_text"], # input_text
|
|
|
444 |
gr.update(
|
445 |
value=voice_preset_value, interactive=is_voice_clone, visible=is_voice_clone
|
446 |
), # voice_preset (value and interactivity)
|
@@ -454,6 +548,7 @@ def create_ui():
|
|
454 |
gr.update(),
|
455 |
gr.update(),
|
456 |
gr.update(),
|
|
|
457 |
) # No change if template not found
|
458 |
|
459 |
# Set up event handlers
|
@@ -465,6 +560,7 @@ def create_ui():
|
|
465 |
outputs=[
|
466 |
system_prompt,
|
467 |
input_text,
|
|
|
468 |
voice_preset,
|
469 |
custom_reference_accordion,
|
470 |
voice_samples_section,
|
@@ -485,6 +581,8 @@ def create_ui():
|
|
485 |
top_k,
|
486 |
system_prompt,
|
487 |
stop_strings,
|
|
|
|
|
488 |
],
|
489 |
outputs=[output_text, output_audio],
|
490 |
api_name="generate_speech",
|
|
|
25 |
engine = None
|
26 |
|
27 |
# Default model configuration
|
28 |
+
DEFAULT_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
|
29 |
+
DEFAULT_AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer"
|
30 |
SAMPLE_RATE = 24000
|
31 |
|
32 |
DEFAULT_SYSTEM_PROMPT = (
|
|
|
43 |
"voice-clone": {
|
44 |
"system_prompt": "",
|
45 |
"input_text": "Hey there! I'm your friendly voice twin in the making. Pick a voice preset below or upload your own audio - let's clone some vocals and bring your voice to life! ",
|
46 |
+
"description": "Voice clone to clone the reference audio. Leave the system prompt empty.",
|
47 |
},
|
48 |
+
"smart-voice": {
|
49 |
"system_prompt": DEFAULT_SYSTEM_PROMPT,
|
50 |
+
"input_text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
|
51 |
+
"description": "Smart voice to generate speech based on the context",
|
52 |
},
|
53 |
+
"multispeaker-voice-description": {
|
54 |
+
"system_prompt": "You are an AI assistant designed to convert text into speech.\n"
|
55 |
+
"If the user's message includes a [SPEAKER*] tag, do not read out the tag and generate speech for the following text, using the specified voice.\n"
|
56 |
+
"If no speaker tag is present, select a suitable voice on your own.\n\n"
|
57 |
"<|scene_desc_start|>\n"
|
58 |
+
"SPEAKER0: feminine\n"
|
59 |
+
"SPEAKER1: masculine\n"
|
|
|
60 |
"<|scene_desc_end|>",
|
61 |
+
"input_text": "[SPEAKER0] I can't believe you did that without even asking me first!\n"
|
62 |
+
"[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.\n"
|
63 |
+
"[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!\n"
|
64 |
+
"[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act.",
|
65 |
+
"description": "Multispeaker with different voice descriptions in the system prompt",
|
66 |
},
|
67 |
+
"single-speaker-voice-description": {
|
68 |
"system_prompt": "Generate audio following instruction.\n\n"
|
69 |
"<|scene_desc_start|>\n"
|
70 |
+
"SPEAKER0: He speaks with a clear British accent and a conversational, inquisitive tone. His delivery is articulate and at a moderate pace, and very clear audio.\n"
|
71 |
"<|scene_desc_end|>",
|
72 |
"input_text": "Hey, everyone! Welcome back to Tech Talk Tuesdays.\n"
|
73 |
"It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n"
|
74 |
"And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
|
75 |
"\n"
|
76 |
"So here's the big question: Do you want to understand how deep learning works?\n",
|
77 |
+
"description": "Single speaker with voice description in the system prompt",
|
78 |
},
|
79 |
"single-speaker-zh": {
|
80 |
"system_prompt": "Generate audio following instruction.\n\n"
|
81 |
"<|scene_desc_start|>\n"
|
82 |
+
"Audio is recorded from a quiet room.\n"
|
|
|
83 |
"<|scene_desc_end|>",
|
84 |
"input_text": "大家好, 欢迎收听本期的跟李沐学AI. 今天沐哥在忙着洗数据, 所以由我, 希格斯主播代替他讲这期视频.\n"
|
85 |
"今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
|
86 |
"那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
|
87 |
"或者说, 你能察觉到我其实是个机器人吗?",
|
88 |
+
"description": "Single speaker speaking Chinese",
|
89 |
+
},
|
90 |
+
"single-speaker-bgm": {
|
91 |
+
"system_prompt": DEFAULT_SYSTEM_PROMPT,
|
92 |
+
"input_text": "<SE_s>[Music]</SE_s> I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. <SE_e>[Music]</SE_e>",
|
93 |
+
"description": "Single speaker with BGM using music tag. This is an experimental feature and may need to try multiple times to get the best result.",
|
94 |
},
|
95 |
}
|
96 |
|
|
|
138 |
return voice_path, text
|
139 |
|
140 |
|
141 |
+
def normalize_chinese_punctuation(text):
    """Convert Chinese (full-width) punctuation marks to English (half-width) equivalents.

    Args:
        text: Input string, possibly containing CJK / full-width punctuation.

    Returns:
        A copy of ``text`` in which every mapped punctuation character is
        replaced by its ASCII counterpart. Characters that are not in the
        mapping (including punctuation that is already ASCII) are unchanged.
    """
    # Keys are spelled as explicit code-point escapes. If a full-width key is
    # folded to its ASCII look-alike (e.g. "," -> ", "), the entry stops
    # matching Chinese text and starts rewriting ordinary ASCII punctuation.
    chinese_to_english_punct = {
        "\uff0c": ", ",  # full-width comma (followed by a space in this variant)
        "\u3002": ".",  # ideographic full stop
        "\uff1a": ":",  # full-width colon
        "\uff1b": ";",  # full-width semicolon
        "\uff1f": "?",  # full-width question mark
        "\uff01": "!",  # full-width exclamation mark
        "\uff08": "(",  # full-width left parenthesis
        "\uff09": ")",  # full-width right parenthesis
        "\u3010": "[",  # left black lenticular bracket
        "\u3011": "]",  # right black lenticular bracket
        "\u300a": "<",  # left double angle bracket
        "\u300b": ">",  # right double angle bracket
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
        "\u3001": ",",  # enumeration (ideographic) comma
        "\u2014": "-",  # em dash
        "\u2026": "...",  # horizontal ellipsis
        "\u00b7": ".",  # middle dot
        "\u300c": '"',  # left corner bracket
        "\u300d": '"',  # right corner bracket
        "\u300e": '"',  # left white corner bracket
        "\u300f": '"',  # right white corner bracket
    }

    # Every key is a single character and no replacement contains a key, so a
    # single translate() pass gives the same result as chained str.replace().
    return text.translate(str.maketrans(chinese_to_english_punct))
|
178 |
+
|
179 |
+
|
180 |
+
def normalize_text(transcript: str):
    """Normalize a transcript before it is used as the user message.

    Steps, in order: fold Chinese punctuation to ASCII, substitute a few
    symbols, collapse runs of whitespace inside each line, drop blank lines,
    strip the result, and append a period when the text does not already end
    with one of the accepted terminators.

    Args:
        transcript: Raw input text.

    Returns:
        The normalized transcript.
    """
    cleaned = normalize_chinese_punctuation(transcript)

    # Other normalizations (e.g., parentheses and other symbols. Will be improved in the future)
    substitutions = (
        ("(", " "),
        (")", " "),
        ("°F", " degrees Fahrenheit"),
        ("°C", " degrees Celsius"),
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)

    # Rebuild line by line: whitespace-only lines are dropped and the words of
    # every remaining line are re-joined with single spaces.
    kept_lines = []
    for raw_line in cleaned.split("\n"):
        words = raw_line.split()
        if words:
            kept_lines.append(" ".join(words))
    cleaned = "\n".join(kept_lines).strip()

    # Text that already ends in punctuation, a quote, or a closing
    # sound-effect tag is returned as is; anything else gets a final period.
    terminators = (".", "!", "?", ",", ";", '"', "'", "</SE_e>", "</SE>")
    if cleaned.endswith(terminators):
        return cleaned
    return cleaned + "."
|
195 |
+
|
196 |
+
|
197 |
@spaces.GPU
|
198 |
def initialize_engine(model_path, audio_tokenizer_path) -> bool:
|
199 |
"""Initialize the HiggsAudioServeEngine."""
|
|
|
264 |
messages.append(Message(role="assistant", content=[audio_content]))
|
265 |
|
266 |
# Add the main user message
|
267 |
+
text = normalize_text(text)
|
268 |
messages.append(Message(role="user", content=text))
|
269 |
|
270 |
return ChatMLSample(messages=messages)
|
|
|
282 |
top_k=50,
|
283 |
system_prompt=DEFAULT_SYSTEM_PROMPT,
|
284 |
stop_strings=None,
|
285 |
+
ras_win_len=20,
|
286 |
+
ras_win_max_num_repeat=2,
|
287 |
):
|
288 |
"""Convert text to speech using HiggsAudioServeEngine."""
|
289 |
global engine
|
|
|
304 |
request_id = f"tts-playground-{str(uuid.uuid4())}"
|
305 |
logger.info(
|
306 |
f"{request_id}: Generating speech for text: {text[:100]}..., \n"
|
307 |
+
f"with parameters: temperature={temperature}, top_p={top_p}, top_k={top_k}, stop_list={stop_list}, "
|
308 |
+
f"ras_win_len={ras_win_len}, ras_win_max_num_repeat={ras_win_max_num_repeat}"
|
309 |
)
|
310 |
start_time = time.time()
|
311 |
|
|
|
317 |
top_k=top_k if top_k > 0 else None,
|
318 |
top_p=top_p,
|
319 |
stop_strings=stop_list,
|
320 |
+
ras_win_len=ras_win_len if ras_win_len > 0 else None,
|
321 |
+
ras_win_max_num_repeat=max(ras_win_len, ras_win_max_num_repeat),
|
322 |
)
|
323 |
|
324 |
generation_time = time.time() - start_time
|
|
|
382 |
}
|
383 |
"""
|
384 |
|
385 |
+
default_template = "smart-voice"
|
386 |
|
387 |
"""Create the Gradio UI."""
|
388 |
with gr.Blocks(theme=my_theme, css=custom_css) as demo:
|
|
|
399 |
info="Select a predefined example for system and input messages. Voice preset will be set to EMPTY when a example is selected.",
|
400 |
)
|
401 |
|
402 |
+
# Template description display
|
403 |
+
template_description = gr.HTML(
|
404 |
+
value=f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {PREDEFINED_EXAMPLES[default_template]["description"]}</p>',
|
405 |
+
visible=True,
|
406 |
+
)
|
407 |
+
|
408 |
system_prompt = gr.TextArea(
|
409 |
label="System Prompt",
|
410 |
placeholder="Enter system prompt to guide the model...",
|
|
|
454 |
)
|
455 |
top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top P")
|
456 |
top_k = gr.Slider(minimum=-1, maximum=100, value=50, step=1, label="Top K")
|
457 |
+
ras_win_len = gr.Slider(
|
458 |
+
minimum=0,
|
459 |
+
maximum=10,
|
460 |
+
value=0,
|
461 |
+
step=1,
|
462 |
+
label="RAS Window Length",
|
463 |
+
info="Window length for repetition avoidance sampling",
|
464 |
+
)
|
465 |
+
ras_win_max_num_repeat = gr.Slider(
|
466 |
+
minimum=1,
|
467 |
+
maximum=10,
|
468 |
+
value=2,
|
469 |
+
step=1,
|
470 |
+
label="RAS Max Num Repeat",
|
471 |
+
info="Maximum number of repetitions allowed in the window",
|
472 |
+
)
|
473 |
# Add stop strings component
|
474 |
stop_strings = gr.Dataframe(
|
475 |
label="Stop Strings",
|
|
|
529 |
template = PREDEFINED_EXAMPLES[template_name]
|
530 |
# Enable voice preset and custom reference only for voice-clone template
|
531 |
is_voice_clone = template_name == "voice-clone"
|
532 |
+
voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
|
533 |
+
description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
|
534 |
return (
|
535 |
template["system_prompt"], # system_prompt
|
536 |
template["input_text"], # input_text
|
537 |
+
description_text, # template_description
|
538 |
gr.update(
|
539 |
value=voice_preset_value, interactive=is_voice_clone, visible=is_voice_clone
|
540 |
), # voice_preset (value and interactivity)
|
|
|
548 |
gr.update(),
|
549 |
gr.update(),
|
550 |
gr.update(),
|
551 |
+
gr.update(),
|
552 |
) # No change if template not found
|
553 |
|
554 |
# Set up event handlers
|
|
|
560 |
outputs=[
|
561 |
system_prompt,
|
562 |
input_text,
|
563 |
+
template_description,
|
564 |
voice_preset,
|
565 |
custom_reference_accordion,
|
566 |
voice_samples_section,
|
|
|
581 |
top_k,
|
582 |
system_prompt,
|
583 |
stop_strings,
|
584 |
+
ras_win_len,
|
585 |
+
ras_win_max_num_repeat,
|
586 |
],
|
587 |
outputs=[output_text, output_audio],
|
588 |
api_name="generate_speech",
|
higgs_audio/serve/serve_engine.py
CHANGED
@@ -27,6 +27,45 @@ from ..data_collator.higgs_audio_collator import HiggsAudioSampleCollator
|
|
27 |
from ..audio_processing.higgs_audio_tokenizer import load_higgs_audio_tokenizer
|
28 |
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
@dataclass
|
31 |
class HiggsAudioStreamerDelta:
|
32 |
"""Represents a chunk of generated content, either text or audio tokens."""
|
@@ -422,3 +461,14 @@ class HiggsAudioServeEngine:
|
|
422 |
"cached_tokens": 0,
|
423 |
},
|
424 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
from ..audio_processing.higgs_audio_tokenizer import load_higgs_audio_tokenizer
|
28 |
|
29 |
|
30 |
+
def normalize_chinese_punctuation(text):
    """Convert Chinese (full-width) punctuation marks to English (half-width) equivalents.

    Args:
        text: Input string, possibly containing CJK / full-width punctuation.

    Returns:
        A copy of ``text`` in which every mapped punctuation character is
        replaced by its ASCII counterpart. Characters that are not in the
        mapping (including punctuation that is already ASCII) are unchanged.
    """
    # Keys are spelled as explicit code-point escapes. If a full-width key is
    # folded to its ASCII look-alike (e.g. "," -> ","), the entry becomes a
    # no-op and the Chinese character it was meant to match is left in place.
    chinese_to_english_punct = {
        "\uff0c": ",",  # full-width comma
        "\u3002": ".",  # ideographic full stop
        "\uff1a": ":",  # full-width colon
        "\uff1b": ";",  # full-width semicolon
        "\uff1f": "?",  # full-width question mark
        "\uff01": "!",  # full-width exclamation mark
        "\uff08": "(",  # full-width left parenthesis
        "\uff09": ")",  # full-width right parenthesis
        "\u3010": "[",  # left black lenticular bracket
        "\u3011": "]",  # right black lenticular bracket
        "\u300a": "<",  # left double angle bracket
        "\u300b": ">",  # right double angle bracket
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
        "\u3001": ",",  # enumeration (ideographic) comma
        "\u2014": "-",  # em dash
        "\u2026": "...",  # horizontal ellipsis
        "\u00b7": ".",  # middle dot
        "\u300c": '"',  # left corner bracket
        "\u300d": '"',  # right corner bracket
        "\u300e": '"',  # left white corner bracket
        "\u300f": '"',  # right white corner bracket
    }

    # Every key is a single character and no replacement contains a key, so a
    # single translate() pass gives the same result as chained str.replace().
    return text.translate(str.maketrans(chinese_to_english_punct))
|
67 |
+
|
68 |
+
|
69 |
@dataclass
|
70 |
class HiggsAudioStreamerDelta:
|
71 |
"""Represents a chunk of generated content, either text or audio tokens."""
|
|
|
461 |
"cached_tokens": 0,
|
462 |
},
|
463 |
)
|
464 |
+
|
465 |
+
def text_normalize(self, text: str) -> str:
    """Apply basic normalization to ``text`` and return the result.

    Args:
        text: Raw input text.

    Returns:
        The text after Chinese-punctuation folding, with every "(" and ")"
        replaced by a single space.
    """
    # Punctuation folding runs first; parentheses are then blanked out
    # (replaced by a space, not removed).
    normalized = normalize_chinese_punctuation(text)
    return normalized.replace("(", " ").replace(")", " ")
|
voice_examples/{wizard.wav → belinda.wav}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e663310bfe539efac3350fd6b277214dcddd65d5a46949180f11c719c8b9b769
|
3 |
+
size 896776
|
voice_examples/broom_salesman.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c9cb4f37dcac12227045845c07c8aef823519cbf7b62bcbc6223158f9d282e1a
|
3 |
+
size 3383338
|
voice_examples/chadwick.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:166acd9a8d8bf3e205bf8217dfd47f8232437c0ea128c326bd1a9060c099e003
|
3 |
+
size 458796
|
voice_examples/config.json
CHANGED
@@ -1,6 +1,34 @@
|
|
1 |
{
|
2 |
-
"
|
3 |
-
"transcript": "
|
4 |
-
"audio_file": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
}
|
6 |
}
|
|
|
1 |
{
|
2 |
+
"belinda": {
|
3 |
+
"transcript": "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year.",
|
4 |
+
"audio_file": "belinda.wav"
|
5 |
+
},
|
6 |
+
"broom_salesman": {
|
7 |
+
"transcript": "I would imagine so. A wand with a dragon heartstring core is capable of dazzling magic. And the bond between you and your wand should only grow stronger. Do not be surprised at your new wand's ability to perceive your intentions - particularly in a moment of need.",
|
8 |
+
"audio_file": "broom_salesman.wav"
|
9 |
+
},
|
10 |
+
"chadwick": {
|
11 |
+
"transcript": "Oh dear, who left all this junk lying around? Whoops, there it goes! Mind your pointed little pink head, starfish man.",
|
12 |
+
"audio_file": "chadwick.wav"
|
13 |
+
},
|
14 |
+
"en_man": {
|
15 |
+
"transcript": "Maintaining your ability to learn translates into increased marketability, improved career options and higher salaries.",
|
16 |
+
"audio_file": "en_man.wav"
|
17 |
+
},
|
18 |
+
"en_woman": {
|
19 |
+
"transcript": "The device would work during the day as well, if you took steps to either block direct sunlight or point it away from the sun.",
|
20 |
+
"audio_file": "en_woman.wav"
|
21 |
+
},
|
22 |
+
"mabel": {
|
23 |
+
"transcript": "You do talk an awful lot about weather, did you know that? Sometimes I wonder if you're actually content to be a wizard or if you're secretly harbouring a desire to become a seer of the clouds.",
|
24 |
+
"audio_file": "mabel.wav"
|
25 |
+
},
|
26 |
+
"vex": {
|
27 |
+
"transcript": "Uhh, this is going to take forever. Why is everything so far?",
|
28 |
+
"audio_file": "vex.wav"
|
29 |
+
},
|
30 |
+
"zh_man_sichuan": {
|
31 |
+
"transcript": "对,这就是我,万人敬仰的太乙真人,虽然有点婴儿肥,但也掩不住我逼人的帅气。",
|
32 |
+
"audio_file": "zh_man_sichuan.wav"
|
33 |
}
|
34 |
}
|
voice_examples/en_man.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1ca3df71ad1b6968765e69870220d34c6b2c2550a499cf59560d9d764d10b94e
|
3 |
+
size 375566
|
voice_examples/en_woman.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e1d49dc69f3b0731ed7b10ddf51dfc8f73465d4323f45841d93583d8b1e4d3e6
|
3 |
+
size 313272
|
voice_examples/mabel.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9e6c5e522c662c5d6b862d8b17e1618546666ce993dcd560f3bdd34a48bacd9f
|
3 |
+
size 1054730
|
voice_examples/vex.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d95c6dcf7265847edd76989ffb2d3f5a92aa3e2bbd3718317010b49842c98954
|
3 |
+
size 523086
|
voice_examples/zh_man_sichuan.wav
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53892ece071342958403bc5643f84169a30b89cc0fc79eb69508bfa11dd85e68
|
3 |
+
size 618528
|