Spaces: Running on Zero
Fix example (#1) by zachzzc
- opened
- app.py +5 -73
- higgs_audio/serve/serve_engine.py +2 -2
app.py
CHANGED
@@ -89,7 +89,7 @@ PREDEFINED_EXAMPLES = {
|
|
89 |
},
|
90 |
"single-speaker-bgm": {
|
91 |
"system_prompt": DEFAULT_SYSTEM_PROMPT,
|
92 |
-
"input_text": "[
|
93 |
"description": "Single speaker with BGM using music tag. This is an experimental feature and you may need to try multiple times to get the best result.",
|
94 |
},
|
95 |
}
|
@@ -184,22 +184,6 @@ def normalize_text(transcript: str):
|
|
184 |
transcript = transcript.replace(")", " ")
|
185 |
transcript = transcript.replace("°F", " degrees Fahrenheit")
|
186 |
transcript = transcript.replace("°C", " degrees Celsius")
|
187 |
-
|
188 |
-
for tag, replacement in [
|
189 |
-
("[laugh]", "<SE>[Laughter]</SE>"),
|
190 |
-
("[humming start]", "<SE>[Humming]</SE>"),
|
191 |
-
("[humming end]", "<SE_e>[Humming]</SE_e>"),
|
192 |
-
("[music start]", "<SE_s>[Music]</SE_s>"),
|
193 |
-
("[music end]", "<SE_e>[Music]</SE_e>"),
|
194 |
-
("[music]", "<SE>[Music]</SE>"),
|
195 |
-
("[sing start]", "<SE_s>[Singing]</SE_s>"),
|
196 |
-
("[sing end]", "<SE_e>[Singing]</SE_e>"),
|
197 |
-
("[applause]", "<SE>[Applause]</SE>"),
|
198 |
-
("[cheering]", "<SE>[Cheering]</SE>"),
|
199 |
-
("[cough]", "<SE>[Cough]</SE>"),
|
200 |
-
]:
|
201 |
-
transcript = transcript.replace(tag, replacement)
|
202 |
-
|
203 |
lines = transcript.split("\n")
|
204 |
transcript = "\n".join([" ".join(line.split()) for line in lines if line.strip()])
|
205 |
transcript = transcript.strip()
|
@@ -212,16 +196,7 @@ def normalize_text(transcript: str):
|
|
212 |
|
213 |
@spaces.GPU
|
214 |
def initialize_engine(model_path, audio_tokenizer_path) -> bool:
|
215 |
-
"""
|
216 |
-
Initialize the HiggsAudioServeEngine with the specified model and tokenizer.
|
217 |
-
|
218 |
-
Args:
|
219 |
-
model_path: Path to the model to load
|
220 |
-
audio_tokenizer_path: Path to the audio tokenizer to load
|
221 |
-
|
222 |
-
Returns:
|
223 |
-
True if initialization was successful, False otherwise
|
224 |
-
"""
|
225 |
global engine
|
226 |
try:
|
227 |
logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
|
@@ -310,26 +285,7 @@ def text_to_speech(
|
|
310 |
ras_win_len=7,
|
311 |
ras_win_max_num_repeat=2,
|
312 |
):
|
313 |
-
"""
|
314 |
-
Convert text to speech using HiggsAudioServeEngine.
|
315 |
-
|
316 |
-
Args:
|
317 |
-
text: The text to convert to speech
|
318 |
-
voice_preset: The voice preset to use (or "EMPTY" for no preset)
|
319 |
-
reference_audio: Optional path to reference audio file
|
320 |
-
reference_text: Optional transcript of the reference audio
|
321 |
-
max_completion_tokens: Maximum number of tokens to generate
|
322 |
-
temperature: Sampling temperature for generation
|
323 |
-
top_p: Top-p sampling parameter
|
324 |
-
top_k: Top-k sampling parameter
|
325 |
-
system_prompt: System prompt to guide the model
|
326 |
-
stop_strings: Dataframe containing stop strings
|
327 |
-
ras_win_len: Window length for repetition avoidance sampling
|
328 |
-
ras_win_max_num_repeat: Maximum number of repetitions allowed in the window
|
329 |
-
|
330 |
-
Returns:
|
331 |
-
Tuple of (generated_text, (sample_rate, audio_data)) where audio_data is int16 numpy array
|
332 |
-
"""
|
333 |
global engine
|
334 |
|
335 |
if engine is None:
|
@@ -546,15 +502,6 @@ def create_ui():
|
|
546 |
|
547 |
# Function to play voice sample when clicking on a row
|
548 |
def play_voice_sample(evt: gr.SelectData):
|
549 |
-
"""
|
550 |
-
Play a voice sample when a row is clicked in the voice samples table.
|
551 |
-
|
552 |
-
Args:
|
553 |
-
evt: The select event containing the clicked row index
|
554 |
-
|
555 |
-
Returns:
|
556 |
-
Path to the voice sample audio file, or None if not found
|
557 |
-
"""
|
558 |
try:
|
559 |
# Get the preset name from the clicked row
|
560 |
preset_names = [preset for preset in VOICE_PRESETS.keys() if preset != "EMPTY"]
|
@@ -578,23 +525,11 @@ def create_ui():
|
|
578 |
|
579 |
# Function to handle template selection
|
580 |
def apply_template(template_name):
|
581 |
-
"""
|
582 |
-
Apply a predefined template to the UI components.
|
583 |
-
|
584 |
-
Args:
|
585 |
-
template_name: Name of the template to apply
|
586 |
-
|
587 |
-
Returns:
|
588 |
-
Tuple of updated values for system_prompt, input_text, template_description,
|
589 |
-
voice_preset, custom_reference_accordion, voice_samples_section, and ras_win_len
|
590 |
-
"""
|
591 |
if template_name in PREDEFINED_EXAMPLES:
|
592 |
template = PREDEFINED_EXAMPLES[template_name]
|
593 |
# Enable voice preset and custom reference only for voice-clone template
|
594 |
is_voice_clone = template_name == "voice-clone"
|
595 |
voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
|
596 |
-
# Set ras_win_len to 0 for single-speaker-bgm, 7 for others
|
597 |
-
ras_win_len_value = 0 if template_name == "single-speaker-bgm" else 7
|
598 |
description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
|
599 |
return (
|
600 |
template["system_prompt"], # system_prompt
|
@@ -605,7 +540,6 @@ def create_ui():
|
|
605 |
), # voice_preset (value and interactivity)
|
606 |
gr.update(visible=is_voice_clone), # custom reference accordion visibility
|
607 |
gr.update(visible=is_voice_clone), # voice samples section visibility
|
608 |
-
ras_win_len_value, # ras_win_len
|
609 |
)
|
610 |
else:
|
611 |
return (
|
@@ -615,7 +549,6 @@ def create_ui():
|
|
615 |
gr.update(),
|
616 |
gr.update(),
|
617 |
gr.update(),
|
618 |
-
gr.update(),
|
619 |
) # No change if template not found
|
620 |
|
621 |
# Set up event handlers
|
@@ -631,7 +564,6 @@ def create_ui():
|
|
631 |
voice_preset,
|
632 |
custom_reference_accordion,
|
633 |
voice_samples_section,
|
634 |
-
ras_win_len,
|
635 |
],
|
636 |
)
|
637 |
|
@@ -689,8 +621,8 @@ def main():
|
|
689 |
|
690 |
# Create and launch the UI
|
691 |
demo = create_ui()
|
692 |
-
demo.launch(server_name=args.host, server_port=args.port
|
693 |
|
694 |
|
695 |
if __name__ == "__main__":
|
696 |
-
main()
|
|
|
89 |
},
|
90 |
"single-speaker-bgm": {
|
91 |
"system_prompt": DEFAULT_SYSTEM_PROMPT,
|
92 |
+
"input_text": "<SE_s>[Music]</SE_s> I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. <SE_e>[Music]</SE_e>",
|
93 |
"description": "Single speaker with BGM using music tag. This is an experimental feature and you may need to try multiple times to get the best result.",
|
94 |
},
|
95 |
}
|
|
|
184 |
transcript = transcript.replace(")", " ")
|
185 |
transcript = transcript.replace("°F", " degrees Fahrenheit")
|
186 |
transcript = transcript.replace("°C", " degrees Celsius")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
187 |
lines = transcript.split("\n")
|
188 |
transcript = "\n".join([" ".join(line.split()) for line in lines if line.strip()])
|
189 |
transcript = transcript.strip()
|
|
|
196 |
|
197 |
@spaces.GPU
|
198 |
def initialize_engine(model_path, audio_tokenizer_path) -> bool:
|
199 |
+
"""Initialize the HiggsAudioServeEngine."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
global engine
|
201 |
try:
|
202 |
logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
|
|
|
285 |
ras_win_len=7,
|
286 |
ras_win_max_num_repeat=2,
|
287 |
):
|
288 |
+
"""Convert text to speech using HiggsAudioServeEngine."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
289 |
global engine
|
290 |
|
291 |
if engine is None:
|
|
|
502 |
|
503 |
# Function to play voice sample when clicking on a row
|
504 |
def play_voice_sample(evt: gr.SelectData):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
505 |
try:
|
506 |
# Get the preset name from the clicked row
|
507 |
preset_names = [preset for preset in VOICE_PRESETS.keys() if preset != "EMPTY"]
|
|
|
525 |
|
526 |
# Function to handle template selection
|
527 |
def apply_template(template_name):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
528 |
if template_name in PREDEFINED_EXAMPLES:
|
529 |
template = PREDEFINED_EXAMPLES[template_name]
|
530 |
# Enable voice preset and custom reference only for voice-clone template
|
531 |
is_voice_clone = template_name == "voice-clone"
|
532 |
voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
|
|
|
|
|
533 |
description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
|
534 |
return (
|
535 |
template["system_prompt"], # system_prompt
|
|
|
540 |
), # voice_preset (value and interactivity)
|
541 |
gr.update(visible=is_voice_clone), # custom reference accordion visibility
|
542 |
gr.update(visible=is_voice_clone), # voice samples section visibility
|
|
|
543 |
)
|
544 |
else:
|
545 |
return (
|
|
|
549 |
gr.update(),
|
550 |
gr.update(),
|
551 |
gr.update(),
|
|
|
552 |
) # No change if template not found
|
553 |
|
554 |
# Set up event handlers
|
|
|
564 |
voice_preset,
|
565 |
custom_reference_accordion,
|
566 |
voice_samples_section,
|
|
|
567 |
],
|
568 |
)
|
569 |
|
|
|
621 |
|
622 |
# Create and launch the UI
|
623 |
demo = create_ui()
|
624 |
+
demo.launch(server_name=args.host, server_port=args.port)
|
625 |
|
626 |
|
627 |
if __name__ == "__main__":
|
628 |
+
main()
|
higgs_audio/serve/serve_engine.py
CHANGED
@@ -3,7 +3,7 @@ import base64
|
|
3 |
import torch
|
4 |
import numpy as np
|
5 |
from io import BytesIO
|
6 |
-
from dataclasses import dataclass
|
7 |
from typing import List, Optional, Union
|
8 |
from copy import deepcopy
|
9 |
from transformers import AutoTokenizer, AutoProcessor
|
@@ -215,7 +215,7 @@ class HiggsAudioResponse:
|
|
215 |
generated_audio_tokens: Optional[np.ndarray] = None
|
216 |
sampling_rate: Optional[int] = None
|
217 |
generated_text: str = ""
|
218 |
-
generated_text_tokens: np.ndarray =
|
219 |
usage: Optional[dict] = None
|
220 |
|
221 |
|
|
|
3 |
import torch
|
4 |
import numpy as np
|
5 |
from io import BytesIO
|
6 |
+
from dataclasses import dataclass
|
7 |
from typing import List, Optional, Union
|
8 |
from copy import deepcopy
|
9 |
from transformers import AutoTokenizer, AutoProcessor
|
|
|
215 |
generated_audio_tokens: Optional[np.ndarray] = None
|
216 |
sampling_rate: Optional[int] = None
|
217 |
generated_text: str = ""
|
218 |
+
generated_text_tokens: np.ndarray = np.array([])
|
219 |
usage: Optional[dict] = None
|
220 |
|
221 |
|