higgs_audio_v2

Running on Zero

App Files Community

Fix example

by zachzzc - opened 8 days ago

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

-75

Files changed (2) hide show

app.py +5 -73
higgs_audio/serve/serve_engine.py +2 -2

app.py CHANGED Viewed

@@ -89,7 +89,7 @@ PREDEFINED_EXAMPLES = {
     },
     "single-speaker-bgm": {
         "system_prompt": DEFAULT_SYSTEM_PROMPT,
-        "input_text": "[music start] I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. [music end]",
         "description": "Single speaker with BGM using music tag. This is an experimental feature and you may need to try multiple times to get the best result.",
     },
 }
@@ -184,22 +184,6 @@ def normalize_text(transcript: str):
     transcript = transcript.replace(")", " ")
     transcript = transcript.replace("°F", " degrees Fahrenheit")
     transcript = transcript.replace("°C", " degrees Celsius")
-    for tag, replacement in [
-        ("[laugh]", "<SE>[Laughter]</SE>"),
-        ("[humming start]", "<SE>[Humming]</SE>"),
-        ("[humming end]", "<SE_e>[Humming]</SE_e>"),
-        ("[music start]", "<SE_s>[Music]</SE_s>"),
-        ("[music end]", "<SE_e>[Music]</SE_e>"),
-        ("[music]", "<SE>[Music]</SE>"),
-        ("[sing start]", "<SE_s>[Singing]</SE_s>"),
-        ("[sing end]", "<SE_e>[Singing]</SE_e>"),
-        ("[applause]", "<SE>[Applause]</SE>"),
-        ("[cheering]", "<SE>[Cheering]</SE>"),
-        ("[cough]", "<SE>[Cough]</SE>"),
-    ]:
-        transcript = transcript.replace(tag, replacement)
     lines = transcript.split("\n")
     transcript = "\n".join([" ".join(line.split()) for line in lines if line.strip()])
     transcript = transcript.strip()
@@ -212,16 +196,7 @@ def normalize_text(transcript: str):
 @spaces.GPU
 def initialize_engine(model_path, audio_tokenizer_path) -> bool:
-    """
-    Initialize the HiggsAudioServeEngine with the specified model and tokenizer.
-    Args:
-        model_path: Path to the model to load
-        audio_tokenizer_path: Path to the audio tokenizer to load
-    Returns:
-        True if initialization was successful, False otherwise
-    """
     global engine
     try:
         logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
@@ -310,26 +285,7 @@ def text_to_speech(
     ras_win_len=7,
     ras_win_max_num_repeat=2,
 ):
-    """
-    Convert text to speech using HiggsAudioServeEngine.
-    Args:
-        text: The text to convert to speech
-        voice_preset: The voice preset to use (or "EMPTY" for no preset)
-        reference_audio: Optional path to reference audio file
-        reference_text: Optional transcript of the reference audio
-        max_completion_tokens: Maximum number of tokens to generate
-        temperature: Sampling temperature for generation
-        top_p: Top-p sampling parameter
-        top_k: Top-k sampling parameter
-        system_prompt: System prompt to guide the model
-        stop_strings: Dataframe containing stop strings
-        ras_win_len: Window length for repetition avoidance sampling
-        ras_win_max_num_repeat: Maximum number of repetitions allowed in the window
-    Returns:
-        Tuple of (generated_text, (sample_rate, audio_data)) where audio_data is int16 numpy array
-    """
     global engine
     if engine is None:
@@ -546,15 +502,6 @@ def create_ui():
         # Function to play voice sample when clicking on a row
         def play_voice_sample(evt: gr.SelectData):
-            """
-            Play a voice sample when a row is clicked in the voice samples table.
-            Args:
-                evt: The select event containing the clicked row index
-            Returns:
-                Path to the voice sample audio file, or None if not found
-            """
             try:
                 # Get the preset name from the clicked row
                 preset_names = [preset for preset in VOICE_PRESETS.keys() if preset != "EMPTY"]
@@ -578,23 +525,11 @@ def create_ui():
         # Function to handle template selection
         def apply_template(template_name):
-            """
-            Apply a predefined template to the UI components.
-            Args:
-                template_name: Name of the template to apply
-            Returns:
-                Tuple of updated values for system_prompt, input_text, template_description,
-                voice_preset, custom_reference_accordion, voice_samples_section, and ras_win_len
-            """
             if template_name in PREDEFINED_EXAMPLES:
                 template = PREDEFINED_EXAMPLES[template_name]
                 # Enable voice preset and custom reference only for voice-clone template
                 is_voice_clone = template_name == "voice-clone"
                 voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
-                # Set ras_win_len to 0 for single-speaker-bgm, 7 for others
-                ras_win_len_value = 0 if template_name == "single-speaker-bgm" else 7
                 description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
                 return (
                     template["system_prompt"],  # system_prompt
@@ -605,7 +540,6 @@ def create_ui():
                     ),  # voice_preset (value and interactivity)
                     gr.update(visible=is_voice_clone),  # custom reference accordion visibility
                     gr.update(visible=is_voice_clone),  # voice samples section visibility
-                    ras_win_len_value,  # ras_win_len
                 )
             else:
                 return (
@@ -615,7 +549,6 @@ def create_ui():
                     gr.update(),
                     gr.update(),
                     gr.update(),
-                    gr.update(),
                 )  # No change if template not found
         # Set up event handlers
@@ -631,7 +564,6 @@ def create_ui():
                 voice_preset,
                 custom_reference_accordion,
                 voice_samples_section,
-                ras_win_len,
             ],
         )
@@ -689,8 +621,8 @@ def main():
     # Create and launch the UI
     demo = create_ui()
-    demo.launch(server_name=args.host, server_port=args.port, mcp_server=True)
 if __name__ == "__main__":
-    main()

     },
     "single-speaker-bgm": {
         "system_prompt": DEFAULT_SYSTEM_PROMPT,
+        "input_text": "<SE_s>[Music]</SE_s> I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. <SE_e>[Music]</SE_e>",
         "description": "Single speaker with BGM using music tag. This is an experimental feature and you may need to try multiple times to get the best result.",
     },
 }
     transcript = transcript.replace(")", " ")
     transcript = transcript.replace("°F", " degrees Fahrenheit")
     transcript = transcript.replace("°C", " degrees Celsius")
     lines = transcript.split("\n")
     transcript = "\n".join([" ".join(line.split()) for line in lines if line.strip()])
     transcript = transcript.strip()
 @spaces.GPU
 def initialize_engine(model_path, audio_tokenizer_path) -> bool:
+    """Initialize the HiggsAudioServeEngine."""
     global engine
     try:
         logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
     ras_win_len=7,
     ras_win_max_num_repeat=2,
 ):
+    """Convert text to speech using HiggsAudioServeEngine."""
     global engine
     if engine is None:
         # Function to play voice sample when clicking on a row
         def play_voice_sample(evt: gr.SelectData):
             try:
                 # Get the preset name from the clicked row
                 preset_names = [preset for preset in VOICE_PRESETS.keys() if preset != "EMPTY"]
         # Function to handle template selection
         def apply_template(template_name):
             if template_name in PREDEFINED_EXAMPLES:
                 template = PREDEFINED_EXAMPLES[template_name]
                 # Enable voice preset and custom reference only for voice-clone template
                 is_voice_clone = template_name == "voice-clone"
                 voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
                 description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
                 return (
                     template["system_prompt"],  # system_prompt
                     ),  # voice_preset (value and interactivity)
                     gr.update(visible=is_voice_clone),  # custom reference accordion visibility
                     gr.update(visible=is_voice_clone),  # voice samples section visibility
                 )
             else:
                 return (
                     gr.update(),
                     gr.update(),
                     gr.update(),
                 )  # No change if template not found
         # Set up event handlers
                 voice_preset,
                 custom_reference_accordion,
                 voice_samples_section,
             ],
         )
     # Create and launch the UI
     demo = create_ui()
+    demo.launch(server_name=args.host, server_port=args.port)
 if __name__ == "__main__":
+    main()

higgs_audio/serve/serve_engine.py CHANGED Viewed

@@ -3,7 +3,7 @@ import base64
 import torch
 import numpy as np
 from io import BytesIO
-from dataclasses import dataclass, field
 from typing import List, Optional, Union
 from copy import deepcopy
 from transformers import AutoTokenizer, AutoProcessor
@@ -215,7 +215,7 @@ class HiggsAudioResponse:
     generated_audio_tokens: Optional[np.ndarray] = None
     sampling_rate: Optional[int] = None
     generated_text: str = ""
-    generated_text_tokens: np.ndarray = field(default_factory=np.ndarray)
     usage: Optional[dict] = None

 import torch
 import numpy as np
 from io import BytesIO
+from dataclasses import dataclass
 from typing import List, Optional, Union
 from copy import deepcopy
 from transformers import AutoTokenizer, AutoProcessor
     generated_audio_tokens: Optional[np.ndarray] = None
     sampling_rate: Optional[int] = None
     generated_text: str = ""
+    generated_text_tokens: np.ndarray = np.array([])
     usage: Optional[dict] = None