Files changed (2)
  1. app.py +5 -73
  2. higgs_audio/serve/serve_engine.py +2 -2
app.py CHANGED
@@ -89,7 +89,7 @@ PREDEFINED_EXAMPLES = {
89
  },
90
  "single-speaker-bgm": {
91
  "system_prompt": DEFAULT_SYSTEM_PROMPT,
92
- "input_text": "[music start] I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. [music end]",
93
  "description": "Single speaker with BGM using music tag. This is an experimental feature and you may need to try multiple times to get the best result.",
94
  },
95
  }
@@ -184,22 +184,6 @@ def normalize_text(transcript: str):
184
  transcript = transcript.replace(")", " ")
185
  transcript = transcript.replace("°F", " degrees Fahrenheit")
186
  transcript = transcript.replace("°C", " degrees Celsius")
187
-
188
- for tag, replacement in [
189
- ("[laugh]", "<SE>[Laughter]</SE>"),
190
- ("[humming start]", "<SE>[Humming]</SE>"),
191
- ("[humming end]", "<SE_e>[Humming]</SE_e>"),
192
- ("[music start]", "<SE_s>[Music]</SE_s>"),
193
- ("[music end]", "<SE_e>[Music]</SE_e>"),
194
- ("[music]", "<SE>[Music]</SE>"),
195
- ("[sing start]", "<SE_s>[Singing]</SE_s>"),
196
- ("[sing end]", "<SE_e>[Singing]</SE_e>"),
197
- ("[applause]", "<SE>[Applause]</SE>"),
198
- ("[cheering]", "<SE>[Cheering]</SE>"),
199
- ("[cough]", "<SE>[Cough]</SE>"),
200
- ]:
201
- transcript = transcript.replace(tag, replacement)
202
-
203
  lines = transcript.split("\n")
204
  transcript = "\n".join([" ".join(line.split()) for line in lines if line.strip()])
205
  transcript = transcript.strip()
@@ -212,16 +196,7 @@ def normalize_text(transcript: str):
212
 
213
  @spaces.GPU
214
  def initialize_engine(model_path, audio_tokenizer_path) -> bool:
215
- """
216
- Initialize the HiggsAudioServeEngine with the specified model and tokenizer.
217
-
218
- Args:
219
- model_path: Path to the model to load
220
- audio_tokenizer_path: Path to the audio tokenizer to load
221
-
222
- Returns:
223
- True if initialization was successful, False otherwise
224
- """
225
  global engine
226
  try:
227
  logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
@@ -310,26 +285,7 @@ def text_to_speech(
310
  ras_win_len=7,
311
  ras_win_max_num_repeat=2,
312
  ):
313
- """
314
- Convert text to speech using HiggsAudioServeEngine.
315
-
316
- Args:
317
- text: The text to convert to speech
318
- voice_preset: The voice preset to use (or "EMPTY" for no preset)
319
- reference_audio: Optional path to reference audio file
320
- reference_text: Optional transcript of the reference audio
321
- max_completion_tokens: Maximum number of tokens to generate
322
- temperature: Sampling temperature for generation
323
- top_p: Top-p sampling parameter
324
- top_k: Top-k sampling parameter
325
- system_prompt: System prompt to guide the model
326
- stop_strings: Dataframe containing stop strings
327
- ras_win_len: Window length for repetition avoidance sampling
328
- ras_win_max_num_repeat: Maximum number of repetitions allowed in the window
329
-
330
- Returns:
331
- Tuple of (generated_text, (sample_rate, audio_data)) where audio_data is int16 numpy array
332
- """
333
  global engine
334
 
335
  if engine is None:
@@ -546,15 +502,6 @@ def create_ui():
546
 
547
  # Function to play voice sample when clicking on a row
548
  def play_voice_sample(evt: gr.SelectData):
549
- """
550
- Play a voice sample when a row is clicked in the voice samples table.
551
-
552
- Args:
553
- evt: The select event containing the clicked row index
554
-
555
- Returns:
556
- Path to the voice sample audio file, or None if not found
557
- """
558
  try:
559
  # Get the preset name from the clicked row
560
  preset_names = [preset for preset in VOICE_PRESETS.keys() if preset != "EMPTY"]
@@ -578,23 +525,11 @@ def create_ui():
578
 
579
  # Function to handle template selection
580
  def apply_template(template_name):
581
- """
582
- Apply a predefined template to the UI components.
583
-
584
- Args:
585
- template_name: Name of the template to apply
586
-
587
- Returns:
588
- Tuple of updated values for system_prompt, input_text, template_description,
589
- voice_preset, custom_reference_accordion, voice_samples_section, and ras_win_len
590
- """
591
  if template_name in PREDEFINED_EXAMPLES:
592
  template = PREDEFINED_EXAMPLES[template_name]
593
  # Enable voice preset and custom reference only for voice-clone template
594
  is_voice_clone = template_name == "voice-clone"
595
  voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
596
- # Set ras_win_len to 0 for single-speaker-bgm, 7 for others
597
- ras_win_len_value = 0 if template_name == "single-speaker-bgm" else 7
598
  description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
599
  return (
600
  template["system_prompt"], # system_prompt
@@ -605,7 +540,6 @@ def create_ui():
605
  ), # voice_preset (value and interactivity)
606
  gr.update(visible=is_voice_clone), # custom reference accordion visibility
607
  gr.update(visible=is_voice_clone), # voice samples section visibility
608
- ras_win_len_value, # ras_win_len
609
  )
610
  else:
611
  return (
@@ -615,7 +549,6 @@ def create_ui():
615
  gr.update(),
616
  gr.update(),
617
  gr.update(),
618
- gr.update(),
619
  ) # No change if template not found
620
 
621
  # Set up event handlers
@@ -631,7 +564,6 @@ def create_ui():
631
  voice_preset,
632
  custom_reference_accordion,
633
  voice_samples_section,
634
- ras_win_len,
635
  ],
636
  )
637
 
@@ -689,8 +621,8 @@ def main():
689
 
690
  # Create and launch the UI
691
  demo = create_ui()
692
- demo.launch(server_name=args.host, server_port=args.port, mcp_server=True)
693
 
694
 
695
  if __name__ == "__main__":
696
- main()
 
89
  },
90
  "single-speaker-bgm": {
91
  "system_prompt": DEFAULT_SYSTEM_PROMPT,
92
+ "input_text": "<SE_s>[Music]</SE_s> I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. <SE_e>[Music]</SE_e>",
93
  "description": "Single speaker with BGM using music tag. This is an experimental feature and you may need to try multiple times to get the best result.",
94
  },
95
  }
 
184
  transcript = transcript.replace(")", " ")
185
  transcript = transcript.replace("°F", " degrees Fahrenheit")
186
  transcript = transcript.replace("°C", " degrees Celsius")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  lines = transcript.split("\n")
188
  transcript = "\n".join([" ".join(line.split()) for line in lines if line.strip()])
189
  transcript = transcript.strip()
 
196
 
197
  @spaces.GPU
198
  def initialize_engine(model_path, audio_tokenizer_path) -> bool:
199
+ """Initialize the HiggsAudioServeEngine."""
 
 
 
 
 
 
 
 
 
200
  global engine
201
  try:
202
  logger.info(f"Initializing engine with model: {model_path} and audio tokenizer: {audio_tokenizer_path}")
 
285
  ras_win_len=7,
286
  ras_win_max_num_repeat=2,
287
  ):
288
+ """Convert text to speech using HiggsAudioServeEngine."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  global engine
290
 
291
  if engine is None:
 
502
 
503
  # Function to play voice sample when clicking on a row
504
  def play_voice_sample(evt: gr.SelectData):
 
 
 
 
 
 
 
 
 
505
  try:
506
  # Get the preset name from the clicked row
507
  preset_names = [preset for preset in VOICE_PRESETS.keys() if preset != "EMPTY"]
 
525
 
526
  # Function to handle template selection
527
  def apply_template(template_name):
 
 
 
 
 
 
 
 
 
 
528
  if template_name in PREDEFINED_EXAMPLES:
529
  template = PREDEFINED_EXAMPLES[template_name]
530
  # Enable voice preset and custom reference only for voice-clone template
531
  is_voice_clone = template_name == "voice-clone"
532
  voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
 
 
533
  description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
534
  return (
535
  template["system_prompt"], # system_prompt
 
540
  ), # voice_preset (value and interactivity)
541
  gr.update(visible=is_voice_clone), # custom reference accordion visibility
542
  gr.update(visible=is_voice_clone), # voice samples section visibility
 
543
  )
544
  else:
545
  return (
 
549
  gr.update(),
550
  gr.update(),
551
  gr.update(),
 
552
  ) # No change if template not found
553
 
554
  # Set up event handlers
 
564
  voice_preset,
565
  custom_reference_accordion,
566
  voice_samples_section,
 
567
  ],
568
  )
569
 
 
621
 
622
  # Create and launch the UI
623
  demo = create_ui()
624
+ demo.launch(server_name=args.host, server_port=args.port)
625
 
626
 
627
  if __name__ == "__main__":
628
+ main()
higgs_audio/serve/serve_engine.py CHANGED
@@ -3,7 +3,7 @@ import base64
3
  import torch
4
  import numpy as np
5
  from io import BytesIO
6
- from dataclasses import dataclass, field
7
  from typing import List, Optional, Union
8
  from copy import deepcopy
9
  from transformers import AutoTokenizer, AutoProcessor
@@ -215,7 +215,7 @@ class HiggsAudioResponse:
215
  generated_audio_tokens: Optional[np.ndarray] = None
216
  sampling_rate: Optional[int] = None
217
  generated_text: str = ""
218
- generated_text_tokens: np.ndarray = field(default_factory=np.ndarray)
219
  usage: Optional[dict] = None
220
 
221
 
 
3
  import torch
4
  import numpy as np
5
  from io import BytesIO
6
+ from dataclasses import dataclass
7
  from typing import List, Optional, Union
8
  from copy import deepcopy
9
  from transformers import AutoTokenizer, AutoProcessor
 
215
  generated_audio_tokens: Optional[np.ndarray] = None
216
  sampling_rate: Optional[int] = None
217
  generated_text: str = ""
218
+ generated_text_tokens: np.ndarray = np.array([])
219
  usage: Optional[dict] = None
220
 
221