zachzzc committed on
Commit
4cbb9c7
·
1 Parent(s): 02be650

Add voice clone example; playground updates

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ voice_examples/wizard.wav filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -24,10 +24,6 @@ from higgs_audio.data_types import ChatMLSample, AudioContent, Message
24
  # Global engine instance
25
  engine = None
26
 
27
- # Set up default paths and resources
28
- EXAMPLES_DIR = os.path.join(os.path.dirname(__file__), "examples")
29
- os.makedirs(EXAMPLES_DIR, exist_ok=True)
30
-
31
  # Default model configuration
32
  DEFAULT_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-staging"
33
  DEFAULT_AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer-staging"
@@ -44,7 +40,16 @@ DEFAULT_STOP_STRINGS = ["<|end_of_text|>", "<|eot_id|>"]
44
 
45
  # Predefined examples for system and input messages
46
  PREDEFINED_EXAMPLES = {
47
- "None": {"system_prompt": "", "input_text": "", "description": "Default example"},
 
 
 
 
 
 
 
 
 
48
  "multispeaker-interleave": {
49
  "system_prompt": "Generate audio following instruction.\n\n"
50
  "<|scene_desc_start|>\n"
@@ -55,7 +60,7 @@ PREDEFINED_EXAMPLES = {
55
  "input_text": "<|generation_instruction_start|>\nGenerate interleaved transcript and audio that lasts for around 10 seconds.\n<|generation_instruction_end|>",
56
  "description": "Multispeaker interleave example",
57
  },
58
- "single-speaker": {
59
  "system_prompt": "Generate audio following instruction.\n\n"
60
  "<|scene_desc_start|>\n"
61
  "SPEAKER0: british accent\n"
@@ -316,9 +321,9 @@ def create_ui():
316
  with gr.Column(scale=2):
317
  # Template selection dropdown
318
  template_dropdown = gr.Dropdown(
319
- label="Message examples",
320
  choices=list(PREDEFINED_EXAMPLES.keys()),
321
- value="None",
322
  info="Select a predefined example for system and input messages. Voice preset will be set to EMPTY when a example is selected.",
323
  )
324
 
@@ -339,9 +344,13 @@ def create_ui():
339
  label="Voice Preset",
340
  choices=list(VOICE_PRESETS.keys()),
341
  value="EMPTY",
 
 
342
  )
343
 
344
- with gr.Accordion("Custom Reference (Optional)", open=False):
 
 
345
  reference_audio = gr.Audio(label="Reference Audio", type="filepath")
346
  reference_text = gr.TextArea(
347
  label="Reference Text (transcript of the reference audio)",
@@ -423,16 +432,26 @@ def create_ui():
423
  def apply_template(template_name):
424
  if template_name in PREDEFINED_EXAMPLES:
425
  template = PREDEFINED_EXAMPLES[template_name]
 
 
426
  return (
427
  template["system_prompt"], # system_prompt
428
  template["input_text"], # input_text
429
- "EMPTY", # voice_preset (always set to EMPTY for examples)
 
 
 
 
 
430
  )
431
  else:
432
  return (
433
  gr.update(),
434
  gr.update(),
435
  gr.update(),
 
 
 
436
  ) # No change if template not found
437
 
438
  # Set up event handlers
@@ -441,7 +460,14 @@ def create_ui():
441
  template_dropdown.change(
442
  fn=apply_template,
443
  inputs=[template_dropdown],
444
- outputs=[system_prompt, input_text, voice_preset],
 
 
 
 
 
 
 
445
  )
446
 
447
  # Connect submit button to the TTS function
 
24
  # Global engine instance
25
  engine = None
26
 
 
 
 
 
27
  # Default model configuration
28
  DEFAULT_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-staging"
29
  DEFAULT_AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer-staging"
 
40
 
41
  # Predefined examples for system and input messages
42
  PREDEFINED_EXAMPLES = {
43
+ "voice-clone": {
44
+ "system_prompt": "",
45
+ "input_text": "Hey there! I'm your friendly voice twin in the making. Pick a voice preset below or upload your own audio - let's clone some vocals and bring your voice to life! ",
46
+ "description": "Voice clone template",
47
+ },
48
+ "zero-shot": {
49
+ "system_prompt": DEFAULT_SYSTEM_PROMPT,
50
+ "input_text": "Hey hey! Welcome to Higgs Audio, your voice's new best friend. Drop your text below, and I'll turn it into something that sounds awesome! Let's make some audio magic!",
51
+ "description": "Zero-shot template",
52
+ },
53
  "multispeaker-interleave": {
54
  "system_prompt": "Generate audio following instruction.\n\n"
55
  "<|scene_desc_start|>\n"
 
60
  "input_text": "<|generation_instruction_start|>\nGenerate interleaved transcript and audio that lasts for around 10 seconds.\n<|generation_instruction_end|>",
61
  "description": "Multispeaker interleave example",
62
  },
63
+ "single-speaker-accent": {
64
  "system_prompt": "Generate audio following instruction.\n\n"
65
  "<|scene_desc_start|>\n"
66
  "SPEAKER0: british accent\n"
 
321
  with gr.Column(scale=2):
322
  # Template selection dropdown
323
  template_dropdown = gr.Dropdown(
324
+ label="TTS Template",
325
  choices=list(PREDEFINED_EXAMPLES.keys()),
326
+ value="zero-shot",
327
  info="Select a predefined example for system and input messages. Voice preset will be set to EMPTY when a example is selected.",
328
  )
329
 
 
344
  label="Voice Preset",
345
  choices=list(VOICE_PRESETS.keys()),
346
  value="EMPTY",
347
+ interactive=False, # Disabled by default since default template is not voice-clone
348
+ visible=False,
349
  )
350
 
351
+ with gr.Accordion(
352
+ "Custom Reference (Optional)", open=False, visible=False
353
+ ) as custom_reference_accordion:
354
  reference_audio = gr.Audio(label="Reference Audio", type="filepath")
355
  reference_text = gr.TextArea(
356
  label="Reference Text (transcript of the reference audio)",
 
432
  def apply_template(template_name):
433
  if template_name in PREDEFINED_EXAMPLES:
434
  template = PREDEFINED_EXAMPLES[template_name]
435
+ # Enable voice preset and custom reference only for voice-clone template
436
+ is_voice_clone = template_name == "voice-clone"
437
  return (
438
  template["system_prompt"], # system_prompt
439
  template["input_text"], # input_text
440
+ gr.update(
441
+ value="wizard", interactive=is_voice_clone, visible=is_voice_clone
442
+ ), # voice_preset (value and interactivity)
443
+ gr.update(visible=is_voice_clone), # custom reference accordion visibility
444
+ gr.update(visible=is_voice_clone), # voice samples table visibility
445
+ gr.update(visible=is_voice_clone), # sample audio visibility
446
  )
447
  else:
448
  return (
449
  gr.update(),
450
  gr.update(),
451
  gr.update(),
452
+ gr.update(),
453
+ gr.update(),
454
+ gr.update(),
455
  ) # No change if template not found
456
 
457
  # Set up event handlers
 
460
  template_dropdown.change(
461
  fn=apply_template,
462
  inputs=[template_dropdown],
463
+ outputs=[
464
+ system_prompt,
465
+ input_text,
466
+ voice_preset,
467
+ custom_reference_accordion,
468
+ voice_samples_table,
469
+ sample_audio,
470
+ ],
471
  )
472
 
473
  # Connect submit button to the TTS function
voice_examples/config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "wizard": {
3
+ "transcript": "I would imagine so. A wand with a dragon heartstring core is capable of dazzling magic.",
4
+ "audio_file": "wizard.wav"
5
+ }
6
+ }
voice_examples/wizard.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83bda9cd63be92366ef40dbe15c33e67b78766fb7069609f10dfc05cc626deba
3
+ size 1246508