zachzzc committed on
Commit
b4da283
·
1 Parent(s): 493db6d

Add more voice clone voices; Update model names; Update playground

Browse files
.gitattributes CHANGED
@@ -33,4 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- voice_examples/wizard.wav filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ voice_examples/en_woman.wav filter=lfs diff=lfs merge=lfs -text
37
+ voice_examples/mabel.wav filter=lfs diff=lfs merge=lfs -text
38
+ voice_examples/vex.wav filter=lfs diff=lfs merge=lfs -text
39
+ voice_examples/zh_man_sichuan.wav filter=lfs diff=lfs merge=lfs -text
40
+ voice_examples/belinda.wav filter=lfs diff=lfs merge=lfs -text
41
+ voice_examples/broom_salesman.wav filter=lfs diff=lfs merge=lfs -text
42
+ voice_examples/chadwick.wav filter=lfs diff=lfs merge=lfs -text
43
+ voice_examples/en_man.wav filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -25,8 +25,8 @@ from higgs_audio.data_types import ChatMLSample, AudioContent, Message
25
  engine = None
26
 
27
  # Default model configuration
28
- DEFAULT_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-staging"
29
- DEFAULT_AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer-staging"
30
  SAMPLE_RATE = 24000
31
 
32
  DEFAULT_SYSTEM_PROMPT = (
@@ -43,46 +43,54 @@ PREDEFINED_EXAMPLES = {
43
  "voice-clone": {
44
  "system_prompt": "",
45
  "input_text": "Hey there! I'm your friendly voice twin in the making. Pick a voice preset below or upload your own audio - let's clone some vocals and bring your voice to life! ",
46
- "description": "Voice clone template",
47
  },
48
- "zero-shot": {
49
  "system_prompt": DEFAULT_SYSTEM_PROMPT,
50
- "input_text": "Hey hey! Welcome to Higgs Audio, your voice's new best friend. Drop your text below, and I'll turn it into something that sounds awesome! Let's make some audio magic!",
51
- "description": "Zero-shot template",
52
  },
53
- "multispeaker-interleave": {
54
- "system_prompt": "Generate audio following instruction.\n\n"
 
 
55
  "<|scene_desc_start|>\n"
56
- "SPEAKER0: vocal fry;feminism;slightly fast\n"
57
- "SPEAKER1: masculine;moderate;moderate pitch;monotone;mature\n"
58
- "In this scene, a group of adventurers is debating whether to investigate a potentially dangerous situation.\n"
59
  "<|scene_desc_end|>",
60
- "input_text": "<|generation_instruction_start|>\nGenerate interleaved transcript and audio that lasts for around 10 seconds.\n<|generation_instruction_end|>",
61
- "description": "Multispeaker interleave example",
 
 
 
62
  },
63
- "single-speaker-accent": {
64
  "system_prompt": "Generate audio following instruction.\n\n"
65
  "<|scene_desc_start|>\n"
66
- "SPEAKER0: British accent;\n"
67
  "<|scene_desc_end|>",
68
  "input_text": "Hey, everyone! Welcome back to Tech Talk Tuesdays.\n"
69
  "It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n"
70
  "And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
71
  "\n"
72
  "So here's the big question: Do you want to understand how deep learning works?\n",
73
- "description": "Single speaker example",
74
  },
75
  "single-speaker-zh": {
76
  "system_prompt": "Generate audio following instruction.\n\n"
77
  "<|scene_desc_start|>\n"
78
- "\nAudio is recorded from a quiet room.\n"
79
- "\nSPEAKER0: feminine\n"
80
  "<|scene_desc_end|>",
81
  "input_text": "大家好, 欢迎收听本期的跟李沐学AI. 今天沐哥在忙着洗数据, 所以由我, 希格斯主播代替他讲这期视频.\n"
82
  "今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
83
  "那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
84
  "或者说, 你能察觉到我其实是个机器人吗?",
85
- "description": "Single speaker with Chinese text",
 
 
 
 
 
86
  },
87
  }
88
 
@@ -130,6 +138,62 @@ def get_voice_present(voice_preset):
130
  return voice_path, text
131
 
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  @spaces.GPU
134
  def initialize_engine(model_path, audio_tokenizer_path) -> bool:
135
  """Initialize the HiggsAudioServeEngine."""
@@ -200,6 +264,7 @@ def prepare_chatml_sample(
200
  messages.append(Message(role="assistant", content=[audio_content]))
201
 
202
  # Add the main user message
 
203
  messages.append(Message(role="user", content=text))
204
 
205
  return ChatMLSample(messages=messages)
@@ -217,6 +282,8 @@ def text_to_speech(
217
  top_k=50,
218
  system_prompt=DEFAULT_SYSTEM_PROMPT,
219
  stop_strings=None,
 
 
220
  ):
221
  """Convert text to speech using HiggsAudioServeEngine."""
222
  global engine
@@ -237,7 +304,8 @@ def text_to_speech(
237
  request_id = f"tts-playground-{str(uuid.uuid4())}"
238
  logger.info(
239
  f"{request_id}: Generating speech for text: {text[:100]}..., \n"
240
- f"with parameters: temperature={temperature}, top_p={top_p}, top_k={top_k}, stop_list={stop_list}"
 
241
  )
242
  start_time = time.time()
243
 
@@ -249,6 +317,8 @@ def text_to_speech(
249
  top_k=top_k if top_k > 0 else None,
250
  top_p=top_p,
251
  stop_strings=stop_list,
 
 
252
  )
253
 
254
  generation_time = time.time() - start_time
@@ -312,7 +382,7 @@ def create_ui():
312
  }
313
  """
314
 
315
- default_template = "zero-shot"
316
 
317
  """Create the Gradio UI."""
318
  with gr.Blocks(theme=my_theme, css=custom_css) as demo:
@@ -329,6 +399,12 @@ def create_ui():
329
  info="Select a predefined example for system and input messages. Voice preset will be set to EMPTY when a example is selected.",
330
  )
331
 
 
 
 
 
 
 
332
  system_prompt = gr.TextArea(
333
  label="System Prompt",
334
  placeholder="Enter system prompt to guide the model...",
@@ -378,6 +454,22 @@ def create_ui():
378
  )
379
  top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top P")
380
  top_k = gr.Slider(minimum=-1, maximum=100, value=50, step=1, label="Top K")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
381
  # Add stop strings component
382
  stop_strings = gr.Dataframe(
383
  label="Stop Strings",
@@ -437,10 +529,12 @@ def create_ui():
437
  template = PREDEFINED_EXAMPLES[template_name]
438
  # Enable voice preset and custom reference only for voice-clone template
439
  is_voice_clone = template_name == "voice-clone"
440
- voice_preset_value = "wizard" if is_voice_clone else "EMPTY"
 
441
  return (
442
  template["system_prompt"], # system_prompt
443
  template["input_text"], # input_text
 
444
  gr.update(
445
  value=voice_preset_value, interactive=is_voice_clone, visible=is_voice_clone
446
  ), # voice_preset (value and interactivity)
@@ -454,6 +548,7 @@ def create_ui():
454
  gr.update(),
455
  gr.update(),
456
  gr.update(),
 
457
  ) # No change if template not found
458
 
459
  # Set up event handlers
@@ -465,6 +560,7 @@ def create_ui():
465
  outputs=[
466
  system_prompt,
467
  input_text,
 
468
  voice_preset,
469
  custom_reference_accordion,
470
  voice_samples_section,
@@ -485,6 +581,8 @@ def create_ui():
485
  top_k,
486
  system_prompt,
487
  stop_strings,
 
 
488
  ],
489
  outputs=[output_text, output_audio],
490
  api_name="generate_speech",
 
25
  engine = None
26
 
27
  # Default model configuration
28
+ DEFAULT_MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
29
+ DEFAULT_AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer"
30
  SAMPLE_RATE = 24000
31
 
32
  DEFAULT_SYSTEM_PROMPT = (
 
43
  "voice-clone": {
44
  "system_prompt": "",
45
  "input_text": "Hey there! I'm your friendly voice twin in the making. Pick a voice preset below or upload your own audio - let's clone some vocals and bring your voice to life! ",
46
+ "description": "Voice clone to clone the reference audio. Leave the system prompt empty.",
47
  },
48
+ "smart-voice": {
49
  "system_prompt": DEFAULT_SYSTEM_PROMPT,
50
+ "input_text": "The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
51
+ "description": "Smart voice to generate speech based on the context",
52
  },
53
+ "multispeaker-voice-description": {
54
+ "system_prompt": "You are an AI assistant designed to convert text into speech.\n"
55
+ "If the user's message includes a [SPEAKER*] tag, do not read out the tag and generate speech for the following text, using the specified voice.\n"
56
+ "If no speaker tag is present, select a suitable voice on your own.\n\n"
57
  "<|scene_desc_start|>\n"
58
+ "SPEAKER0: feminine\n"
59
+ "SPEAKER1: masculine\n"
 
60
  "<|scene_desc_end|>",
61
+ "input_text": "[SPEAKER0] I can't believe you did that without even asking me first!\n"
62
+ "[SPEAKER1] Oh, come on! It wasn't a big deal, and I knew you would overreact like this.\n"
63
+ "[SPEAKER0] Overreact? You made a decision that affects both of us without even considering my opinion!\n"
64
+ "[SPEAKER1] Because I didn't have time to sit around waiting for you to make up your mind! Someone had to act.",
65
+ "description": "Multispeaker with different voice descriptions in the system prompt",
66
  },
67
+ "single-speaker-voice-description": {
68
  "system_prompt": "Generate audio following instruction.\n\n"
69
  "<|scene_desc_start|>\n"
70
+ "SPEAKER0: He speaks with a clear British accent and a conversational, inquisitive tone. His delivery is articulate and at a moderate pace, and very clear audio.\n"
71
  "<|scene_desc_end|>",
72
  "input_text": "Hey, everyone! Welcome back to Tech Talk Tuesdays.\n"
73
  "It's your host, Alex, and today, we're diving into a topic that's become absolutely crucial in the tech world — deep learning.\n"
74
  "And let's be honest, if you've been even remotely connected to tech, AI, or machine learning lately, you know that deep learning is everywhere.\n"
75
  "\n"
76
  "So here's the big question: Do you want to understand how deep learning works?\n",
77
+ "description": "Single speaker with voice description in the system prompt",
78
  },
79
  "single-speaker-zh": {
80
  "system_prompt": "Generate audio following instruction.\n\n"
81
  "<|scene_desc_start|>\n"
82
+ "Audio is recorded from a quiet room.\n"
 
83
  "<|scene_desc_end|>",
84
  "input_text": "大家好, 欢迎收听本期的跟李沐学AI. 今天沐哥在忙着洗数据, 所以由我, 希格斯主播代替他讲这期视频.\n"
85
  "今天我们要聊的是一个你绝对不能忽视的话题: 多模态学习.\n"
86
  "那么, 问题来了, 你真的了解多模态吗? 你知道如何自己动手构建多模态大模型吗.\n"
87
  "或者说, 你能察觉到我其实是个机器人吗?",
88
+ "description": "Single speaker speaking Chinese",
89
+ },
90
+ "single-speaker-bgm": {
91
+ "system_prompt": DEFAULT_SYSTEM_PROMPT,
92
+ "input_text": "<SE_s>[Music]</SE_s> I will remember this, thought Ender, when I am defeated. To keep dignity, and give honor where it's due, so that defeat is not disgrace. And I hope I don't have to do it often. <SE_e>[Music]</SE_e>",
93
+ "description": "Single speaker with BGM using music tag. This is an experimental feature and may need to try multiple times to get the best result.",
94
  },
95
  }
96
 
 
138
  return voice_path, text
139
 
140
 
141
def normalize_chinese_punctuation(text):
    """
    Convert Chinese (full-width) punctuation marks to English (half-width) equivalents.

    Args:
        text: Input string, possibly containing Chinese punctuation.

    Returns:
        A copy of ``text`` in which every mapped punctuation mark has been
        replaced. Characters that are not in the mapping (including ASCII
        punctuation) are returned unchanged.
    """
    # Keys are spelled as explicit Unicode escapes on purpose: the full-width
    # forms are visually close to ASCII and get folded into their ASCII
    # look-alikes by compatibility-normalizing tools. A folded key turns the
    # entry into a no-op and, for the comma, would insert a space after every
    # ASCII comma instead of converting the full-width one.
    chinese_to_english_punct = {
        "\uff0c": ", ",  # full-width comma (a space is added after it)
        "\u3002": ".",  # ideographic period
        "\uff1a": ":",  # full-width colon
        "\uff1b": ";",  # full-width semicolon
        "\uff1f": "?",  # full-width question mark
        "\uff01": "!",  # full-width exclamation mark
        "\uff08": "(",  # full-width left parenthesis
        "\uff09": ")",  # full-width right parenthesis
        "\u3010": "[",  # left black lenticular bracket
        "\u3011": "]",  # right black lenticular bracket
        "\u300a": "<",  # left double angle quote
        "\u300b": ">",  # right double angle quote
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
        "\u3001": ",",  # enumeration comma
        "\u2014": "-",  # em dash
        "\u2026": "...",  # ellipsis
        "\u00b7": ".",  # middle dot
        "\u300c": '"',  # left corner bracket
        "\u300d": '"',  # right corner bracket
        "\u300e": '"',  # left double corner bracket
        "\u300f": '"',  # right double corner bracket
    }

    # Every key is a single code point and no replacement contains a key, so
    # one translate() pass gives the same result as replacing key by key.
    return text.translate(str.maketrans(chinese_to_english_punct))
178
+
179
+
180
def normalize_text(transcript: str):
    """
    Clean up a transcript before it is placed in the user message.

    Steps, in order: convert Chinese punctuation, blank out parentheses,
    spell out the degree units, drop blank lines and collapse whitespace
    runs inside each remaining line, trim the result, and append a period
    when the text does not already end in one of the accepted terminators.

    Args:
        transcript: Raw text to normalize.

    Returns:
        The normalized transcript.
    """
    cleaned = normalize_chinese_punctuation(transcript)

    # Other normalizations (e.g., parentheses and other symbols. Will be improved in the future)
    substitutions = (
        ("(", " "),
        (")", " "),
        ("°F", " degrees Fahrenheit"),
        ("°C", " degrees Celsius"),
    )
    for needle, spoken in substitutions:
        cleaned = cleaned.replace(needle, spoken)

    # Keep only non-blank lines; inside each one squeeze whitespace runs to a single space.
    kept_lines = []
    for raw_line in cleaned.split("\n"):
        if raw_line.strip():
            kept_lines.append(" ".join(raw_line.split()))
    cleaned = "\n".join(kept_lines).strip()

    # Text that already ends in punctuation, a quote, or a closing sound-effect tag is left as is.
    terminators = (".", "!", "?", ",", ";", '"', "'", "</SE_e>", "</SE>")
    if cleaned.endswith(terminators):
        return cleaned
    return cleaned + "."
195
+
196
+
197
  @spaces.GPU
198
  def initialize_engine(model_path, audio_tokenizer_path) -> bool:
199
  """Initialize the HiggsAudioServeEngine."""
 
264
  messages.append(Message(role="assistant", content=[audio_content]))
265
 
266
  # Add the main user message
267
+ text = normalize_text(text)
268
  messages.append(Message(role="user", content=text))
269
 
270
  return ChatMLSample(messages=messages)
 
282
  top_k=50,
283
  system_prompt=DEFAULT_SYSTEM_PROMPT,
284
  stop_strings=None,
285
+ ras_win_len=20,
286
+ ras_win_max_num_repeat=2,
287
  ):
288
  """Convert text to speech using HiggsAudioServeEngine."""
289
  global engine
 
304
  request_id = f"tts-playground-{str(uuid.uuid4())}"
305
  logger.info(
306
  f"{request_id}: Generating speech for text: {text[:100]}..., \n"
307
+ f"with parameters: temperature={temperature}, top_p={top_p}, top_k={top_k}, stop_list={stop_list}, "
308
+ f"ras_win_len={ras_win_len}, ras_win_max_num_repeat={ras_win_max_num_repeat}"
309
  )
310
  start_time = time.time()
311
 
 
317
  top_k=top_k if top_k > 0 else None,
318
  top_p=top_p,
319
  stop_strings=stop_list,
320
+ ras_win_len=ras_win_len if ras_win_len > 0 else None,
321
+ ras_win_max_num_repeat=max(ras_win_len, ras_win_max_num_repeat),
322
  )
323
 
324
  generation_time = time.time() - start_time
 
382
  }
383
  """
384
 
385
+ default_template = "smart-voice"
386
 
387
  """Create the Gradio UI."""
388
  with gr.Blocks(theme=my_theme, css=custom_css) as demo:
 
399
  info="Select a predefined example for system and input messages. Voice preset will be set to EMPTY when a example is selected.",
400
  )
401
 
402
+ # Template description display
403
+ template_description = gr.HTML(
404
+ value=f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {PREDEFINED_EXAMPLES[default_template]["description"]}</p>',
405
+ visible=True,
406
+ )
407
+
408
  system_prompt = gr.TextArea(
409
  label="System Prompt",
410
  placeholder="Enter system prompt to guide the model...",
 
454
  )
455
  top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top P")
456
  top_k = gr.Slider(minimum=-1, maximum=100, value=50, step=1, label="Top K")
457
+ ras_win_len = gr.Slider(
458
+ minimum=0,
459
+ maximum=10,
460
+ value=0,
461
+ step=1,
462
+ label="RAS Window Length",
463
+ info="Window length for repetition avoidance sampling",
464
+ )
465
+ ras_win_max_num_repeat = gr.Slider(
466
+ minimum=1,
467
+ maximum=10,
468
+ value=2,
469
+ step=1,
470
+ label="RAS Max Num Repeat",
471
+ info="Maximum number of repetitions allowed in the window",
472
+ )
473
  # Add stop strings component
474
  stop_strings = gr.Dataframe(
475
  label="Stop Strings",
 
529
  template = PREDEFINED_EXAMPLES[template_name]
530
  # Enable voice preset and custom reference only for voice-clone template
531
  is_voice_clone = template_name == "voice-clone"
532
+ voice_preset_value = "belinda" if is_voice_clone else "EMPTY"
533
+ description_text = f'<p style="font-size: 0.85em; color: var(--body-text-color-subdued); margin: 0; padding: 0;"> {template["description"]}</p>'
534
  return (
535
  template["system_prompt"], # system_prompt
536
  template["input_text"], # input_text
537
+ description_text, # template_description
538
  gr.update(
539
  value=voice_preset_value, interactive=is_voice_clone, visible=is_voice_clone
540
  ), # voice_preset (value and interactivity)
 
548
  gr.update(),
549
  gr.update(),
550
  gr.update(),
551
+ gr.update(),
552
  ) # No change if template not found
553
 
554
  # Set up event handlers
 
560
  outputs=[
561
  system_prompt,
562
  input_text,
563
+ template_description,
564
  voice_preset,
565
  custom_reference_accordion,
566
  voice_samples_section,
 
581
  top_k,
582
  system_prompt,
583
  stop_strings,
584
+ ras_win_len,
585
+ ras_win_max_num_repeat,
586
  ],
587
  outputs=[output_text, output_audio],
588
  api_name="generate_speech",
higgs_audio/serve/serve_engine.py CHANGED
@@ -27,6 +27,45 @@ from ..data_collator.higgs_audio_collator import HiggsAudioSampleCollator
27
  from ..audio_processing.higgs_audio_tokenizer import load_higgs_audio_tokenizer
28
 
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  @dataclass
31
  class HiggsAudioStreamerDelta:
32
  """Represents a chunk of generated content, either text or audio tokens."""
@@ -422,3 +461,14 @@ class HiggsAudioServeEngine:
422
  "cached_tokens": 0,
423
  },
424
  )
 
 
 
 
 
 
 
 
 
 
 
 
27
  from ..audio_processing.higgs_audio_tokenizer import load_higgs_audio_tokenizer
28
 
29
 
30
def normalize_chinese_punctuation(text):
    """
    Convert Chinese (full-width) punctuation marks to English (half-width) equivalents.

    Args:
        text: Input string, possibly containing Chinese punctuation.

    Returns:
        A copy of ``text`` in which every mapped punctuation mark has been
        replaced. Characters that are not in the mapping (including ASCII
        punctuation) are returned unchanged.
    """
    # Keys are spelled as explicit Unicode escapes on purpose: the full-width
    # forms are visually close to ASCII and get folded into their ASCII
    # look-alikes by compatibility-normalizing tools, which silently turns
    # the affected entries into no-ops.
    chinese_to_english_punct = {
        "\uff0c": ",",  # full-width comma
        "\u3002": ".",  # ideographic period
        "\uff1a": ":",  # full-width colon
        "\uff1b": ";",  # full-width semicolon
        "\uff1f": "?",  # full-width question mark
        "\uff01": "!",  # full-width exclamation mark
        "\uff08": "(",  # full-width left parenthesis
        "\uff09": ")",  # full-width right parenthesis
        "\u3010": "[",  # left black lenticular bracket
        "\u3011": "]",  # right black lenticular bracket
        "\u300a": "<",  # left double angle quote
        "\u300b": ">",  # right double angle quote
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
        "\u3001": ",",  # enumeration comma
        "\u2014": "-",  # em dash
        "\u2026": "...",  # ellipsis
        "\u00b7": ".",  # middle dot
        "\u300c": '"',  # left corner bracket
        "\u300d": '"',  # right corner bracket
        "\u300e": '"',  # left double corner bracket
        "\u300f": '"',  # right double corner bracket
    }

    # Every key is a single code point and no replacement contains a key, so
    # one translate() pass gives the same result as replacing key by key.
    return text.translate(str.maketrans(chinese_to_english_punct))
67
+
68
+
69
  @dataclass
70
  class HiggsAudioStreamerDelta:
71
  """Represents a chunk of generated content, either text or audio tokens."""
 
461
  "cached_tokens": 0,
462
  },
463
  )
464
+
465
def text_normalize(self, text: str) -> str:
    """
    Normalize the text.

    Converts Chinese punctuation through ``normalize_chinese_punctuation``
    and then replaces each ASCII parenthesis with a single space.

    Args:
        text: Text to normalize.

    Returns:
        The normalized text.
    """
    # Basic normalization first, so any parentheses it produces are handled below.
    converted = normalize_chinese_punctuation(text)
    # Handle parentheses: each "(" and ")" becomes one space.
    return converted.replace("(", " ").replace(")", " ")
voice_examples/{wizard.wav → belinda.wav} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:83bda9cd63be92366ef40dbe15c33e67b78766fb7069609f10dfc05cc626deba
3
- size 1246508
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e663310bfe539efac3350fd6b277214dcddd65d5a46949180f11c719c8b9b769
3
+ size 896776
voice_examples/broom_salesman.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9cb4f37dcac12227045845c07c8aef823519cbf7b62bcbc6223158f9d282e1a
3
+ size 3383338
voice_examples/chadwick.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:166acd9a8d8bf3e205bf8217dfd47f8232437c0ea128c326bd1a9060c099e003
3
+ size 458796
voice_examples/config.json CHANGED
@@ -1,6 +1,34 @@
1
  {
2
- "wizard": {
3
- "transcript": "I would imagine so. A wand with a dragon heartstring core is capable of dazzling magic.",
4
- "audio_file": "wizard.wav"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  }
6
  }
 
1
  {
2
+ "belinda": {
3
+ "transcript": "Twas the night before my birthday. Hooray! It's almost here! It may not be a holiday, but it's the best day of the year.",
4
+ "audio_file": "belinda.wav"
5
+ },
6
+ "broom_salesman": {
7
+ "transcript": "I would imagine so. A wand with a dragon heartstring core is capable of dazzling magic. And the bond between you and your wand should only grow stronger. Do not be surprised at your new wand's ability to perceive your intentions - particularly in a moment of need.",
8
+ "audio_file": "broom_salesman.wav"
9
+ },
10
+ "chadwick": {
11
+ "transcript": "Oh dear, who left all this junk lying around? Whoops, there it goes! Mind your pointed little pink head, starfish man.",
12
+ "audio_file": "chadwick.wav"
13
+ },
14
+ "en_man": {
15
+ "transcript": "Maintaining your ability to learn translates into increased marketability, improved career options and higher salaries.",
16
+ "audio_file": "en_man.wav"
17
+ },
18
+ "en_woman": {
19
+ "transcript": "The device would work during the day as well, if you took steps to either block direct sunlight or point it away from the sun.",
20
+ "audio_file": "en_woman.wav"
21
+ },
22
+ "mabel": {
23
+ "transcript": "You do talk an awful lot about weather, did you know that? Sometimes I wonder if you're actually content to be a wizard or if you're secretly harbouring a desire to become a seer of the clouds.",
24
+ "audio_file": "mabel.wav"
25
+ },
26
+ "vex": {
27
+ "transcript": "Uhh, this is going to take forever. Why is everything so far?",
28
+ "audio_file": "vex.wav"
29
+ },
30
+ "zh_man_sichuan": {
31
+ "transcript": "对,这就是我,万人敬仰的太乙真人,虽然有点婴儿肥,但也掩不住我逼人的帅气。",
32
+ "audio_file": "zh_man_sichuan.wav"
33
  }
34
  }
voice_examples/en_man.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ca3df71ad1b6968765e69870220d34c6b2c2550a499cf59560d9d764d10b94e
3
+ size 375566
voice_examples/en_woman.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1d49dc69f3b0731ed7b10ddf51dfc8f73465d4323f45841d93583d8b1e4d3e6
3
+ size 313272
voice_examples/mabel.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e6c5e522c662c5d6b862d8b17e1618546666ce993dcd560f3bdd34a48bacd9f
3
+ size 1054730
voice_examples/vex.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d95c6dcf7265847edd76989ffb2d3f5a92aa3e2bbd3718317010b49842c98954
3
+ size 523086
voice_examples/zh_man_sichuan.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53892ece071342958403bc5643f84169a30b89cc0fc79eb69508bfa11dd85e68
3
+ size 618528