zach committed on
Commit 5bf19b3 · 1 Parent(s): 701fd0f

Update application flow to accept a character description and normalize a prompt for Claude

README.md CHANGED
@@ -98,7 +98,7 @@ Expressive TTS Arena/
 
 ## User Flow
 
-1. **Enter or Generate Text:** Type directly in the Text box, or optionally enter a Prompt, click "Generate text", and edit if needed.
+1. **Enter or Generate Text:** Type directly in the Text box, or optionally enter a Character description, click "Generate text", and edit if needed.
 2. **Synthesize Speech:** Click "Synthesize speech" to generate two audio outputs.
 3. **Listen & Compare:** Playback both options (A & B) to hear the differences.
 4. **Vote for Your Favorite:** Click "Vote for option A" or "Vote for option B" to choose your favorite.
src/app.py CHANGED
@@ -3,8 +3,8 @@ app.py
 
 Gradio UI for interacting with the Anthropic API, Hume TTS API, and ElevenLabs TTS API.
 
-Users enter a prompt, which is processed using Claude by Anthropic to generate text.
-The text is then synthesized into speech using both Hume and ElevenLabs text-to-speech (TTS) APIs.
+Users enter a character description, which is processed using Claude by Anthropic to generate text.
+The text is then synthesized into speech using different TTS provider APIs.
 Users can compare the outputs and vote for their favorite in an interactive UI.
 """
 
@@ -19,19 +19,7 @@ import gradio as gr
 
 # Local Application Imports
 from src.config import AUDIO_DIR, logger
-from src.constants import (
-    ELEVENLABS,
-    HUME_AI,
-    OPTION_A,
-    OPTION_B,
-    PROMPT_MAX_LENGTH,
-    PROMPT_MIN_LENGTH,
-    SAMPLE_PROMPTS,
-    TROPHY_EMOJI,
-    TTS_PROVIDERS,
-    VOTE_FOR_OPTION_A,
-    VOTE_FOR_OPTION_B,
-)
+from src import constants
 from src.integrations import (
     AnthropicError,
     ElevenLabsError,
@@ -41,18 +29,18 @@ from src.integrations import (
     text_to_speech_with_hume,
 )
 from src.theme import CustomTheme
-from src.types import OptionMap
-from src.utils import validate_prompt_length
+from src.types import ComparisonType, OptionMap, VotingResults
+from src.utils import validate_character_description_length
 
 
 def generate_text(
-    prompt: str,
+    character_description: str,
 ) -> Tuple[Union[str, gr.update], gr.update]:
     """
-    Validates the prompt and generates text using Anthropic API.
+    Validates the character_description and generates text using Anthropic API.
 
     Args:
-        prompt (str): The user-provided text prompt.
+        character_description (str): The user-provided text for character description.
 
     Returns:
         Tuple containing:
@@ -63,13 +51,13 @@ def generate_text(
         gr.Error: On validation or API errors.
     """
     try:
-        validate_prompt_length(prompt, PROMPT_MAX_LENGTH, PROMPT_MIN_LENGTH)
+        validate_character_description_length(character_description)
    except ValueError as ve:
        logger.warning(f"Validation error: {ve}")
        raise gr.Error(str(ve))
 
     try:
-        generated_text = generate_text_with_claude(prompt)
+        generated_text = generate_text_with_claude(character_description)
         logger.info(f"Generated text ({len(generated_text)} characters).")
         return gr.update(value=generated_text), generated_text
     except AnthropicError as ae:
@@ -83,7 +71,7 @@ def generate_text(
 
 
 def text_to_speech(
-    prompt: str, text: str, generated_text_state: str
+    character_description: str, text: str, generated_text_state: str
 ) -> Tuple[gr.update, gr.update, dict, Union[str, None]]:
     """
     Synthesizes two text to speech outputs, loads the two audio players with the
@@ -92,7 +80,7 @@ def text_to_speech(
     - 50% chance to synthesize two Hume outputs.
 
     Args:
-        prompt (str): The original prompt.
+        character_description (str): The original character_description.
         text (str): The text to synthesize to speech.
 
     Returns:
@@ -110,41 +98,59 @@ def text_to_speech(
         raise gr.Error("Please generate or enter text to synthesize.")
 
     # Hume AI always included in comparison
-    provider_a = HUME_AI
+    provider_a = constants.HUME_AI
     # If not using generated text, then only compare Hume to Hume
-    provider_b = (
-        HUME_AI if text != generated_text_state else random.choice(TTS_PROVIDERS)
+    text_modified = text != generated_text_state
+    provider_b: constants.TTSProviderName = (
+        constants.HUME_AI if text_modified else random.choice(constants.TTS_PROVIDERS)
     )
 
     try:
         with ThreadPoolExecutor(max_workers=2) as executor:
-            future_audio_a = executor.submit(text_to_speech_with_hume, prompt, text)
+            future_audio_a = executor.submit(
+                text_to_speech_with_hume, character_description, text
+            )
 
             match provider_b:
-                case ELEVENLABS:
+                case constants.HUME_AI:
+                    comparison_type: ComparisonType = constants.HUME_TO_HUME
                     future_audio_b = executor.submit(
-                        text_to_speech_with_elevenlabs, prompt, text
+                        text_to_speech_with_hume, character_description, text
                    )
-                case HUME_AI:
+                case constants.ELEVENLABS:
+                    comparison_type: ComparisonType = constants.HUME_TO_ELEVENLABS
                     future_audio_b = executor.submit(
-                        text_to_speech_with_hume, prompt, text
+                        text_to_speech_with_elevenlabs, character_description, text
                     )
                 case _:
                     raise ValueError(f"Unsupported provider: {provider_b}")
 
-            audio_a = future_audio_a.result()
-            audio_b = future_audio_b.result()
+            generation_id_a, audio_a = future_audio_a.result()
+            generation_id_b, audio_b = future_audio_b.result()
 
-        options = [(audio_a, provider_a), (audio_b, provider_b)]
+        options = [
+            (provider_a, audio_a, generation_id_a),
+            (provider_b, audio_b, generation_id_b),
+        ]
         random.shuffle(options)
-        option_a_audio, option_b_audio = options[0][0], options[1][0]
-        options_map: OptionMap = {OPTION_A: options[0][1], OPTION_B: options[1][1]}
+        options_map: OptionMap = {
+            constants.OPTION_A: options[0][0],
+            constants.OPTION_B: options[1][0],
+        }
+        option_a_audio, option_b_audio = options[0][1], options[1][1]
+        option_a_generation_id, option_b_generation_id = options[0][2], options[1][2]
 
         return (
             gr.update(value=option_a_audio, visible=True, autoplay=True),
             gr.update(value=option_b_audio, visible=True),
             options_map,
             option_b_audio,
+            comparison_type,
+            option_a_generation_id,
+            option_b_generation_id,
+            text_modified,
+            text,
+            character_description,
        )
    except ElevenLabsError as ee:
        logger.error(f"ElevenLabsError while synthesizing speech from text: {str(ee)}")
@@ -162,7 +168,15 @@ def text_to_speech(
 
 
 def vote(
-    vote_submitted: bool, option_map: OptionMap, selected_button: str
+    vote_submitted: bool,
+    option_map: OptionMap,
+    selected_button: str,
+    comparison_type: ComparisonType,
+    option_a_generation_id: str,
+    option_b_generation_id: str,
+    text_modified: bool,
+    character_description: str,
+    text: str,
 ) -> Tuple[bool, gr.update, gr.update, gr.update]:
     """
     Handles user voting.
@@ -187,17 +201,35 @@ def vote(
     if not option_map or vote_submitted:
         return gr.skip(), gr.skip(), gr.skip(), gr.skip()
 
-    option_a_selected = selected_button == VOTE_FOR_OPTION_A
+    option_a_selected = selected_button == constants.VOTE_FOR_OPTION_A
     selected_option, other_option = (
-        (OPTION_A, OPTION_B) if option_a_selected else (OPTION_B, OPTION_A)
+        (constants.OPTION_A, constants.OPTION_B)
+        if option_a_selected
+        else (constants.OPTION_B, constants.OPTION_A)
     )
     selected_provider = option_map.get(selected_option)
     other_provider = option_map.get(other_option)
 
     # Build button labels, displaying the provider and voice name, appending the trophy emoji to the selected option.
-    selected_label = f"{selected_provider} {TROPHY_EMOJI}"
+    selected_label = f"{selected_provider} {constants.TROPHY_EMOJI}"
     other_label = f"{other_provider}"
 
+    # Report voting results to be persisted to results DB
+    voting_results: VotingResults = {
+        "comparison_type": comparison_type,
+        "winning_provider": selected_provider,
+        "winning_option": selected_option,
+        "option_a_provider": option_map.get(constants.OPTION_A),
+        "option_b_provider": option_map.get(constants.OPTION_B),
+        "option_a_generation_id": option_a_generation_id,
+        "option_b_generation_id": option_b_generation_id,
+        "character_description": character_description,
+        "text": text,
+        "is_custom_text": text_modified,
+    }
+    # TODO: Currently logging the results until we hook the API for writing results to DB
+    logger.info("Voting results:\n%s", json.dumps(voting_results, indent=4))
+
     return (
         True,
         (
@@ -231,8 +263,8 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None,
     return (
         gr.update(value=None),
         gr.update(value=None, autoplay=False),
-        gr.update(value=VOTE_FOR_OPTION_A, variant="secondary"),
-        gr.update(value=VOTE_FOR_OPTION_B, variant="secondary"),
+        gr.update(value=constants.VOTE_FOR_OPTION_A, variant="secondary"),
+        gr.update(value=constants.VOTE_FOR_OPTION_B, variant="secondary"),
        None,
        None,
        False,
@@ -240,34 +272,34 @@
 
 
 def build_input_section() -> Tuple[gr.Markdown, gr.Dropdown, gr.Textbox, gr.Button]:
-    """Builds the input section including instructions, sample prompt dropdown, prompt input, and generate button"""
+    """Builds the input section including instructions, sample character description dropdown, character description input, and generate button"""
     instructions = gr.Markdown(
         """
-        1. **Enter or Generate Text:** Type directly in the text box—or enter a prompt and click “Generate Text” to auto-populate. Edit as needed.
+        1. **Enter or Generate Text:** Type directly in the text box—or enter a character description and click “Generate Text” to auto-populate. Edit as needed.
         2. **Synthesize Speech:** Click “Synthesize Speech” to generate two audio outputs.
         3. **Listen & Compare:** Play back both audio options to hear the differences.
         4. **Vote for Your Favorite:** Click “Vote for Option A” or “Vote for Option B” to cast your vote.
         """
     )
-    sample_prompt_dropdown = gr.Dropdown(
-        choices=list(SAMPLE_PROMPTS.keys()),
-        label="Choose a sample prompt (or enter your own)",
+    sample_character_description_dropdown = gr.Dropdown(
+        choices=list(constants.SAMPLE_CHARACTER_DESCRIPTIONS.keys()),
+        label="Choose a sample character description (or enter your own)",
        value=None,
        interactive=True,
    )
-    prompt_input = gr.Textbox(
-        label="Prompt",
-        placeholder="Enter your prompt...",
+    character_description_input = gr.Textbox(
+        label="Character Description",
+        placeholder="Enter your character description to be used to generate text and a novel voice...",
        lines=3,
        max_lines=8,
-        max_length=PROMPT_MAX_LENGTH,
+        max_length=constants.CHARACTER_DESCRIPTION_MAX_LENGTH,
        show_copy_button=True,
    )
    generate_text_button = gr.Button("Generate text", variant="secondary")
    return (
        instructions,
-        sample_prompt_dropdown,
-        prompt_input,
+        sample_character_description_dropdown,
+        character_description_input,
        generate_text_button,
    )
 
@@ -283,20 +315,20 @@ def build_output_section() -> (
         autoscroll=False,
         lines=3,
         max_lines=8,
-        max_length=PROMPT_MAX_LENGTH,
+        max_length=constants.CHARACTER_DESCRIPTION_MAX_LENGTH,
        show_copy_button=True,
    )
    synthesize_speech_button = gr.Button("Synthesize speech", variant="primary")
    with gr.Row(equal_height=True):
        option_a_audio_player = gr.Audio(
-            label=OPTION_A, type="filepath", interactive=False
+            label=constants.OPTION_A, type="filepath", interactive=False
        )
        option_b_audio_player = gr.Audio(
-            label=OPTION_B, type="filepath", interactive=False
+            label=constants.OPTION_B, type="filepath", interactive=False
        )
    with gr.Row(equal_height=True):
-        vote_button_a = gr.Button(VOTE_FOR_OPTION_A, interactive=False)
-        vote_button_b = gr.Button(VOTE_FOR_OPTION_B, interactive=False)
+        vote_button_a = gr.Button(constants.VOTE_FOR_OPTION_A, interactive=False)
+        vote_button_b = gr.Button(constants.VOTE_FOR_OPTION_B, interactive=False)
    return (
        text_input,
        synthesize_speech_button,
@@ -325,9 +357,12 @@ def build_gradio_interface() -> gr.Blocks:
         gr.Markdown("# Expressive TTS Arena")
 
         # Build generate text section
-        (instructions, sample_prompt_dropdown, prompt_input, generate_text_button) = (
-            build_input_section()
-        )
+        (
+            instructions,
+            sample_character_description_dropdown,
+            character_description_input,
+            generate_text_button,
+        ) = build_input_section()
 
         # Build synthesize speech section
         (
@@ -341,6 +376,18 @@
 
         # --- UI state components ---
 
+        # Track text used for speech synthesis
+        text_state = gr.State("")
+        # Track character description used for text and voice generation
+        character_description_state = gr.State("")
+        # Track comparison type (which set of providers are being compared)
+        comparison_type_state = gr.State()
+        # Track generation ID for Option A
+        option_a_generation_id_state = gr.State()
+        # Track generation ID for Option B
+        option_b_generation_id_state = gr.State()
+        # Track whether text that was used was generated or modified/custom
+        text_modified_state = gr.State()
         # Track generated text state
         generated_text_state = gr.State("")
         # Track generated audio for option B for playing automatically after option 1 audio finishes
@@ -352,11 +399,11 @@
 
         # --- Register event handlers ---
 
-        # When a sample prompt is chosen, update the prompt textbox
-        sample_prompt_dropdown.change(
-            fn=lambda choice: SAMPLE_PROMPTS.get(choice, ""),
-            inputs=[sample_prompt_dropdown],
-            outputs=[prompt_input],
+        # When a sample character description is chosen, update the character description textbox
+        sample_character_description_dropdown.change(
+            fn=lambda choice: constants.SAMPLE_CHARACTER_DESCRIPTIONS.get(choice, ""),
+            inputs=[sample_character_description_dropdown],
+            outputs=[character_description_input],
        )
 
         # Generate text button click handler chain:
@@ -369,7 +416,7 @@
             outputs=[generate_text_button],
         ).then(
             fn=generate_text,
-            inputs=[prompt_input],
+            inputs=[character_description_input],
            outputs=[text_input, generated_text_state],
        ).then(
            fn=lambda: gr.update(interactive=True),
@@ -404,12 +451,18 @@
             ],
         ).then(
             fn=text_to_speech,
-            inputs=[prompt_input, text_input, generated_text_state],
+            inputs=[character_description_input, text_input, generated_text_state],
            outputs=[
                option_a_audio_player,
                option_b_audio_player,
                option_map_state,
                option_b_audio_state,
+                comparison_type_state,
+                option_a_generation_id_state,
+                option_b_generation_id_state,
+                text_modified_state,
+                text_state,
+                character_description_state,
            ],
        ).then(
            fn=lambda: (
@@ -430,6 +483,12 @@
                 vote_button_a,
                 vote_button_b,
                 synthesize_speech_button,
+                comparison_type_state,
+                option_a_generation_id_state,
+                option_b_generation_id_state,
+                text_modified_state,
+                character_description_state,
+                text_state,
            ],
        )
        vote_button_b.click(
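
Reviewer note: a minimal sketch of the new pairing logic in `text_to_speech`. After shuffling, index 0 of `options` always becomes Option A, so the voter cannot tell which provider produced which player. The file paths and generation IDs below are hypothetical placeholders, not real API output.

```python
import random

HUME_AI, ELEVENLABS = "Hume AI", "ElevenLabs"

# (provider, file_path, generation_id); values are illustrative only.
options = [
    (HUME_AI, "audio/a1b2.mp3", "gen-a1b2"),
    (ELEVENLABS, "audio/c3d4.mp3", None),  # ElevenLabs returns None for generation_id
]
random.shuffle(options)  # randomize which provider lands on Option A

options_map = {"Option A": options[0][0], "Option B": options[1][0]}
option_a_audio, option_b_audio = options[0][1], options[1][1]
option_a_generation_id, option_b_generation_id = options[0][2], options[1][2]
print(options_map)
```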
src/constants.py CHANGED
@@ -8,16 +8,19 @@ This module defines global constants used throughout the project.
 from typing import List
 
 # Third-Party Library Imports
-from src.types import OptionKey, TTSProviderName
+from src.types import ComparisonType, OptionKey, TTSProviderName
 
 
 # UI constants
 HUME_AI: TTSProviderName = "Hume AI"
 ELEVENLABS: TTSProviderName = "ElevenLabs"
-TTS_PROVIDERS: List[TTSProviderName]
+TTS_PROVIDERS: List[TTSProviderName] = ["Hume AI", "ElevenLabs"]
 
-PROMPT_MIN_LENGTH: int = 20
-PROMPT_MAX_LENGTH: int = 800
+HUME_TO_HUME: ComparisonType = "Hume AI - Hume AI"
+HUME_TO_ELEVENLABS: ComparisonType = "Hume AI - ElevenLabs"
+
+CHARACTER_DESCRIPTION_MIN_LENGTH: int = 20
+CHARACTER_DESCRIPTION_MAX_LENGTH: int = 800
 
 OPTION_A: OptionKey = "Option A"
 OPTION_B: OptionKey = "Option B"
@@ -26,41 +29,27 @@ VOTE_FOR_OPTION_A: str = "Vote for option A"
 VOTE_FOR_OPTION_B: str = "Vote for option B"
 
 
-# A collection of pre-defined prompts categorized by theme, used to provide users with
-# inspiration for generating creative text for expressive TTS.
-SAMPLE_PROMPTS: dict = {
-    "🚀 Dramatic Monologue (Stranded Astronaut)": (
-        "Create a poignant final transmission from a lone astronaut on Mars to mission control. "
-        "Voice: low, measured pace, with subtle tremors of emotion. Content should move from "
-        "awe-struck description of the Martian sunset to peaceful acceptance. Include natural "
-        "pauses for emotional weight. Keep the tone intimate and contemplative, as if speaking "
-        "softly into a radio mic. End with dignified finality."
+# A collection of pre-defined character descriptions categorized by theme, used to provide users with
+# inspiration for generating creative text for expressive TTS, and generating novel voices.
+SAMPLE_CHARACTER_DESCRIPTIONS: dict = {
+    "🚀 Stranded Astronaut": (
+        "A lone astronaut whose voice mirrors the silent vastness of space—a low, steady tone imbued with isolation and quiet wonder. "
+        "It carries the measured resolve of someone sending a final transmission, with an undercurrent of wistful melancholy."
    ),
-    "📜 Poetic Sonnet (The Passage of Time)": (
-        "Craft a sonnet about time's flow, suitable for measured, resonant delivery. "
-        "Voice: clear, rhythmic, with careful emphasis on key metaphors. Flow from quiet "
-        "reflection to profound realization. Include strategic pauses between quatrains. "
-        "Balance crisp consonants with flowing vowels for musical quality. Maintain consistent "
-        "meter for natural speech rhythm."
+    "📜 Timeless Poet": (
+        "An ageless poet with a voice that flows like gentle verse—a soft, reflective tone marked by deliberate pauses. "
+        "It speaks with the measured cadence of classic sonnets, evoking both the fragile beauty of time and heartfelt introspection."
    ),
-    "🐱 Whimsical Children's Story (Talking Cat)": (
-        "Tell a playful tale of a curious cat's magical library adventure. "
-        "Voice: bright, energetic, with clear character distinctions. Mix whispered "
-        "conspiracies with excited discoveries. Include dramatic pauses for suspense "
-        "and giggles. Use bouncy rhythm for action scenes, slower pace for wonder. "
-        "End with warm, gentle closure perfect for bedtime."
+    "🐱 Whimsical Feline": (
+        "A mischievous cat whose voice is playful yet mysterious—light, quick-witted, and infused with an enchanting purr. "
+        "It hints at secret adventures and hidden charm, balancing exuberance with a subtle, smooth allure."
    ),
-    "🔥 Intense Speech (Freedom & Justice)": (
-        "Deliver a rousing resistance speech that builds from quiet determination to powerful resolve. "
-        "Voice: start controlled and intense, rise to passionate crescendo. Include strategic "
-        "pauses for impact. Mix shorter, punchy phrases with flowing calls to action. "
-        "Use strong consonants and open vowels for projection. End with unshakeable conviction."
+    "🔥 Revolutionary Orator": (
+        "A defiant orator whose voice builds from quiet determination to passionate fervor—a clear, commanding tone that resonates with conviction. "
+        "It starts measured and resolute, then rises to a crescendo of fervor, punctuated by deliberate pauses that emphasize each rallying cry."
    ),
-    "👻 Mysterious Horror Scene (Haunted Lighthouse)": (
-        "Narrate a spine-chilling lighthouse encounter that escalates from unease to revelation. "
-        "Voice: hushed, tense, with subtle dynamic range. Mix whispers with clearer tones. "
-        "Include extended pauses for tension. Use sibilants and soft consonants for "
-        "atmospheric effect. Build rhythm with the lighthouse's beam pattern. End with haunting "
-        "revelation."
+    "👻 Haunted Keeper": (
+        "A solitary lighthouse keeper with a voice that carries the weight of forgotten storms—a soft, measured tone with an echo of sorrow. "
+        "It speaks as if whispering long-held secrets in the dark, blending quiet melancholy with an air of enduring mystery."
    ),
 }
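
Reviewer note: a short sketch of how the new constants are meant to combine, mirroring the `match` statement in `src/app.py` (assumes the `src` package is importable):

```python
import random

from src.constants import HUME_AI, HUME_TO_ELEVENLABS, HUME_TO_HUME, TTS_PROVIDERS

# Provider B is drawn at random; the matchup is labeled with a ComparisonType value.
provider_b = random.choice(TTS_PROVIDERS)
comparison_type = HUME_TO_HUME if provider_b == HUME_AI else HUME_TO_ELEVENLABS
print(provider_b, "->", comparison_type)
```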
src/integrations/anthropic_api.py CHANGED
@@ -98,6 +98,25 @@ Remember: A shorter, complete response is ALWAYS better than a longer, truncated
         """
         return Anthropic(api_key=self.api_key)
 
+    def build_expressive_prompt(self, character_description: str) -> str:
+        """
+        Constructs and returns a prompt based solely on the provided voice description.
+        The returned prompt is intended to instruct Claude to generate expressive text from a character,
+        capturing the character's personality and emotional nuance, without including the system prompt.
+
+        Args:
+            character_description (str): A description of the character's voice and persona.
+
+        Returns:
+            str: The prompt to be passed to the Anthropic API.
+        """
+        prompt = (
+            f"Character Description: {character_description}\n\n"
+            "Based on the above character description, please generate a line of dialogue that captures the character's unique personality, emotional depth, and distinctive tone. "
+            "The response should sound like something the character would naturally say, reflecting their background and emotional state, and be fully developed for text-to-speech synthesis."
+        )
+        return prompt
+
 
 class AnthropicError(Exception):
     """Custom exception for errors related to the Anthropic API."""
@@ -118,12 +137,12 @@ anthropic_config = AnthropicConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True,
 )
-def generate_text_with_claude(prompt: str) -> str:
+def generate_text_with_claude(character_description: str) -> str:
     """
     Generates text using Claude (Anthropic LLM) via the Anthropic SDK.
 
     Args:
-        prompt (str): The input prompt for Claude.
+        character_description (str): The input character description used to assist with generating text with Claude.
 
     Returns:
         str: The generated text.
@@ -131,8 +150,10 @@ def generate_text_with_claude(prompt: str) -> str:
     Raises:
         AnthropicError: If there is an error communicating with the Anthropic API.
     """
+    # Build prompt for claude with character description
+    prompt = anthropic_config.build_expressive_prompt(character_description)
     logger.debug(
-        f"Generating text with Claude. Prompt length: {len(prompt)} characters."
+        f"Generating text with Claude. Character description length: {len(prompt)} characters."
    )
 
    response = None
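
Reviewer note: a hypothetical call to the new prompt builder via the module-level `anthropic_config` instance shown in the hunk context. The character description string is invented for illustration; constructing `AnthropicConfig` is assumed to require the usual API-key environment configuration.

```python
from src.integrations.anthropic_api import anthropic_config

# Hypothetical character description for illustration only.
prompt = anthropic_config.build_expressive_prompt(
    "A weary night-shift radio host with a warm, gravelly voice and dry humor."
)
# The prompt embeds the description and asks Claude for a single line of dialogue;
# generate_text_with_claude() now builds it the same way before calling the API.
print(prompt)
```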
src/integrations/elevenlabs_api.py CHANGED
@@ -76,16 +76,18 @@ elevenlabs_config = ElevenLabsConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True,
 )
-def text_to_speech_with_elevenlabs(prompt: str, text: str) -> bytes:
+def text_to_speech_with_elevenlabs(character_description: str, text: str) -> bytes:
     """
     Synthesizes text to speech using the ElevenLabs TTS API, processes audio data, and writes audio to a file.
 
     Args:
-        prompt (str): The original user prompt used as the voice description.
+        character_description (str): The original user character description used as the voice description.
         text (str): The text to be synthesized to speech.
 
     Returns:
-        str: The relative path for the file the synthesized audio was written to.
+        Tuple[None, str]: A tuple containing:
+            - generation_id (None): We do not record the generation ID for ElevenLabs, but return None for uniformity across TTS integrations
+            - file_path (str): The relative path to the file where the synthesized audio was saved.
 
     Raises:
         ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
@@ -94,12 +96,10 @@ def text_to_speech_with_elevenlabs(prompt: str, text: str) -> bytes:
         f"Synthesizing speech with ElevenLabs. Text length: {len(text)} characters."
     )
 
-    request_body = {"text": text, "voice_description": prompt}
-
     try:
         # Synthesize speech using the ElevenLabs SDK
         response = elevenlabs_config.client.text_to_voice.create_previews(
-            voice_description=prompt,
+            voice_description=character_description,
            text=text,
            output_format=elevenlabs_config.output_format,
        )
@@ -117,7 +117,7 @@
         filename = f"{generated_voice_id}.mp3"
 
         # Write audio to file and return the relative path
-        return save_base64_audio_to_file(base64_audio, filename)
+        return None, save_base64_audio_to_file(base64_audio, filename)
 
     except Exception as e:
         logger.exception(f"Error synthesizing speech with ElevenLabs: {e}")
src/integrations/hume_api.py CHANGED
@@ -86,25 +86,29 @@ hume_config = HumeConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True,
 )
-def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
+def text_to_speech_with_hume(character_description: str, text: str) -> bytes:
     """
     Synthesizes text to speech using the Hume TTS API, processes audio data, and writes audio to a file.
 
     Args:
-        prompt (str): The original user prompt to use as the description for generating the voice.
+        character_description (str): The original user character description to use as the description for generating the voice.
         text (str): The generated text to be converted to speech.
 
     Returns:
-        str: The relative path for the file the synthesized audio was written to.
+        Tuple[str, str]: A tuple containing:
+            - generation_id (str): The generation ID returned from the Hume API.
+            - file_path (str): The relative path to the file where the synthesized audio was saved.
 
     Raises:
         HumeError: If there is an error communicating with the Hume TTS API or parsing the response.
     """
     logger.debug(
-        f"Processing TTS with Hume. Prompt length: {len(prompt)} characters. Text length: {len(text)} characters."
+        f"Processing TTS with Hume. Prompt length: {len(character_description)} characters. Text length: {len(text)} characters."
    )
 
-    request_body = {"utterances": [{"text": text, "description": prompt}]}
+    request_body = {
+        "utterances": [{"text": text, "description": character_description}]
+    }
 
     try:
         # Synthesize speech using the Hume TTS API
@@ -129,7 +133,7 @@ def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
         filename = f"{generation_id}.mp3"
 
         # Write audio to file and return the relative path
-        return save_base64_audio_to_file(base64_audio, filename)
+        return generation_id, save_base64_audio_to_file(base64_audio, filename)
 
     except Exception as e:
         logger.exception(f"Error synthesizing speech with Hume: {e}")
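
Reviewer note: both TTS integrations now return a `(generation_id, file_path)` tuple (Hume with a real ID, ElevenLabs with `None`), which is what lets `text_to_speech` in `src/app.py` unpack them uniformly. Note that the `-> bytes` annotations on both functions are left unchanged by this commit even though the return type is now a tuple. A caller-side sketch, with hypothetical inputs; running it performs real API calls:

```python
from src.integrations import text_to_speech_with_elevenlabs, text_to_speech_with_hume

# Hypothetical inputs for illustration only.
character_description = "A mischievous cat with a playful, purring voice."
text = "Shall we see what secrets the library holds tonight?"

# Uniform unpacking across providers; generation_id_b is None for ElevenLabs.
generation_id_a, audio_a = text_to_speech_with_hume(character_description, text)
generation_id_b, audio_b = text_to_speech_with_elevenlabs(character_description, text)
```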
src/types.py CHANGED
@@ -27,7 +27,7 @@ OptionMap = Dict[OptionKey, TTSProviderName]
 class VotingResults(TypedDict):
     """Voting results data structure representing values we want to persist to the votes DB"""
 
-    comparison_type: str
+    comparison_type: ComparisonType
     winning_provider: TTSProviderName
     winning_option: OptionKey
     option_a_provider: TTSProviderName
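
Reviewer note: the `ComparisonType` alias itself is not shown in this hunk. A definition consistent with the values assigned in `src/constants.py` would look like the sketch below; these declarations are illustrative, not necessarily the project's exact ones (only the `OptionMap` line appears in the hunk header above).

```python
from typing import Dict, Literal

# Illustrative alias definitions consistent with src/constants.py.
TTSProviderName = Literal["Hume AI", "ElevenLabs"]
OptionKey = Literal["Option A", "Option B"]
ComparisonType = Literal["Hume AI - Hume AI", "Hume AI - ElevenLabs"]
OptionMap = Dict[OptionKey, TTSProviderName]  # shown in the hunk header above
```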
src/utils.py CHANGED
@@ -7,7 +7,7 @@ These functions provide reusable logic to simplify code in other modules.
 Functions:
 - truncate_text: Truncates a string to a specified length with ellipses. (used for logging)
 - validate_env_var: Ensures the presence of a specific environment variable and retrieves its value.
-- validate_prompt_length: Ensures that a prompt does not exceed the specified minimum or maximum length.
+- validate_character_description_length: Ensures that a voice description does not exceed the specified minimum or maximum length.
 """
 
 # Standard Library Imports
@@ -16,6 +16,10 @@ import os
 
 # Local Application Imports
 from src.config import AUDIO_DIR, logger
+from src.constants import (
+    CHARACTER_DESCRIPTION_MIN_LENGTH,
+    CHARACTER_DESCRIPTION_MAX_LENGTH,
+)
 
 
 def truncate_text(text: str, max_length: int = 50) -> str:
@@ -80,42 +84,42 @@ def validate_env_var(var_name: str) -> str:
     return value
 
 
-def validate_prompt_length(prompt: str, max_length: int, min_length: int) -> None:
+def validate_character_description_length(character_description: str) -> None:
     """
-    Validates that a prompt is within specified minimum and maximum length limits.
+    Validates that a voice description is within specified minimum and maximum length limits.
 
     Args:
-        prompt (str): The input prompt to validate.
-        max_length (int): The maximum allowed length for the prompt.
-        min_length (int): The minimum required length for the prompt.
+        character_description (str): The input character description to validate.
 
     Raises:
-        ValueError: If the prompt is empty, too short, or exceeds max_length.
+        ValueError: If the character description is empty, too short, or exceeds max length.
 
     Example:
-        >>> validate_prompt_length("Hello world", max_length=500, min_length=5)
+        >>> validate_character_description_length("This is a character description.")
         # Passes validation
 
-        >>> validate_prompt_length("", max_length=300, min_length=10)
-        # Raises ValueError: "Prompt must be at least 10 characters long."
+        >>> validate_character_description_length("")
+        # Raises ValueError: "Voice Description must be at least 20 characters long."
     """
-    stripped_prompt = prompt.strip()
-    prompt_length = len(stripped_prompt)
+    stripped_character_description = character_description.strip()
+    character_description_length = len(stripped_character_description)
 
-    logger.debug(f"Prompt length being validated: {prompt_length} characters")
+    logger.debug(
+        f"Voice description length being validated: {character_description_length} characters"
+    )
 
-    if prompt_length < min_length:
+    if character_description_length < CHARACTER_DESCRIPTION_MIN_LENGTH:
        raise ValueError(
-            f"Your prompt is too short. Please enter at least {min_length} characters. "
-            f"(Current length: {prompt_length})"
+            f"Your character description is too short. Please enter at least {CHARACTER_DESCRIPTION_MIN_LENGTH} characters. "
+            f"(Current length: {character_description_length})"
        )
-    if prompt_length > max_length:
+    if character_description_length > CHARACTER_DESCRIPTION_MAX_LENGTH:
        raise ValueError(
-            f"Your prompt is too long. Please limit it to {max_length} characters. "
-            f"(Current length: {prompt_length})"
+            f"Your character description is too long. Please limit it to {CHARACTER_DESCRIPTION_MAX_LENGTH} characters. "
+            f"(Current length: {character_description_length})"
        )
    logger.debug(
-        f"Prompt length validation passed for prompt: {truncate_text(stripped_prompt)}"
+        f"Character description length validation passed for character_description: {truncate_text(stripped_character_description)}"
    )
 
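
Reviewer note: a usage sketch of the renamed validator. The length bounds now come from `src.constants` (20 and 800 characters) instead of being passed as arguments:

```python
from src.utils import validate_character_description_length

try:
    validate_character_description_length("Too short")  # 9 characters, below the 20 minimum
except ValueError as ve:
    print(ve)  # "Your character description is too short. Please enter at least 20 characters. ..."
```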