zach committed on
Commit 5bf19b3 · 1 Parent(s): 701fd0f

Update application flow to accept a character description and normalize a prompt for Claude

README.md CHANGED
@@ -98,7 +98,7 @@ Expressive TTS Arena/
 
 ## User Flow
 
-1. **Enter or Generate Text:** Type directly in the Text box, or optionally enter a Prompt, click "Generate text", and edit if needed.
+1. **Enter or Generate Text:** Type directly in the Text box, or optionally enter a Character description, click "Generate text", and edit if needed.
 2. **Synthesize Speech:** Click "Synthesize speech" to generate two audio outputs.
 3. **Listen & Compare:** Playback both options (A & B) to hear the differences.
 4. **Vote for Your Favorite:** Click "Vote for option A" or "Vote for option B" to choose your favorite.
src/app.py CHANGED
@@ -3,8 +3,8 @@ app.py
 
 Gradio UI for interacting with the Anthropic API, Hume TTS API, and ElevenLabs TTS API.
 
-Users enter a prompt, which is processed using Claude by Anthropic to generate text.
-The text is then synthesized into speech using both Hume and ElevenLabs text-to-speech (TTS) APIs.
+Users enter a character description, which is processed using Claude by Anthropic to generate text.
+The text is then synthesized into speech using different TTS provider APIs.
 Users can compare the outputs and vote for their favorite in an interactive UI.
 """
 
@@ -19,19 +19,7 @@ import gradio as gr
 
 # Local Application Imports
 from src.config import AUDIO_DIR, logger
-from src.constants import (
-    ELEVENLABS,
-    HUME_AI,
-    OPTION_A,
-    OPTION_B,
-    PROMPT_MAX_LENGTH,
-    PROMPT_MIN_LENGTH,
-    SAMPLE_PROMPTS,
-    TROPHY_EMOJI,
-    TTS_PROVIDERS,
-    VOTE_FOR_OPTION_A,
-    VOTE_FOR_OPTION_B,
-)
+from src import constants
 from src.integrations import (
     AnthropicError,
     ElevenLabsError,
@@ -41,18 +29,18 @@ from src.integrations import (
     text_to_speech_with_hume,
 )
 from src.theme import CustomTheme
-from src.types import OptionMap
-from src.utils import validate_prompt_length
+from src.types import ComparisonType, OptionMap, VotingResults
+from src.utils import validate_character_description_length
 
 
 def generate_text(
-    prompt: str,
+    character_description: str,
 ) -> Tuple[Union[str, gr.update], gr.update]:
     """
-    Validates the prompt and generates text using Anthropic API.
+    Validates the character_description and generates text using Anthropic API.
 
     Args:
-        prompt (str): The user-provided text prompt.
+        character_description (str): The user-provided text for character description.
 
     Returns:
         Tuple containing:
@@ -63,13 +51,13 @@ def generate_text(
         gr.Error: On validation or API errors.
     """
     try:
-        validate_prompt_length(prompt, PROMPT_MAX_LENGTH, PROMPT_MIN_LENGTH)
+        validate_character_description_length(character_description)
    except ValueError as ve:
        logger.warning(f"Validation error: {ve}")
        raise gr.Error(str(ve))
 
     try:
-        generated_text = generate_text_with_claude(prompt)
+        generated_text = generate_text_with_claude(character_description)
         logger.info(f"Generated text ({len(generated_text)} characters).")
         return gr.update(value=generated_text), generated_text
     except AnthropicError as ae:
@@ -83,7 +71,7 @@ def generate_text(
 
 
 def text_to_speech(
-    prompt: str, text: str, generated_text_state: str
+    character_description: str, text: str, generated_text_state: str
 ) -> Tuple[gr.update, gr.update, dict, Union[str, None]]:
     """
     Synthesizes two text to speech outputs, loads the two audio players with the
@@ -92,7 +80,7 @@ def text_to_speech(
     - 50% chance to synthesize two Hume outputs.
 
     Args:
-        prompt (str): The original prompt.
+        character_description (str): The original character_description.
         text (str): The text to synthesize to speech.
 
     Returns:
@@ -110,41 +98,59 @@ def text_to_speech(
         raise gr.Error("Please generate or enter text to synthesize.")
 
     # Hume AI always included in comparison
-    provider_a = HUME_AI
+    provider_a = constants.HUME_AI
     # If not using generated text, then only compare Hume to Hume
-    provider_b = (
-        HUME_AI if text != generated_text_state else random.choice(TTS_PROVIDERS)
+    text_modified = text != generated_text_state
+    provider_b: constants.TTSProviderName = (
+        constants.HUME_AI if text_modified else random.choice(constants.TTS_PROVIDERS)
     )
 
     try:
         with ThreadPoolExecutor(max_workers=2) as executor:
-            future_audio_a = executor.submit(text_to_speech_with_hume, prompt, text)
+            future_audio_a = executor.submit(
+                text_to_speech_with_hume, character_description, text
+            )
 
             match provider_b:
-                case ELEVENLABS:
+                case constants.HUME_AI:
+                    comparison_type: ComparisonType = constants.HUME_TO_HUME
                     future_audio_b = executor.submit(
-                        text_to_speech_with_elevenlabs, prompt, text
+                        text_to_speech_with_hume, character_description, text
                    )
-                case HUME_AI:
+                case constants.ELEVENLABS:
+                    comparison_type: ComparisonType = constants.HUME_TO_ELEVENLABS
                     future_audio_b = executor.submit(
-                        text_to_speech_with_hume, prompt, text
+                        text_to_speech_with_elevenlabs, character_description, text
                     )
                 case _:
                     raise ValueError(f"Unsupported provider: {provider_b}")
 
-            audio_a = future_audio_a.result()
-            audio_b = future_audio_b.result()
+            generation_id_a, audio_a = future_audio_a.result()
+            generation_id_b, audio_b = future_audio_b.result()
 
-        options = [(audio_a, provider_a), (audio_b, provider_b)]
+        options = [
+            (provider_a, audio_a, generation_id_a),
+            (provider_b, audio_b, generation_id_b),
+        ]
         random.shuffle(options)
-        option_a_audio, option_b_audio = options[0][0], options[1][0]
-        options_map: OptionMap = {OPTION_A: options[0][1], OPTION_B: options[1][1]}
+        options_map: OptionMap = {
+            constants.OPTION_A: options[0][0],
+            constants.OPTION_B: options[1][0],
+        }
+        option_a_audio, option_b_audio = options[0][1], options[1][1]
+        option_a_generation_id, option_b_generation_id = options[0][2], options[1][2]
 
         return (
             gr.update(value=option_a_audio, visible=True, autoplay=True),
             gr.update(value=option_b_audio, visible=True),
             options_map,
             option_b_audio,
+            comparison_type,
+            option_a_generation_id,
+            option_b_generation_id,
+            text_modified,
+            text,
+            character_description,
        )
    except ElevenLabsError as ee:
        logger.error(f"ElevenLabsError while synthesizing speech from text: {str(ee)}")
@@ -162,7 +168,15 @@ def text_to_speech(
 
 
 def vote(
-    vote_submitted: bool, option_map: OptionMap, selected_button: str
+    vote_submitted: bool,
+    option_map: OptionMap,
+    selected_button: str,
+    comparison_type: ComparisonType,
+    option_a_generation_id: str,
+    option_b_generation_id: str,
+    text_modified: bool,
+    character_description: str,
+    text: str,
 ) -> Tuple[bool, gr.update, gr.update, gr.update]:
     """
     Handles user voting.
@@ -187,17 +201,35 @@ def vote(
     if not option_map or vote_submitted:
         return gr.skip(), gr.skip(), gr.skip(), gr.skip()
 
-    option_a_selected = selected_button == VOTE_FOR_OPTION_A
+    option_a_selected = selected_button == constants.VOTE_FOR_OPTION_A
     selected_option, other_option = (
-        (OPTION_A, OPTION_B) if option_a_selected else (OPTION_B, OPTION_A)
+        (constants.OPTION_A, constants.OPTION_B)
+        if option_a_selected
+        else (constants.OPTION_B, constants.OPTION_A)
     )
     selected_provider = option_map.get(selected_option)
     other_provider = option_map.get(other_option)
 
     # Build button labels, displaying the provider and voice name, appending the trophy emoji to the selected option.
-    selected_label = f"{selected_provider} {TROPHY_EMOJI}"
+    selected_label = f"{selected_provider} {constants.TROPHY_EMOJI}"
     other_label = f"{other_provider}"
 
+    # Report voting results to be persisted to results DB
+    voting_results: VotingResults = {
+        "comparison_type": comparison_type,
+        "winning_provider": selected_provider,
+        "winning_option": selected_option,
+        "option_a_provider": option_map.get(constants.OPTION_A),
+        "option_b_provider": option_map.get(constants.OPTION_B),
+        "option_a_generation_id": option_a_generation_id,
+        "option_b_generation_id": option_b_generation_id,
+        "character_description": character_description,
+        "text": text,
+        "is_custom_text": text_modified,
+    }
+    # TODO: Currently logging the results until we hook the API for writing results to DB
+    logger.info("Voting results:\n%s", json.dumps(voting_results, indent=4))
+
     return (
         True,
         (
@@ -231,8 +263,8 @@ def reset_ui() -> Tuple[gr.update, gr.update, gr.update, gr.update, None, None,
     return (
         gr.update(value=None),
         gr.update(value=None, autoplay=False),
-        gr.update(value=VOTE_FOR_OPTION_A, variant="secondary"),
-        gr.update(value=VOTE_FOR_OPTION_B, variant="secondary"),
+        gr.update(value=constants.VOTE_FOR_OPTION_A, variant="secondary"),
+        gr.update(value=constants.VOTE_FOR_OPTION_B, variant="secondary"),
        None,
        None,
        False,
@@ -240,34 +272,34 @@
 
 
 def build_input_section() -> Tuple[gr.Markdown, gr.Dropdown, gr.Textbox, gr.Button]:
-    """Builds the input section including instructions, sample prompt dropdown, prompt input, and generate button"""
+    """Builds the input section including instructions, sample character description dropdown, character description input, and generate button"""
     instructions = gr.Markdown(
         """
-        1. **Enter or Generate Text:** Type directly in the text box—or enter a prompt and click “Generate Text” to auto-populate. Edit as needed.
+        1. **Enter or Generate Text:** Type directly in the text box—or enter a character description and click “Generate Text” to auto-populate. Edit as needed.
         2. **Synthesize Speech:** Click “Synthesize Speech” to generate two audio outputs.
         3. **Listen & Compare:** Play back both audio options to hear the differences.
         4. **Vote for Your Favorite:** Click “Vote for Option A” or “Vote for Option B” to cast your vote.
         """
     )
-    sample_prompt_dropdown = gr.Dropdown(
-        choices=list(SAMPLE_PROMPTS.keys()),
-        label="Choose a sample prompt (or enter your own)",
+    sample_character_description_dropdown = gr.Dropdown(
+        choices=list(constants.SAMPLE_CHARACTER_DESCRIPTIONS.keys()),
+        label="Choose a sample character description (or enter your own)",
        value=None,
        interactive=True,
    )
-    prompt_input = gr.Textbox(
-        label="Prompt",
-        placeholder="Enter your prompt...",
+    character_description_input = gr.Textbox(
+        label="Character Description",
+        placeholder="Enter your character description to be used to generate text and a novel voice...",
        lines=3,
        max_lines=8,
-        max_length=PROMPT_MAX_LENGTH,
+        max_length=constants.CHARACTER_DESCRIPTION_MAX_LENGTH,
        show_copy_button=True,
    )
    generate_text_button = gr.Button("Generate text", variant="secondary")
    return (
        instructions,
-        sample_prompt_dropdown,
-        prompt_input,
+        sample_character_description_dropdown,
+        character_description_input,
        generate_text_button,
    )
 
@@ -283,20 +315,20 @@ def build_output_section() -> (
         autoscroll=False,
         lines=3,
         max_lines=8,
-        max_length=PROMPT_MAX_LENGTH,
+        max_length=constants.CHARACTER_DESCRIPTION_MAX_LENGTH,
        show_copy_button=True,
    )
    synthesize_speech_button = gr.Button("Synthesize speech", variant="primary")
    with gr.Row(equal_height=True):
        option_a_audio_player = gr.Audio(
-            label=OPTION_A, type="filepath", interactive=False
+            label=constants.OPTION_A, type="filepath", interactive=False
        )
        option_b_audio_player = gr.Audio(
-            label=OPTION_B, type="filepath", interactive=False
+            label=constants.OPTION_B, type="filepath", interactive=False
        )
    with gr.Row(equal_height=True):
-        vote_button_a = gr.Button(VOTE_FOR_OPTION_A, interactive=False)
-        vote_button_b = gr.Button(VOTE_FOR_OPTION_B, interactive=False)
+        vote_button_a = gr.Button(constants.VOTE_FOR_OPTION_A, interactive=False)
+        vote_button_b = gr.Button(constants.VOTE_FOR_OPTION_B, interactive=False)
    return (
        text_input,
        synthesize_speech_button,
@@ -325,9 +357,12 @@ def build_gradio_interface() -> gr.Blocks:
         gr.Markdown("# Expressive TTS Arena")
 
         # Build generate text section
-        (instructions, sample_prompt_dropdown, prompt_input, generate_text_button) = (
-            build_input_section()
-        )
+        (
+            instructions,
+            sample_character_description_dropdown,
+            character_description_input,
+            generate_text_button,
+        ) = build_input_section()
 
         # Build synthesize speech section
         (
@@ -341,6 +376,18 @@
 
         # --- UI state components ---
 
+        # Track text used for speech synthesis
+        text_state = gr.State("")
+        # Track character description used for text and voice generation
+        character_description_state = gr.State("")
+        # Track comparison type (which set of providers are being compared)
+        comparison_type_state = gr.State()
+        # Track generation ID for Option A
+        option_a_generation_id_state = gr.State()
+        # Track generation ID for Option B
+        option_b_generation_id_state = gr.State()
+        # Track whether text that was used was generated or modified/custom
+        text_modified_state = gr.State()
         # Track generated text state
         generated_text_state = gr.State("")
         # Track generated audio for option B for playing automatically after option 1 audio finishes
@@ -352,11 +399,11 @@
 
         # --- Register event handlers ---
 
-        # When a sample prompt is chosen, update the prompt textbox
-        sample_prompt_dropdown.change(
-            fn=lambda choice: SAMPLE_PROMPTS.get(choice, ""),
-            inputs=[sample_prompt_dropdown],
-            outputs=[prompt_input],
+        # When a sample character description is chosen, update the character description textbox
+        sample_character_description_dropdown.change(
+            fn=lambda choice: constants.SAMPLE_CHARACTER_DESCRIPTIONS.get(choice, ""),
+            inputs=[sample_character_description_dropdown],
+            outputs=[character_description_input],
        )
 
         # Generate text button click handler chain:
@@ -369,7 +416,7 @@
             outputs=[generate_text_button],
         ).then(
             fn=generate_text,
-            inputs=[prompt_input],
+            inputs=[character_description_input],
            outputs=[text_input, generated_text_state],
        ).then(
            fn=lambda: gr.update(interactive=True),
@@ -404,12 +451,18 @@
             ],
         ).then(
             fn=text_to_speech,
-            inputs=[prompt_input, text_input, generated_text_state],
+            inputs=[character_description_input, text_input, generated_text_state],
            outputs=[
                option_a_audio_player,
                option_b_audio_player,
                option_map_state,
                option_b_audio_state,
+                comparison_type_state,
+                option_a_generation_id_state,
+                option_b_generation_id_state,
+                text_modified_state,
+                text_state,
+                character_description_state,
            ],
        ).then(
            fn=lambda: (
@@ -430,6 +483,12 @@
                 vote_button_a,
                 vote_button_b,
                 synthesize_speech_button,
+                comparison_type_state,
+                option_a_generation_id_state,
+                option_b_generation_id_state,
+                text_modified_state,
+                character_description_state,
+                text_state,
            ],
        )
        vote_button_b.click(
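
Reviewer note: a minimal sketch of the new pairing logic in `text_to_speech`. After shuffling, index 0 of `options` always becomes Option A, so the voter cannot tell which provider produced which player. The file paths and generation IDs below are hypothetical placeholders, not real API output.

```python
import random

HUME_AI, ELEVENLABS = "Hume AI", "ElevenLabs"

# (provider, file_path, generation_id); values are illustrative only.
options = [
    (HUME_AI, "audio/a1b2.mp3", "gen-a1b2"),
    (ELEVENLABS, "audio/c3d4.mp3", None),  # ElevenLabs returns None for generation_id
]
random.shuffle(options)  # randomize which provider lands on Option A

options_map = {"Option A": options[0][0], "Option B": options[1][0]}
option_a_audio, option_b_audio = options[0][1], options[1][1]
option_a_generation_id, option_b_generation_id = options[0][2], options[1][2]
print(options_map)
```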
src/constants.py CHANGED
@@ -8,16 +8,19 @@ This module defines global constants used throughout the project.
 from typing import List
 
 # Third-Party Library Imports
-from src.types import OptionKey, TTSProviderName
+from src.types import ComparisonType, OptionKey, TTSProviderName
 
 
 # UI constants
 HUME_AI: TTSProviderName = "Hume AI"
 ELEVENLABS: TTSProviderName = "ElevenLabs"
-TTS_PROVIDERS: List[TTSProviderName]
+TTS_PROVIDERS: List[TTSProviderName] = ["Hume AI", "ElevenLabs"]
 
-PROMPT_MIN_LENGTH: int = 20
-PROMPT_MAX_LENGTH: int = 800
+HUME_TO_HUME: ComparisonType = "Hume AI - Hume AI"
+HUME_TO_ELEVENLABS: ComparisonType = "Hume AI - ElevenLabs"
+
+CHARACTER_DESCRIPTION_MIN_LENGTH: int = 20
+CHARACTER_DESCRIPTION_MAX_LENGTH: int = 800
 
 OPTION_A: OptionKey = "Option A"
 OPTION_B: OptionKey = "Option B"
@@ -26,41 +29,27 @@ VOTE_FOR_OPTION_A: str = "Vote for option A"
 VOTE_FOR_OPTION_B: str = "Vote for option B"
 
 
-# A collection of pre-defined prompts categorized by theme, used to provide users with
-# inspiration for generating creative text for expressive TTS.
-SAMPLE_PROMPTS: dict = {
-    "🚀 Dramatic Monologue (Stranded Astronaut)": (
-        "Create a poignant final transmission from a lone astronaut on Mars to mission control. "
-        "Voice: low, measured pace, with subtle tremors of emotion. Content should move from "
-        "awe-struck description of the Martian sunset to peaceful acceptance. Include natural "
-        "pauses for emotional weight. Keep the tone intimate and contemplative, as if speaking "
-        "softly into a radio mic. End with dignified finality."
+# A collection of pre-defined character descriptions categorized by theme, used to provide users with
+# inspiration for generating creative text for expressive TTS, and generating novel voices.
+SAMPLE_CHARACTER_DESCRIPTIONS: dict = {
+    "🚀 Stranded Astronaut": (
+        "A lone astronaut whose voice mirrors the silent vastness of space—a low, steady tone imbued with isolation and quiet wonder. "
+        "It carries the measured resolve of someone sending a final transmission, with an undercurrent of wistful melancholy."
    ),
-    "📜 Poetic Sonnet (The Passage of Time)": (
-        "Craft a sonnet about time's flow, suitable for measured, resonant delivery. "
-        "Voice: clear, rhythmic, with careful emphasis on key metaphors. Flow from quiet "
-        "reflection to profound realization. Include strategic pauses between quatrains. "
-        "Balance crisp consonants with flowing vowels for musical quality. Maintain consistent "
-        "meter for natural speech rhythm."
+    "📜 Timeless Poet": (
+        "An ageless poet with a voice that flows like gentle verse—a soft, reflective tone marked by deliberate pauses. "
+        "It speaks with the measured cadence of classic sonnets, evoking both the fragile beauty of time and heartfelt introspection."
    ),
-    "🐱 Whimsical Children's Story (Talking Cat)": (
-        "Tell a playful tale of a curious cat's magical library adventure. "
-        "Voice: bright, energetic, with clear character distinctions. Mix whispered "
-        "conspiracies with excited discoveries. Include dramatic pauses for suspense "
-        "and giggles. Use bouncy rhythm for action scenes, slower pace for wonder. "
-        "End with warm, gentle closure perfect for bedtime."
+    "🐱 Whimsical Feline": (
+        "A mischievous cat whose voice is playful yet mysterious—light, quick-witted, and infused with an enchanting purr. "
+        "It hints at secret adventures and hidden charm, balancing exuberance with a subtle, smooth allure."
    ),
-    "🔥 Intense Speech (Freedom & Justice)": (
-        "Deliver a rousing resistance speech that builds from quiet determination to powerful resolve. "
-        "Voice: start controlled and intense, rise to passionate crescendo. Include strategic "
-        "pauses for impact. Mix shorter, punchy phrases with flowing calls to action. "
-        "Use strong consonants and open vowels for projection. End with unshakeable conviction."
+    "🔥 Revolutionary Orator": (
+        "A defiant orator whose voice builds from quiet determination to passionate fervor—a clear, commanding tone that resonates with conviction. "
+        "It starts measured and resolute, then rises to a crescendo of fervor, punctuated by deliberate pauses that emphasize each rallying cry."
    ),
-    "👻 Mysterious Horror Scene (Haunted Lighthouse)": (
-        "Narrate a spine-chilling lighthouse encounter that escalates from unease to revelation. "
-        "Voice: hushed, tense, with subtle dynamic range. Mix whispers with clearer tones. "
-        "Include extended pauses for tension. Use sibilants and soft consonants for "
-        "atmospheric effect. Build rhythm with the lighthouse's beam pattern. End with haunting "
-        "revelation."
+    "👻 Haunted Keeper": (
+        "A solitary lighthouse keeper with a voice that carries the weight of forgotten storms—a soft, measured tone with an echo of sorrow. "
+        "It speaks as if whispering long-held secrets in the dark, blending quiet melancholy with an air of enduring mystery."
    ),
 }
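
Reviewer note: a short sketch of how the new constants are meant to combine, mirroring the `match` statement in `src/app.py` (assumes the `src` package is importable):

```python
import random

from src.constants import HUME_AI, HUME_TO_ELEVENLABS, HUME_TO_HUME, TTS_PROVIDERS

# Provider B is drawn at random; the matchup is labeled with a ComparisonType value.
provider_b = random.choice(TTS_PROVIDERS)
comparison_type = HUME_TO_HUME if provider_b == HUME_AI else HUME_TO_ELEVENLABS
print(provider_b, "->", comparison_type)
```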
src/integrations/anthropic_api.py CHANGED
@@ -98,6 +98,25 @@ Remember: A shorter, complete response is ALWAYS better than a longer, truncated
         """
         return Anthropic(api_key=self.api_key)
 
+    def build_expressive_prompt(self, character_description: str) -> str:
+        """
+        Constructs and returns a prompt based solely on the provided voice description.
+        The returned prompt is intended to instruct Claude to generate expressive text from a character,
+        capturing the character's personality and emotional nuance, without including the system prompt.
+
+        Args:
+            character_description (str): A description of the character's voice and persona.
+
+        Returns:
+            str: The prompt to be passed to the Anthropic API.
+        """
+        prompt = (
+            f"Character Description: {character_description}\n\n"
+            "Based on the above character description, please generate a line of dialogue that captures the character's unique personality, emotional depth, and distinctive tone. "
+            "The response should sound like something the character would naturally say, reflecting their background and emotional state, and be fully developed for text-to-speech synthesis."
+        )
+        return prompt
+
 
 class AnthropicError(Exception):
     """Custom exception for errors related to the Anthropic API."""
@@ -118,12 +137,12 @@ anthropic_config = AnthropicConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True,
 )
-def generate_text_with_claude(prompt: str) -> str:
+def generate_text_with_claude(character_description: str) -> str:
     """
     Generates text using Claude (Anthropic LLM) via the Anthropic SDK.
 
     Args:
-        prompt (str): The input prompt for Claude.
+        character_description (str): The input character description used to assist with generating text with Claude.
 
     Returns:
         str: The generated text.
@@ -131,8 +150,10 @@ def generate_text_with_claude(prompt: str) -> str:
     Raises:
         AnthropicError: If there is an error communicating with the Anthropic API.
     """
+    # Build prompt for claude with character description
+    prompt = anthropic_config.build_expressive_prompt(character_description)
     logger.debug(
-        f"Generating text with Claude. Prompt length: {len(prompt)} characters."
+        f"Generating text with Claude. Character description length: {len(prompt)} characters."
    )
 
    response = None
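
Reviewer note: a hypothetical call to the new prompt builder via the module-level `anthropic_config` instance shown in the hunk context. The character description string is invented for illustration; constructing `AnthropicConfig` is assumed to require the usual API-key environment configuration.

```python
from src.integrations.anthropic_api import anthropic_config

# Hypothetical character description for illustration only.
prompt = anthropic_config.build_expressive_prompt(
    "A weary night-shift radio host with a warm, gravelly voice and dry humor."
)
# The prompt embeds the description and asks Claude for a single line of dialogue;
# generate_text_with_claude() now builds it the same way before calling the API.
print(prompt)
```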
src/integrations/elevenlabs_api.py CHANGED
@@ -76,16 +76,18 @@ elevenlabs_config = ElevenLabsConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True,
 )
-def text_to_speech_with_elevenlabs(prompt: str, text: str) -> bytes:
+def text_to_speech_with_elevenlabs(character_description: str, text: str) -> bytes:
     """
     Synthesizes text to speech using the ElevenLabs TTS API, processes audio data, and writes audio to a file.
 
     Args:
-        prompt (str): The original user prompt used as the voice description.
+        character_description (str): The original user character description used as the voice description.
         text (str): The text to be synthesized to speech.
 
     Returns:
-        str: The relative path for the file the synthesized audio was written to.
+        Tuple[None, str]: A tuple containing:
+            - generation_id (None): We do not record the generation ID for ElevenLabs, but return None for uniformity across TTS integrations
+            - file_path (str): The relative path to the file where the synthesized audio was saved.
 
     Raises:
         ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
@@ -94,12 +96,10 @@ def text_to_speech_with_elevenlabs(prompt: str, text: str) -> bytes:
         f"Synthesizing speech with ElevenLabs. Text length: {len(text)} characters."
     )
 
-    request_body = {"text": text, "voice_description": prompt}
-
     try:
         # Synthesize speech using the ElevenLabs SDK
         response = elevenlabs_config.client.text_to_voice.create_previews(
-            voice_description=prompt,
+            voice_description=character_description,
            text=text,
            output_format=elevenlabs_config.output_format,
        )
@@ -117,7 +117,7 @@
         filename = f"{generated_voice_id}.mp3"
 
         # Write audio to file and return the relative path
-        return save_base64_audio_to_file(base64_audio, filename)
+        return None, save_base64_audio_to_file(base64_audio, filename)
 
     except Exception as e:
         logger.exception(f"Error synthesizing speech with ElevenLabs: {e}")
src/integrations/hume_api.py CHANGED
@@ -86,25 +86,29 @@ hume_config = HumeConfig()
     after=after_log(logger, logging.DEBUG),
     reraise=True,
 )
-def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
+def text_to_speech_with_hume(character_description: str, text: str) -> bytes:
     """
     Synthesizes text to speech using the Hume TTS API, processes audio data, and writes audio to a file.
 
     Args:
-        prompt (str): The original user prompt to use as the description for generating the voice.
+        character_description (str): The original user character description to use as the description for generating the voice.
         text (str): The generated text to be converted to speech.
 
     Returns:
-        str: The relative path for the file the synthesized audio was written to.
+        Tuple[str, str]: A tuple containing:
+            - generation_id (str): The generation ID returned from the Hume API.
+            - file_path (str): The relative path to the file where the synthesized audio was saved.
 
     Raises:
         HumeError: If there is an error communicating with the Hume TTS API or parsing the response.
     """
     logger.debug(
-        f"Processing TTS with Hume. Prompt length: {len(prompt)} characters. Text length: {len(text)} characters."
+        f"Processing TTS with Hume. Prompt length: {len(character_description)} characters. Text length: {len(text)} characters."
    )
 
-    request_body = {"utterances": [{"text": text, "description": prompt}]}
+    request_body = {
+        "utterances": [{"text": text, "description": character_description}]
+    }
 
     try:
         # Synthesize speech using the Hume TTS API
@@ -129,7 +133,7 @@ def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
         filename = f"{generation_id}.mp3"
 
         # Write audio to file and return the relative path
-        return save_base64_audio_to_file(base64_audio, filename)
+        return generation_id, save_base64_audio_to_file(base64_audio, filename)
 
     except Exception as e:
         logger.exception(f"Error synthesizing speech with Hume: {e}")
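
Reviewer note: both TTS integrations now return a `(generation_id, file_path)` tuple (Hume with a real ID, ElevenLabs with `None`), which is what lets `text_to_speech` in `src/app.py` unpack them uniformly. Note that the `-> bytes` annotations on both functions are left unchanged by this commit even though the return type is now a tuple. A caller-side sketch, with hypothetical inputs; running it performs real API calls:

```python
from src.integrations import text_to_speech_with_elevenlabs, text_to_speech_with_hume

# Hypothetical inputs for illustration only.
character_description = "A mischievous cat with a playful, purring voice."
text = "Shall we see what secrets the library holds tonight?"

# Uniform unpacking across providers; generation_id_b is None for ElevenLabs.
generation_id_a, audio_a = text_to_speech_with_hume(character_description, text)
generation_id_b, audio_b = text_to_speech_with_elevenlabs(character_description, text)
```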
src/types.py CHANGED
@@ -27,7 +27,7 @@ OptionMap = Dict[OptionKey, TTSProviderName]
 class VotingResults(TypedDict):
     """Voting results data structure representing values we want to persist to the votes DB"""
 
-    comparison_type: str
+    comparison_type: ComparisonType
     winning_provider: TTSProviderName
     winning_option: OptionKey
     option_a_provider: TTSProviderName
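
Reviewer note: the `ComparisonType` alias itself is not shown in this hunk. A definition consistent with the values assigned in `src/constants.py` would look like the sketch below; these declarations are illustrative, not necessarily the project's exact ones (only the `OptionMap` line appears in the hunk header above).

```python
from typing import Dict, Literal

# Illustrative alias definitions consistent with src/constants.py.
TTSProviderName = Literal["Hume AI", "ElevenLabs"]
OptionKey = Literal["Option A", "Option B"]
ComparisonType = Literal["Hume AI - Hume AI", "Hume AI - ElevenLabs"]
OptionMap = Dict[OptionKey, TTSProviderName]  # shown in the hunk header above
```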
src/utils.py CHANGED
@@ -7,7 +7,7 @@ These functions provide reusable logic to simplify code in other modules.
 Functions:
 - truncate_text: Truncates a string to a specified length with ellipses. (used for logging)
 - validate_env_var: Ensures the presence of a specific environment variable and retrieves its value.
-- validate_prompt_length: Ensures that a prompt does not exceed the specified minimum or maximum length.
+- validate_character_description_length: Ensures that a voice description does not exceed the specified minimum or maximum length.
 """
 
 # Standard Library Imports
@@ -16,6 +16,10 @@ import os
 
 # Local Application Imports
 from src.config import AUDIO_DIR, logger
+from src.constants import (
+    CHARACTER_DESCRIPTION_MIN_LENGTH,
+    CHARACTER_DESCRIPTION_MAX_LENGTH,
+)
 
 
 def truncate_text(text: str, max_length: int = 50) -> str:
@@ -80,42 +84,42 @@ def validate_env_var(var_name: str) -> str:
     return value
 
 
-def validate_prompt_length(prompt: str, max_length: int, min_length: int) -> None:
+def validate_character_description_length(character_description: str) -> None:
     """
-    Validates that a prompt is within specified minimum and maximum length limits.
+    Validates that a voice description is within specified minimum and maximum length limits.
 
     Args:
-        prompt (str): The input prompt to validate.
-        max_length (int): The maximum allowed length for the prompt.
-        min_length (int): The minimum required length for the prompt.
+        character_description (str): The input character description to validate.
 
     Raises:
-        ValueError: If the prompt is empty, too short, or exceeds max_length.
+        ValueError: If the character description is empty, too short, or exceeds max length.
 
     Example:
-        >>> validate_prompt_length("Hello world", max_length=500, min_length=5)
+        >>> validate_character_description_length("This is a character description.")
         # Passes validation
 
-        >>> validate_prompt_length("", max_length=300, min_length=10)
-        # Raises ValueError: "Prompt must be at least 10 characters long."
+        >>> validate_character_description_length("")
+        # Raises ValueError: "Voice Description must be at least 20 characters long."
     """
-    stripped_prompt = prompt.strip()
-    prompt_length = len(stripped_prompt)
+    stripped_character_description = character_description.strip()
+    character_description_length = len(stripped_character_description)
 
-    logger.debug(f"Prompt length being validated: {prompt_length} characters")
+    logger.debug(
+        f"Voice description length being validated: {character_description_length} characters"
+    )
 
-    if prompt_length < min_length:
+    if character_description_length < CHARACTER_DESCRIPTION_MIN_LENGTH:
        raise ValueError(
-            f"Your prompt is too short. Please enter at least {min_length} characters. "
-            f"(Current length: {prompt_length})"
+            f"Your character description is too short. Please enter at least {CHARACTER_DESCRIPTION_MIN_LENGTH} characters. "
+            f"(Current length: {character_description_length})"
        )
-    if prompt_length > max_length:
+    if character_description_length > CHARACTER_DESCRIPTION_MAX_LENGTH:
        raise ValueError(
-            f"Your prompt is too long. Please limit it to {max_length} characters. "
-            f"(Current length: {prompt_length})"
+            f"Your character description is too long. Please limit it to {CHARACTER_DESCRIPTION_MAX_LENGTH} characters. "
+            f"(Current length: {character_description_length})"
        )
    logger.debug(
-        f"Prompt length validation passed for prompt: {truncate_text(stripped_prompt)}"
+        f"Character description length validation passed for character_description: {truncate_text(stripped_character_description)}"
    )
 
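
Reviewer note: a usage sketch of the renamed validator. The length bounds now come from `src.constants` (20 and 800 characters) instead of being passed as arguments:

```python
from src.utils import validate_character_description_length

try:
    validate_character_description_length("Too short")  # 9 characters, below the 20 minimum
except ValueError as ve:
    print(ve)  # "Your character description is too short. Please enter at least 20 characters. ..."
```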