zach committed on
Commit
0e508c8
·
1 Parent(s): 83c6aee

Refactor tts integration functions to write audio to file and return file path, audio players to play mp3 file written to temp folder, fix audioplayer loading, remove unused imports

Browse files
.gitignore CHANGED
@@ -38,4 +38,4 @@ Thumbs.db
38
  *.cache
39
 
40
  # Temp files
41
- src/static/audio/
 
38
  *.cache
39
 
40
  # Temp files
41
+ static/audio/*
src/app.py CHANGED
@@ -11,13 +11,14 @@ Users can compare the outputs and vote for their favorite in an interactive UI.
11
  # Standard Library Imports
12
  from concurrent.futures import ThreadPoolExecutor
13
  import random
 
14
  from typing import Union, Tuple
15
 
16
  # Third-Party Library Imports
17
  import gradio as gr
18
 
19
  # Local Application Imports
20
- from src.config import logger
21
  from src.constants import (
22
  ELEVENLABS,
23
  HUME_AI,
@@ -27,7 +28,6 @@ from src.constants import (
27
  PROMPT_MIN_LENGTH,
28
  SAMPLE_PROMPTS,
29
  TROPHY_EMOJI,
30
- UNKNOWN_PROVIDER,
31
  VOTE_FOR_OPTION_A,
32
  VOTE_FOR_OPTION_B,
33
  )
@@ -41,7 +41,7 @@ from src.integrations import (
41
  )
42
  from src.theme import CustomTheme
43
  from src.types import OptionMap
44
- from src.utils import truncate_text, validate_prompt_length
45
 
46
 
47
  def generate_text(
@@ -130,13 +130,7 @@ def text_to_speech(
130
  audio_a = future_audio_a.result()
131
  audio_b = future_audio_b.result()
132
 
133
- logger.info(
134
- f"TTS generated: {provider_a}={len(audio_a)} bytes, {provider_b}={len(audio_b)} bytes"
135
- )
136
- options = [
137
- (audio_a, provider_a),
138
- (audio_b, provider_b),
139
- ]
140
  random.shuffle(options)
141
  option_a_audio, option_b_audio = options[0][0], options[1][0]
142
  options_map: OptionMap = {OPTION_A: options[0][1], OPTION_B: options[1][1]}
@@ -444,16 +438,11 @@ def build_gradio_interface() -> gr.Blocks:
444
  ],
445
  )
446
 
447
- # Auto-play second audio after first finishes (Workaround to play audio back-to-back)
448
- # Audio player A stop event handler chain:
449
- # 1. Clear the audio player A
450
- # 2. Load audio player A with audio and set auto play to True
451
  option_a_audio_player.stop(
452
- fn=lambda _: gr.update(value=None),
453
- inputs=[],
454
- outputs=[option_b_audio_player],
455
- ).then(
456
- fn=lambda audio: gr.update(value=audio, autoplay=True),
457
  inputs=[option_b_audio_state],
458
  outputs=[option_b_audio_player],
459
  )
@@ -476,4 +465,4 @@ def build_gradio_interface() -> gr.Blocks:
476
  if __name__ == "__main__":
477
  logger.info("Launching TTS Arena Gradio app...")
478
  demo = build_gradio_interface()
479
- demo.launch()
 
11
  # Standard Library Imports
12
  from concurrent.futures import ThreadPoolExecutor
13
  import random
14
+ import time
15
  from typing import Union, Tuple
16
 
17
  # Third-Party Library Imports
18
  import gradio as gr
19
 
20
  # Local Application Imports
21
+ from src.config import AUDIO_DIR, logger
22
  from src.constants import (
23
  ELEVENLABS,
24
  HUME_AI,
 
28
  PROMPT_MIN_LENGTH,
29
  SAMPLE_PROMPTS,
30
  TROPHY_EMOJI,
 
31
  VOTE_FOR_OPTION_A,
32
  VOTE_FOR_OPTION_B,
33
  )
 
41
  )
42
  from src.theme import CustomTheme
43
  from src.types import OptionMap
44
+ from src.utils import validate_prompt_length
45
 
46
 
47
  def generate_text(
 
130
  audio_a = future_audio_a.result()
131
  audio_b = future_audio_b.result()
132
 
133
+ options = [(audio_a, provider_a), (audio_b, provider_b)]
 
 
 
 
 
 
134
  random.shuffle(options)
135
  option_a_audio, option_b_audio = options[0][0], options[1][0]
136
  options_map: OptionMap = {OPTION_A: options[0][1], OPTION_B: options[1][1]}
 
438
  ],
439
  )
440
 
441
+ # Reload audio player B with audio and set autoplay to True (workaround to play audio back-to-back)
 
 
 
442
  option_a_audio_player.stop(
443
+ fn=lambda current_audio_path: gr.update(
444
+ value=f"{current_audio_path}?t={int(time.time())}", autoplay=True
445
+ ),
 
 
446
  inputs=[option_b_audio_state],
447
  outputs=[option_b_audio_player],
448
  )
 
465
  if __name__ == "__main__":
466
  logger.info("Launching TTS Arena Gradio app...")
467
  demo = build_gradio_interface()
468
+ demo.launch(allowed_paths=[AUDIO_DIR])
src/config.py CHANGED
@@ -35,6 +35,11 @@ logging.basicConfig(
35
  )
36
  logger: logging.Logger = logging.getLogger("tts_arena")
37
  logger.info(f'Debug mode is {"enabled" if DEBUG else "disabled"}.')
38
-
39
  if DEBUG:
40
  logger.debug(f"DEBUG mode enabled.")
 
 
 
 
 
 
 
35
  )
36
  logger: logging.Logger = logging.getLogger("tts_arena")
37
  logger.info(f'Debug mode is {"enabled" if DEBUG else "disabled"}.')
 
38
  if DEBUG:
39
  logger.debug(f"DEBUG mode enabled.")
40
+
41
+
42
+ # Define the directory for audio files relative to the project root
43
+ AUDIO_DIR = os.path.join(os.getcwd(), "static", "audio")
44
+ os.makedirs(AUDIO_DIR, exist_ok=True)
45
+ logger.info(f"Audio directory set to {AUDIO_DIR}")
src/constants.py CHANGED
@@ -11,7 +11,6 @@ from src.types import OptionKey, TTSProviderName
11
  # UI constants
12
  HUME_AI: TTSProviderName = "Hume AI"
13
  ELEVENLABS: TTSProviderName = "ElevenLabs"
14
- UNKNOWN_PROVIDER: TTSProviderName = "Unknown"
15
 
16
  PROMPT_MIN_LENGTH: int = 20
17
  PROMPT_MAX_LENGTH: int = 800
 
11
  # UI constants
12
  HUME_AI: TTSProviderName = "Hume AI"
13
  ELEVENLABS: TTSProviderName = "ElevenLabs"
 
14
 
15
  PROMPT_MIN_LENGTH: int = 20
16
  PROMPT_MAX_LENGTH: int = 800
src/integrations/anthropic_api.py CHANGED
@@ -40,10 +40,23 @@ class AnthropicConfig:
40
  api_key: str = validate_env_var("ANTHROPIC_API_KEY")
41
  model: ModelParam = "claude-3-5-sonnet-latest"
42
  max_tokens: int = 150
43
- system_prompt: str = f"""You are an expert at generating micro-content optimized for text-to-speech synthesis. Your absolute priority is delivering complete, untruncated responses within strict length limits.
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  CRITICAL LENGTH CONSTRAINTS:
45
 
46
- Maximum length: {max_tokens} tokens (approximately 400 characters)
47
  You MUST complete all thoughts and sentences
48
  Responses should be 25% shorter than you initially plan
49
  Never exceed 400 characters total
@@ -73,17 +86,7 @@ Resolution (75-100 characters)
73
 
74
  MANDATORY: If you find yourself reaching 300 characters, immediately begin your conclusion regardless of where you are in the narrative.
75
  Remember: A shorter, complete response is ALWAYS better than a longer, truncated one."""
76
-
77
- def __post_init__(self):
78
- # Validate that required attributes are set
79
- if not self.api_key:
80
- raise ValueError("Anthropic API key is not set.")
81
- if not self.model:
82
- raise ValueError("Anthropic Model is not set.")
83
- if not self.max_tokens:
84
- raise ValueError("Anthropic Max Tokens is not set.")
85
- if not self.system_prompt:
86
- raise ValueError("Anthropic System Prompt is not set.")
87
 
88
  @property
89
  def client(self) -> Anthropic:
 
40
  api_key: str = validate_env_var("ANTHROPIC_API_KEY")
41
  model: ModelParam = "claude-3-5-sonnet-latest"
42
  max_tokens: int = 150
43
+ system_prompt: Optional[str] = (
44
+ None # system prompt is set post initialization, since self.max_tokens is leveraged in the prompt.
45
+ )
46
+
47
+ def __post_init__(self):
48
+ # Validate that required attributes are set
49
+ if not self.api_key:
50
+ raise ValueError("Anthropic API key is not set.")
51
+ if not self.model:
52
+ raise ValueError("Anthropic Model is not set.")
53
+ if not self.max_tokens:
54
+ raise ValueError("Anthropic Max Tokens is not set.")
55
+ if self.system_prompt is None:
56
+ system_prompt: str = f"""You are an expert at generating micro-content optimized for text-to-speech synthesis. Your absolute priority is delivering complete, untruncated responses within strict length limits.
57
  CRITICAL LENGTH CONSTRAINTS:
58
 
59
+ Maximum length: {self.max_tokens} tokens (approximately 400 characters)
60
  You MUST complete all thoughts and sentences
61
  Responses should be 25% shorter than you initially plan
62
  Never exceed 400 characters total
 
86
 
87
  MANDATORY: If you find yourself reaching 300 characters, immediately begin your conclusion regardless of where you are in the narrative.
88
  Remember: A shorter, complete response is ALWAYS better than a longer, truncated one."""
89
+ object.__setattr__(self, "system_prompt", system_prompt)
 
 
 
 
 
 
 
 
 
 
90
 
91
  @property
92
  def client(self) -> Anthropic:
src/integrations/elevenlabs_api.py CHANGED
@@ -20,20 +20,18 @@ Functions:
20
  """
21
 
22
  # Standard Library Imports
23
- import base64
24
  from dataclasses import dataclass
25
- from enum import Enum
26
  import logging
27
  import random
28
- from typing import Literal, Optional, Tuple
29
 
30
  # Third-Party Library Imports
31
- from elevenlabs import ElevenLabs
32
  from tenacity import retry, stop_after_attempt, wait_fixed, before_log, after_log
33
 
34
  # Local Application Imports
35
  from src.config import logger
36
- from src.utils import validate_env_var
37
 
38
 
39
  @dataclass(frozen=True)
@@ -41,6 +39,7 @@ class ElevenLabsConfig:
41
  """Immutable configuration for interacting with the ElevenLabs TTS API."""
42
 
43
  api_key: str = validate_env_var("ELEVENLABS_API_KEY")
 
44
 
45
  def __post_init__(self):
46
  # Validate that required attributes are set
@@ -79,14 +78,14 @@ elevenlabs_config = ElevenLabsConfig()
79
  )
80
  def text_to_speech_with_elevenlabs(prompt: str, text: str) -> bytes:
81
  """
82
- Synthesizes text to speech using the ElevenLabs TTS API.
83
 
84
  Args:
85
  prompt (str): The original user prompt used as the voice description.
86
  text (str): The text to be synthesized to speech.
87
 
88
  Returns:
89
- bytes: The raw binary audio data for playback.
90
 
91
  Raises:
92
  ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
@@ -102,6 +101,7 @@ def text_to_speech_with_elevenlabs(prompt: str, text: str) -> bytes:
102
  response = elevenlabs_config.client.text_to_voice.create_previews(
103
  voice_description=prompt,
104
  text=text,
 
105
  )
106
 
107
  previews = response.previews
@@ -110,10 +110,14 @@ def text_to_speech_with_elevenlabs(prompt: str, text: str) -> bytes:
110
  logger.error(msg)
111
  raise ElevenLabsError(message=msg)
112
 
 
113
  preview = random.choice(previews)
 
114
  base64_audio = preview.audio_base_64
115
- audio = base64.b64decode(base64_audio)
116
- return audio
 
 
117
 
118
  except Exception as e:
119
  logger.exception(f"Error synthesizing speech with ElevenLabs: {e}")
 
20
  """
21
 
22
  # Standard Library Imports
 
23
  from dataclasses import dataclass
 
24
  import logging
25
  import random
26
+ from typing import Optional
27
 
28
  # Third-Party Library Imports
29
+ from elevenlabs import ElevenLabs, TextToVoiceCreatePreviewsRequestOutputFormat
30
  from tenacity import retry, stop_after_attempt, wait_fixed, before_log, after_log
31
 
32
  # Local Application Imports
33
  from src.config import logger
34
+ from src.utils import save_base64_audio_to_file, validate_env_var
35
 
36
 
37
  @dataclass(frozen=True)
 
39
  """Immutable configuration for interacting with the ElevenLabs TTS API."""
40
 
41
  api_key: str = validate_env_var("ELEVENLABS_API_KEY")
42
+ output_format: TextToVoiceCreatePreviewsRequestOutputFormat = "mp3_44100_128"
43
 
44
  def __post_init__(self):
45
  # Validate that required attributes are set
 
78
  )
79
  def text_to_speech_with_elevenlabs(prompt: str, text: str) -> bytes:
80
  """
81
+ Synthesizes text to speech using the ElevenLabs TTS API, processes audio data, and writes audio to a file.
82
 
83
  Args:
84
  prompt (str): The original user prompt used as the voice description.
85
  text (str): The text to be synthesized to speech.
86
 
87
  Returns:
88
+ str: The relative path for the file the synthesized audio was written to.
89
 
90
  Raises:
91
  ElevenLabsError: If there is an error communicating with the ElevenLabs API or processing the response.
 
101
  response = elevenlabs_config.client.text_to_voice.create_previews(
102
  voice_description=prompt,
103
  text=text,
104
+ output_format=elevenlabs_config.output_format,
105
  )
106
 
107
  previews = response.previews
 
110
  logger.error(msg)
111
  raise ElevenLabsError(message=msg)
112
 
113
+ # Extract the base64 encoded audio and generated voice ID from the preview
114
  preview = random.choice(previews)
115
+ generated_voice_id = preview.generated_voice_id
116
  base64_audio = preview.audio_base_64
117
+ filename = f"{generated_voice_id}.mp3"
118
+
119
+ # Write audio to file and return the relative path
120
+ return save_base64_audio_to_file(base64_audio, filename)
121
 
122
  except Exception as e:
123
  logger.exception(f"Error synthesizing speech with ElevenLabs: {e}")
src/integrations/hume_api.py CHANGED
@@ -19,11 +19,11 @@ Functions:
19
  """
20
 
21
  # Standard Library Imports
22
- import base64
23
  from dataclasses import dataclass
24
  import logging
 
25
  import random
26
- from typing import List, Literal, Optional, Tuple
27
 
28
  # Third-Party Library Imports
29
  import requests
@@ -31,7 +31,11 @@ from tenacity import retry, stop_after_attempt, wait_fixed, before_log, after_lo
31
 
32
  # Local Application Imports
33
  from src.config import logger
34
- from src.utils import validate_env_var, truncate_text
 
 
 
 
35
 
36
 
37
  @dataclass(frozen=True)
@@ -41,6 +45,7 @@ class HumeConfig:
41
  api_key: str = validate_env_var("HUME_API_KEY")
42
  url: str = "https://test-api.hume.ai/v0/tts/octave"
43
  headers: dict = None
 
44
 
45
  def __post_init__(self):
46
  # Validate required attributes
@@ -48,6 +53,8 @@ class HumeConfig:
48
  raise ValueError("Hume API key is not set.")
49
  if not self.url:
50
  raise ValueError("Hume TTS endpoint URL is not set.")
 
 
51
 
52
  # Set headers dynamically after validation
53
  object.__setattr__(
@@ -81,14 +88,14 @@ hume_config = HumeConfig()
81
  )
82
  def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
83
  """
84
- Synthesizes text to speech using the Hume TTS API and processes raw binary audio data.
85
 
86
  Args:
87
  prompt (str): The original user prompt to use as the description for generating the voice.
88
  text (str): The generated text to be converted to speech.
89
 
90
  Returns:
91
- bytes: The raw binary audio data for playback.
92
 
93
  Raises:
94
  HumeError: If there is an error communicating with the Hume TTS API or parsing the response.
@@ -108,24 +115,25 @@ def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
108
  )
109
  response.raise_for_status()
110
  response_data = response.json()
111
- except requests.RequestException as re:
112
- request_error_msg = f"Error communicating with Hume TTS API: {re}"
113
- logger.exception(request_error_msg)
114
- raise HumeError(request_error_msg) from re
115
 
116
- try:
117
- # Safely extract the generation result from the response JSON
118
- generations = response_data.get("generations", [])
119
  if not generations:
120
- logger.error("Missing 'audio' data in the response.")
121
- raise HumeError("Missing audio data in response from Hume TTS API")
 
 
 
122
  generation = generations[0]
 
123
  base64_audio = generation.get("audio")
124
- # Decode base64 encoded audio
125
- audio = base64.b64decode(base64_audio)
126
- except (KeyError, TypeError, base64.binascii.Error) as ae:
127
- logger.exception(f"Error processing audio data: {ae}")
128
- raise HumeError(f"Error processing audio data from Hume TTS API: {ae}") from ae
129
-
130
- logger.info(f"Received audio data from Hume ({len(audio)} bytes).")
131
- return audio
 
 
 
 
19
  """
20
 
21
  # Standard Library Imports
 
22
  from dataclasses import dataclass
23
  import logging
24
+ import os
25
  import random
26
+ from typing import Literal, Optional
27
 
28
  # Third-Party Library Imports
29
  import requests
 
31
 
32
  # Local Application Imports
33
  from src.config import logger
34
+ from src.utils import save_base64_audio_to_file, validate_env_var
35
+
36
+
37
+ HumeSupportedFileFormat = Literal["mp3", "pcm", "wav"]
38
+ """ Support audio file formats for the Hume TTS API"""
39
 
40
 
41
  @dataclass(frozen=True)
 
45
  api_key: str = validate_env_var("HUME_API_KEY")
46
  url: str = "https://test-api.hume.ai/v0/tts/octave"
47
  headers: dict = None
48
+ file_format: HumeSupportedFileFormat = "mp3"
49
 
50
  def __post_init__(self):
51
  # Validate required attributes
 
53
  raise ValueError("Hume API key is not set.")
54
  if not self.url:
55
  raise ValueError("Hume TTS endpoint URL is not set.")
56
+ if not self.file_format:
57
+ raise ValueError("Hume TTS file format is not set.")
58
 
59
  # Set headers dynamically after validation
60
  object.__setattr__(
 
88
  )
89
  def text_to_speech_with_hume(prompt: str, text: str) -> bytes:
90
  """
91
+ Synthesizes text to speech using the Hume TTS API, processes audio data, and writes audio to a file.
92
 
93
  Args:
94
  prompt (str): The original user prompt to use as the description for generating the voice.
95
  text (str): The generated text to be converted to speech.
96
 
97
  Returns:
98
+ str: The relative path for the file the synthesized audio was written to.
99
 
100
  Raises:
101
  HumeError: If there is an error communicating with the Hume TTS API or parsing the response.
 
115
  )
116
  response.raise_for_status()
117
  response_data = response.json()
 
 
 
 
118
 
119
+ generations = response_data.get("generations")
 
 
120
  if not generations:
121
+ msg = "No generations returned by Hume API."
122
+ logger.error(msg)
123
+ raise HumeError(msg)
124
+
125
+ # Extract the base64 encoded audio and generation ID from the generation
126
  generation = generations[0]
127
+ generation_id = generation.get("generation_id")
128
  base64_audio = generation.get("audio")
129
+ filename = f"{generation_id}.mp3"
130
+
131
+ # Write audio to file and return the relative path
132
+ return save_base64_audio_to_file(base64_audio, filename)
133
+
134
+ except Exception as e:
135
+ logger.exception(f"Error synthesizing speech with Hume: {e}")
136
+ raise HumeError(
137
+ message=f"Failed to synthesize speech with Hume: {e}",
138
+ original_exception=e,
139
+ ) from e
src/types.py CHANGED
@@ -7,7 +7,7 @@ has a consistent structure including both the provider and the associated voice.
7
  """
8
 
9
  # Standard Library Imports
10
- from typing import TypedDict, Literal, Dict
11
 
12
 
13
  TTSProviderName = Literal["Hume AI", "ElevenLabs"]
 
7
  """
8
 
9
  # Standard Library Imports
10
+ from typing import Literal, Dict
11
 
12
 
13
  TTSProviderName = Literal["Hume AI", "ElevenLabs"]
src/utils.py CHANGED
@@ -11,10 +11,11 @@ Functions:
11
  """
12
 
13
  # Standard Library Imports
 
14
  import os
15
 
16
  # Local Application Imports
17
- from src.config import logger
18
 
19
 
20
  def truncate_text(text: str, max_length: int = 50) -> str:
@@ -116,3 +117,44 @@ def validate_prompt_length(prompt: str, max_length: int, min_length: int) -> Non
116
  logger.debug(
117
  f"Prompt length validation passed for prompt: {truncate_text(stripped_prompt)}"
118
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  """
12
 
13
  # Standard Library Imports
14
+ import base64
15
  import os
16
 
17
  # Local Application Imports
18
+ from src.config import AUDIO_DIR, logger
19
 
20
 
21
  def truncate_text(text: str, max_length: int = 50) -> str:
 
117
  logger.debug(
118
  f"Prompt length validation passed for prompt: {truncate_text(stripped_prompt)}"
119
  )
120
+
121
+
122
+ def save_base64_audio_to_file(base64_audio: str, filename: str) -> str:
123
+ """
124
+ Decode a base64-encoded audio string and write the resulting binary data to a file
125
+ within the preconfigured AUDIO_DIR directory. This function verifies the file was created,
126
+ logs the absolute and relative file paths, and returns a path relative to the current
127
+ working directory (which is what Gradio requires to serve static files).
128
+
129
+ Args:
130
+ base64_audio (str): The base64-encoded string representing the audio data.
131
+ filename (str): The name of the file (including extension, e.g.,
132
+ 'b4a335da-9786-483a-b0a5-37e6e4ad5fd1.mp3') where the decoded
133
+ audio will be saved.
134
+
135
+ Returns:
136
+ str: The relative file path to the saved audio file.
137
+
138
+ Raises:
139
+ Exception: Propagates any exceptions raised during the decoding or file I/O operations.
140
+ """
141
+ # Decode the base64-encoded audio into binary data.
142
+ audio_bytes = base64.b64decode(base64_audio)
143
+
144
+ # Construct the full absolute file path within the AUDIO_DIR directory.
145
+ file_path = os.path.join(AUDIO_DIR, filename)
146
+
147
+ # Write the binary audio data to the file.
148
+ with open(file_path, "wb") as audio_file:
149
+ audio_file.write(audio_bytes)
150
+
151
+ # Verify that the file was created.
152
+ if not os.path.exists(file_path):
153
+ raise FileNotFoundError(f"Audio file was not created at {file_path}")
154
+
155
+ # Compute a relative path for Gradio to serve (relative to the project root).
156
+ relative_path = os.path.relpath(file_path, os.getcwd())
157
+ logger.debug(f"Audio file absolute path: {file_path}")
158
+ logger.debug(f"Audio file relative path: {relative_path}")
159
+
160
+ return relative_path