# NOTE: removed Hugging Face Spaces status text ("Spaces: Running") that was
# pasted into this file by mistake — it is not valid Python.
# app.py (for your new MeloTTS space)
import base64
import io
import logging
import os

import gradio as gr
import numpy as np
import soundfile as sf
import torch

# The unidic dictionary must be downloaded BEFORE melo is imported (melo's
# import fails without it), so this deliberately runs ahead of the import.
os.system('python -m unidic download')
from melo.api import TTS
# --- Setup Logging ---
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# --- Configuration ---
# We pre-configure everything here; the API only exposes a text input.
LANGUAGE = 'KR'
# NOTE: A speed of 0.1 is extremely slow. 0.8 is a good starting point. Adjust if needed.
SPEED = 0.8
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
SPEAKER_ID = 'KR'  # Default Korean speaker

# --- Load Model (this happens only once when the space starts) ---
MODEL_INSTANCE = None
try:
    logger.info(f"Loading MeloTTS model for language: {LANGUAGE} on device: {DEVICE}...")
    MODEL_INSTANCE = TTS(language=LANGUAGE, device=DEVICE)
    logger.info("MeloTTS model loaded successfully.")
except Exception as e:
    # Keep the process alive so the endpoint can report a clean error via
    # gr.Error instead of the whole Space crashing at import time.
    logger.exception(f"FATAL: MeloTTS model initialization error: {e}")
    MODEL_INSTANCE = None
def _wav_to_data_uri(samples, sample_rate):
    """Encode a numpy sample array as a 'data:audio/wav;base64,...' URI string."""
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, samples, sample_rate, format='WAV')
    wav_buffer.seek(0)
    return "data:audio/wav;base64," + base64.b64encode(wav_buffer.read()).decode('utf-8')


def synthesize(text_to_synthesize):
    """
    Takes text input and returns a base64 encoded WAV audio data URI string.

    Empty/whitespace-only input returns ~0.1 s of silence instead of raising,
    so the frontend always receives a playable URI.

    Raises:
        gr.Error: if the model failed to load, or synthesis itself fails.
    """
    if not MODEL_INSTANCE:
        raise gr.Error("TTS Model is not available. Cannot process request.")

    # Use the model's real sampling rate so the silent clip matches the
    # synthesized audio; fall back to 24000 if the attribute is missing.
    sample_rate = getattr(MODEL_INSTANCE.hps.data, 'sampling_rate', 24000)

    if not text_to_synthesize or not text_to_synthesize.strip():
        # Create and return a silent audio data URI for empty input
        silent_audio = np.zeros(int(0.1 * sample_rate), dtype=np.int16)
        return _wav_to_data_uri(silent_audio, sample_rate)

    try:
        logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")
        # Synthesize directly into an in-memory buffer — no temp files on disk.
        wav_buffer = io.BytesIO()
        MODEL_INSTANCE.tts_to_file(
            text_to_synthesize,
            MODEL_INSTANCE.hps.data.spk2id[SPEAKER_ID],
            wav_buffer,
            speed=SPEED,
            format='wav'
        )
        # Reset buffer position to the beginning before reading it back out.
        wav_buffer.seek(0)
        wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
        logger.info("Synthesis complete.")
        # Return the data URI string our React app expects
        return f"data:audio/wav;base64,{wav_base64}"
    except Exception as e:
        logger.exception(f"TTS synthesis error: {e}")
        raise gr.Error(f"An error occurred during synthesis: {str(e)}")
# --- Create and Launch the Gradio Interface ---
# We create a pure API with no complex UI. This is fast and reliable for
# programmatic callers (e.g. the React frontend hitting /api/synthesize).
iface = gr.Interface(
    fn=synthesize,
    inputs=gr.Textbox(label="Text to Synthesize"),
    outputs="text",  # The API will return a simple text string (our base64 URI)
    title="MeloTTS API",
    description="A simplified API for MeloTTS. Pre-configured for Korean at 0.8 speed.",
    api_name="synthesize"
)

# The .queue() helps manage traffic and is recommended for public APIs.
iface.queue().launch()