# melotts-api / app.py
# NOTE: The three lines below were Hugging Face Space page chrome accidentally
# pasted into the source; kept here as comments so the file parses as Python.
#   minhhungg's picture / "Update app.py" / commit 146a956 (verified)
# app.py (for your new MeloTTS space)
import gradio as gr
import torch
import io
import os
import numpy as np
import soundfile as sf
import base64
import logging
# This command is important and should run at the start:
# it downloads the unidic dictionary that MeloTTS's Japanese/Korean text
# frontend needs, and it MUST run BEFORE `from melo.api import TTS` below,
# since importing melo can fail without the dictionary present.
# NOTE(review): os.system with a shell string is acceptable here because the
# command is a hard-coded constant, not user input.
os.system('python -m unidic download')
from melo.api import TTS
# --- Setup Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Configuration ---
# Everything is pre-configured here; the API accepts only the text to speak.
LANGUAGE = 'KR'
# NOTE: A speed of 0.1 is extremely slow. 0.8 is a good starting point. Adjust if needed.
SPEED = 0.8
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
SPEAKER_ID = 'KR'  # Default Korean speaker

# --- Load Model (this happens only once when the space starts) ---
# On failure MODEL_INSTANCE stays None so synthesize() can report the outage
# per-request instead of the whole space crashing at import time.
MODEL_INSTANCE = None
try:
    # Lazy %-style args: the message is only formatted if the level is enabled.
    logger.info("Loading MeloTTS model for language: %s on device: %s...", LANGUAGE, DEVICE)
    MODEL_INSTANCE = TTS(language=LANGUAGE, device=DEVICE)
    logger.info("MeloTTS model loaded successfully.")
except Exception:
    # logger.exception appends the full traceback; no need to interpolate e,
    # and MODEL_INSTANCE is still None here, so no reassignment is needed.
    logger.exception("FATAL: MeloTTS model initialization error")
def synthesize(text_to_synthesize):
"""
Takes text input and returns a base64 encoded WAV audio data URI string.
"""
if not MODEL_INSTANCE:
raise gr.Error("TTS Model is not available. Cannot process request.")
if not text_to_synthesize or not text_to_synthesize.strip():
# Create and return a silent audio data URI for empty input
silent_audio = np.zeros(int(0.1 * 24000), dtype=np.int16)
wav_buffer = io.BytesIO()
sf.write(wav_buffer, silent_audio, 24000, format='WAV')
wav_buffer.seek(0)
wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
return f"data:audio/wav;base64,{wav_base64}"
try:
logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")
# Use an in-memory BytesIO object to hold the audio data
wav_buffer = io.BytesIO()
# Synthesize audio directly to the buffer
MODEL_INSTANCE.tts_to_file(
text_to_synthesize,
MODEL_INSTANCE.hps.data.spk2id[SPEAKER_ID],
wav_buffer,
speed=SPEED,
format='wav'
)
# Reset buffer position to the beginning
wav_buffer.seek(0)
# Encode the bytes to base64
wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
logger.info("Synthesis complete.")
# Return the data URI string our React app expects
return f"data:audio/wav;base64,{wav_base64}"
except Exception as e:
logger.exception(f"TTS synthesis error: {e}")
raise gr.Error(f"An error occurred during synthesis: {str(e)}")
# --- Create and Launch the Gradio Interface ---
# A minimal text-in / text-out API surface with no complex UI components:
# one textbox in, one string out. This keeps the endpoint fast and reliable.
text_input = gr.Textbox(label="Text to Synthesize")

iface = gr.Interface(
    fn=synthesize,
    inputs=text_input,
    outputs="text",  # The endpoint returns a plain string (our base64 data URI)
    title="MeloTTS API",
    description="A simplified API for MeloTTS. Pre-configured for Korean at 0.8 speed.",
    api_name="synthesize",
)

# Queueing manages concurrent traffic and is recommended for public APIs.
iface.queue().launch()