# melotts-api / app.py
# NOTE: The three lines below were Hugging Face Space page chrome accidentally
# pasted into the source; kept here as comments so the file parses as Python.
#   minhhungg's picture / "Update app.py" / commit 146a956 (verified)
# app.py (for your new MeloTTS space)
import gradio as gr
import torch
import io
import os
import numpy as np
import soundfile as sf
import base64
import logging
# This command is important and should run at the start:
# it downloads the unidic dictionary that MeloTTS's Japanese/Korean text
# frontend needs, and it MUST run BEFORE `from melo.api import TTS` below,
# since importing melo can fail without the dictionary present.
# NOTE(review): os.system with a shell string is acceptable here because the
# command is a hard-coded constant, not user input.
os.system('python -m unidic download')
from melo.api import TTS
# --- Setup Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# --- Configuration ---
# Everything is pre-configured here; the API accepts only the text to speak.
LANGUAGE = 'KR'
# NOTE: A speed of 0.1 is extremely slow. 0.8 is a good starting point. Adjust if needed.
SPEED = 0.8
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
SPEAKER_ID = 'KR'  # Default Korean speaker

# --- Load Model (this happens only once when the space starts) ---
# On failure MODEL_INSTANCE stays None so synthesize() can report the outage
# per-request instead of the whole space crashing at import time.
MODEL_INSTANCE = None
try:
    # Lazy %-style args: the message is only formatted if the level is enabled.
    logger.info("Loading MeloTTS model for language: %s on device: %s...", LANGUAGE, DEVICE)
    MODEL_INSTANCE = TTS(language=LANGUAGE, device=DEVICE)
    logger.info("MeloTTS model loaded successfully.")
except Exception:
    # logger.exception appends the full traceback; no need to interpolate e,
    # and MODEL_INSTANCE is still None here, so no reassignment is needed.
    logger.exception("FATAL: MeloTTS model initialization error")
def synthesize(text_to_synthesize):
"""
Takes text input and returns a base64 encoded WAV audio data URI string.
"""
if not MODEL_INSTANCE:
raise gr.Error("TTS Model is not available. Cannot process request.")
if not text_to_synthesize or not text_to_synthesize.strip():
# Create and return a silent audio data URI for empty input
silent_audio = np.zeros(int(0.1 * 24000), dtype=np.int16)
wav_buffer = io.BytesIO()
sf.write(wav_buffer, silent_audio, 24000, format='WAV')
wav_buffer.seek(0)
wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
return f"data:audio/wav;base64,{wav_base64}"
try:
logger.info(f"Synthesizing for text: '{text_to_synthesize[:80]}...'")
# Use an in-memory BytesIO object to hold the audio data
wav_buffer = io.BytesIO()
# Synthesize audio directly to the buffer
MODEL_INSTANCE.tts_to_file(
text_to_synthesize,
MODEL_INSTANCE.hps.data.spk2id[SPEAKER_ID],
wav_buffer,
speed=SPEED,
format='wav'
)
# Reset buffer position to the beginning
wav_buffer.seek(0)
# Encode the bytes to base64
wav_base64 = base64.b64encode(wav_buffer.read()).decode('utf-8')
logger.info("Synthesis complete.")
# Return the data URI string our React app expects
return f"data:audio/wav;base64,{wav_base64}"
except Exception as e:
logger.exception(f"TTS synthesis error: {e}")
raise gr.Error(f"An error occurred during synthesis: {str(e)}")
# --- Create and Launch the Gradio Interface ---
# A minimal text-in / text-out API surface with no complex UI components:
# one textbox in, one string out. This keeps the endpoint fast and reliable.
text_input = gr.Textbox(label="Text to Synthesize")

iface = gr.Interface(
    fn=synthesize,
    inputs=text_input,
    outputs="text",  # The endpoint returns a plain string (our base64 data URI)
    title="MeloTTS API",
    description="A simplified API for MeloTTS. Pre-configured for Korean at 0.8 speed.",
    api_name="synthesize",
)

# Queueing manages concurrent traffic and is recommended for public APIs.
iface.queue().launch()