Spaces:

nineninesix
/

KaniTTS

Running on Zero

KaniTTS / app.py

Den Pavloff

lines=3

aa6abd6 about 1 month ago

7.51 kB

	import os
	import subprocess
	import sys

	# Pin OMP_NUM_THREADS to "4" in the process environment up front, so the
	# value is already set when the third-party imports further down are loaded.
	os.environ.update(OMP_NUM_THREADS="4")

	# Install dependencies programmatically to avoid conflicts
	def setup_dependencies():
	    """Install the transformers development build unless already done.

	    A marker file at /tmp/deps_installed records a finished install; when
	    it exists the function returns immediately. Otherwise pip is invoked
	    through the current interpreter and the marker is written afterwards.
	    Any exception is printed and swallowed, so the caller never sees a
	    failure from this best-effort step.
	    """
	    marker_path = '/tmp/deps_installed'
	    try:
	        if os.path.exists(marker_path):
	            # A previous run completed the install; nothing to do.
	            return

	        print("Installing transformers dev version...")
	        pip_command = [
	            sys.executable, "-m", "pip", "install",
	            "--force-reinstall", "--no-cache-dir",
	            "git+https://github.com/huggingface/transformers.git",
	        ]
	        subprocess.check_call(pip_command)

	        # Only reached when pip exited successfully.
	        with open(marker_path, 'w') as marker:
	            marker.write('done')
	    except Exception as err:
	        print(f"Dependencies setup error: {err}")

	# Run the dependency setup immediately at module load, ahead of the
	# third-party imports that follow (presumably so they pick up the freshly
	# installed transformers build -- confirm). Failures are printed inside
	# setup_dependencies() and do not stop the module from loading.
	setup_dependencies()

	import spaces
	import gradio as gr
	from util import Config, NemoAudioPlayer, KaniModel, Demo
	import numpy as np
	import torch

	# Access token read from the HF_TOKEN environment variable (None when unset);
	# it is handed to every KaniModel constructed below.
	token_ = os.environ.get('HF_TOKEN')

	# Selectable models, keyed by the name used to look them up later.
	# Insertion order is kept: base model first, then the two voice variants.
	models_configs = {'Base_pretrained_model': Config()}
	models_configs['Female_voice'] = Config(
	    model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
	    temperature=0.2,
	)
	models_configs['Male_voice'] = Config(
	    model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
	    temperature=0.2,
	)

	# Module-level state, built once when the module is loaded:
	# one shared audio player, the demo examples mapping, and every model.
	player = NemoAudioPlayer(Config())
	demo_examples = Demo()()
	models = {}
	for model_name in models_configs:
	    config = models_configs[model_name]
	    print(f"Loading {model_name}...")
	    # All models share the same player instance and token.
	    models[model_name] = KaniModel(config, player, token_)
	    print(f"{model_name} loaded!")
	print("All models loaded!")



	# def initialize_models():
	# """Initialize models globally to avoid reloading"""
	# global models

	# # if player is None:
	# # print("Initializing NeMo Audio Player...")
	# # player = NemoAudioPlayer(Config())
	# # print("NeMo Audio Player initialized!")

	# if not models:
	# print("Loading TTS models...")
	# for model_name, config in models_configs.items():
	# print(f"Loading {model_name}...")
	# models[model_name] = KaniModel(config, player, token_)
	# print(f"{model_name} loaded!")
	# print("All models loaded!")

	@spaces.GPU
	def generate_speech_gpu(text, model_choice):
	    """Generate speech from text using the selected model.

	    Args:
	        text: Text to synthesize. ``None``, empty, or whitespace-only
	            input is rejected with a message instead of raising.
	        model_choice: Key into the module-level ``models`` dict.

	    Returns:
	        A 2-tuple. On success ``((sample_rate, audio), time_report)``;
	        on a validation failure or any error during generation
	        ``(None, message)``.
	    """
	    # Guard against None before calling .strip(): this check runs outside
	    # the try block, so an AttributeError here would escape to the caller.
	    if not text or not text.strip():
	        return None, "Please enter text for speech generation."

	    if not model_choice:
	        return None, "Please select a model."

	    try:
	        # Printed for diagnostics only; the device string is not passed on.
	        device = "cuda" if torch.cuda.is_available() else "cpu"
	        print(f"Using device: {device}")

	        # An unknown key raises KeyError, reported by the handler below.
	        selected_model = models[model_choice]

	        print(f"Generating speech with {model_choice}...")
	        # run_model returns three values; the middle one is not used here.
	        audio, _, time_report = selected_model.run_model(text)

	        # NOTE(review): assumes the model emits 22.05 kHz audio -- confirm
	        # against the player/model configuration.
	        sample_rate = 22050
	        print("Speech generation completed!")

	        return (sample_rate, audio), time_report

	    except Exception as e:
	        # UI boundary: surface the failure as text rather than raising.
	        print(f"Error during generation: {e}")
	        return None, f"❌ Error during generation: {e}"

	# def validate_input(text, model_choice):
	# """Quick validation without GPU"""
	# if not text.strip():
	# return "⚠️ Please enter text for speech generation."
	# if not model_choice:
	# return "⚠️ Please select a model."
	# return f"✅ Ready to generate with {model_choice}"

	# Build the Gradio UI: inputs in the left column, generated audio and the
	# time report in the right column, then buttons that play stored demo clips.
	with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
	    gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
	    gr.Markdown("Select a model and enter text to generate high-quality speech")

	    # Dropdown entries follow the insertion order of models_configs;
	    # the first entry is preselected.
	    model_names = list(models_configs.keys())

	    with gr.Row():
	        # Left column: model selection, text entry and the trigger button.
	        with gr.Column(scale=1):
	            model_dropdown = gr.Dropdown(
	                choices=model_names,
	                value=model_names[0],
	                label="Select Model",
	                info="Base - default model, Female - female voice, Male - male voice",
	            )

	            text_input = gr.Textbox(
	                label="Enter Text",
	                placeholder="Enter text for speech generation...",
	                lines=3,
	                max_lines=10,
	            )

	            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

	        # Right column: results.
	        with gr.Column(scale=1):
	            audio_output = gr.Audio(
	                label="Generated Speech",
	                type="numpy",
	            )

	            time_report_output = gr.Textbox(
	                label="Time Report",
	                interactive=False,
	                value="Ready to generate speech",
	                lines=3,
	            )

	    # Clicking the button runs generation and fills both outputs.
	    generate_btn.click(
	        fn=generate_speech_gpu,
	        inputs=[text_input, model_dropdown],
	        outputs=[audio_output, time_report_output],
	    )

	    # Demo Examples
	    gr.Markdown("## 🎯 Demo Examples")

	    def play_demo(text):
	        """Return the stored demo clip for *text* at 22050 Hz plus a 'DEMO' label."""
	        clip = demo_examples[text]
	        return (22050, clip), 'DEMO'

	    # Up to eight example buttons, laid out as two rows of four.
	    example_texts = list(demo_examples.keys())
	    for row_start in (0, 4):
	        with gr.Row():
	            for text in example_texts[row_start:row_start + 4]:
	                example_button = gr.Button(text)
	                # Bind the current text as a default argument so each button
	                # keeps its own value instead of the loop's last one.
	                example_button.click(lambda t=text: play_demo(t), outputs=[audio_output, time_report_output])


	# # CPU validation event
	# validate_btn.click(
	# fn=validate_input,
	# inputs=[text_input, model_dropdown],
	# outputs=status_text
	# )

	# # Update status on input change
	# text_input.change(
	# fn=validate_input,
	# inputs=[text_input, model_dropdown],
	# outputs=status_text
	# )

	# Text examples
	# gr.Markdown("### 📝 Text Examples:")
	# examples = [
	# "Hello! How are you today?",
	# "Welcome to the world of artificial intelligence.",
	# "This is a demonstration of neural text-to-speech synthesis.",
	# "Zero GPU makes high-quality speech generation accessible to everyone!"
	# ]

	# gr.Examples(
	# examples=examples,
	# inputs=text_input,
	# label="Click on an example to use it"
	# )

	# # Information section
	# with gr.Accordion("ℹ️ Model Information", open=False):
	# gr.Markdown("""
	# Available Models:
	# - Base Model: Default pre-trained model for general use
	# - Female Voice: Optimized for female voice characteristics
	# - Male Voice: Optimized for male voice characteristics

	# Features:
	# - Powered by NVIDIA NeMo Toolkit
	# - High-quality 22kHz audio output
	# - Zero GPU acceleration for fast inference
	# - Support for long text sequences
	# """)

	if __name__ == "__main__":
	    # Start the app only when this file is executed as a script:
	    # bind on all interfaces, port 7860, with show_error enabled.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "show_error": True,
	    }
	    demo.launch(**launch_options)