KaniTTS / app.py
Den Pavloff
lines=3
aa6abd6
raw
history blame
7.51 kB
import os
import subprocess
import sys
# Pin OMP_NUM_THREADS to "4". The assignment is placed above the third-party
# imports further down (spaces, gradio, numpy, torch) so the value is already
# in the process environment when those modules are loaded.
os.environ["OMP_NUM_THREADS"] = "4"
# Install dependencies programmatically to avoid conflicts
def setup_dependencies():
    """Install the development build of ``transformers`` once per machine.

    A marker file at ``/tmp/deps_installed`` records a completed install; when
    it exists the function does nothing. Otherwise pip is run through the
    current interpreter to force-reinstall ``transformers`` from its GitHub
    repository, and the marker is written afterwards.

    The whole step is best-effort: any exception is printed and swallowed so
    the caller keeps running with whatever is already installed.

    Returns:
        None.
    """
    marker_path = '/tmp/deps_installed'
    pip_command = [
        sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-cache-dir",
        "git+https://github.com/huggingface/transformers.git",
    ]
    try:
        if not os.path.exists(marker_path):
            print("Installing transformers dev version...")
            subprocess.check_call(pip_command)
            # Only reached when pip exited with status 0.
            with open(marker_path, 'w') as marker:
                marker.write('done')
    except Exception as exc:
        print(f"Dependencies setup error: {exc}")
# Run the dependency setup at import time, ahead of the third-party imports
# below -- presumably so they pick up the freshly installed transformers
# build (TODO confirm which of them actually depends on it).
setup_dependencies()
import spaces
import gradio as gr
from util import Config, NemoAudioPlayer, KaniModel, Demo
import numpy as np
import torch
# Hugging Face access token; None when HF_TOKEN is not set in the environment.
token_ = os.environ.get('HF_TOKEN')

# One Config per selectable voice. The keys double as the dropdown choices in
# the UI and as the lookup keys of `models`.
models_configs = dict(
    Base_pretrained_model=Config(),
    Female_voice=Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex02-v.0.2',
        temperature=0.2,
    ),
    Male_voice=Config(
        model_name='nineninesix/lfm-nano-codec-expresso-ex01-v.0.1',
        temperature=0.2,
    ),
)

# Shared module-level state, built once at import time.
player = NemoAudioPlayer(Config())
# Demo is instantiated and then called; the result is used further down as a
# mapping from example text to audio.
demo_examples = Demo()()

# Load every configured model eagerly; all of them share the single player.
models = {}
for model_name in models_configs:
    config = models_configs[model_name]
    print(f"Loading {model_name}...")
    models[model_name] = KaniModel(config, player, token_)
    print(f"{model_name} loaded!")
print("All models loaded!")
# def initialize_models():
# """Initialize models globally to avoid reloading"""
# global models
# # if player is None:
# # print("Initializing NeMo Audio Player...")
# # player = NemoAudioPlayer(Config())
# # print("NeMo Audio Player initialized!")
# if not models:
# print("Loading TTS models...")
# for model_name, config in models_configs.items():
# print(f"Loading {model_name}...")
# models[model_name] = KaniModel(config, player, token_)
# print(f"{model_name} loaded!")
# print("All models loaded!")
@spaces.GPU
def generate_speech_gpu(text, model_choice):
    """Generate speech from text with the selected model.

    Args:
        text: Text to synthesize. ``None``, empty, and whitespace-only values
            are rejected with a user-facing message.
        model_choice: Key of the model to use in the module-level ``models``
            dict.

    Returns:
        A ``(audio, message)`` pair. On success ``audio`` is
        ``(sample_rate, samples)`` with a sample rate of 22050 and ``message``
        is the time report returned by the model's ``run_model``. On any
        failure ``audio`` is ``None`` and ``message`` describes the problem;
        exceptions raised during generation are caught and reported this way
        rather than propagated.
    """
    # `text` may be None (not just ""), so test truthiness before .strip().
    if not text or not text.strip():
        return None, "Please enter text for speech generation."
    if not model_choice:
        return None, "Please select a model."

    try:
        # The device is only logged; it is not passed to the model call.
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {device}")

        # Report an unknown key explicitly instead of surfacing a bare
        # KeyError repr (e.g. "'foo'") to the user.
        selected_model = models.get(model_choice)
        if selected_model is None:
            problem = f"unknown model {model_choice!r}"
            print(f"Error during generation: {problem}")
            return None, f"❌ Error during generation: {problem}"

        print(f"Generating speech with {model_choice}...")
        audio, _, time_report = selected_model.run_model(text)
        sample_rate = 22050
        print("Speech generation completed!")
        return (sample_rate, audio), time_report
    except Exception as e:
        print(f"Error during generation: {str(e)}")
        return None, f"❌ Error during generation: {str(e)}"
# def validate_input(text, model_choice):
# """Quick validation without GPU"""
# if not text.strip():
# return "⚠️ Please enter text for speech generation."
# if not model_choice:
# return "⚠️ Please select a model."
# return f"✅ Ready to generate with {model_choice}"
# Create Gradio interface: model/text inputs on the left, audio and timing
# report on the right, followed by up to eight demo buttons.
with gr.Blocks(title="KaniTTS - Text to Speech", theme=gr.themes.Default()) as demo:
    gr.Markdown("# KaniTTS: Fast and Expressive Speech Generation Model")
    gr.Markdown("Select a model and enter text to generate high-quality speech")

    with gr.Row():
        with gr.Column(scale=1):
            # Dropdown choices are the config keys; the first one is preselected.
            model_names = list(models_configs.keys())
            model_dropdown = gr.Dropdown(
                choices=model_names,
                value=model_names[0],
                label="Select Model",
                info="Base - default model, Female - female voice, Male - male voice"
            )
            text_input = gr.Textbox(
                label="Enter Text",
                placeholder="Enter text for speech generation...",
                lines=3,
                max_lines=10
            )
            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
            # Quick validation button (CPU only)
            # validate_btn = gr.Button("🔍 Validate Input", variant="secondary")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy"
            )
            time_report_output = gr.Textbox(
                label="Time Report",
                interactive=False,
                value="Ready to generate speech",
                lines=3
            )

    # GPU generation event
    generate_btn.click(
        fn=generate_speech_gpu,
        inputs=[text_input, model_dropdown],
        outputs=[audio_output, time_report_output]
    )

    # Demo Examples
    gr.Markdown("## 🎯 Demo Examples")

    def play_demo(text):
        """Return the audio stored in ``demo_examples`` under *text*.

        The result has the same ``((sample_rate, audio), message)`` shape as
        ``generate_speech_gpu``, with the fixed message ``'DEMO'``.
        """
        return (22050, demo_examples[text]), 'DEMO'

    # At most eight demo buttons, laid out as two rows of four.
    demo_texts = list(demo_examples.keys())[:8]
    for row_start in (0, 4):
        with gr.Row():
            for text in demo_texts[row_start:row_start + 4]:
                # `t=text` binds the current label; a plain closure would see
                # only the last value of the loop variable.
                gr.Button(text).click(
                    lambda t=text: play_demo(t),
                    outputs=[audio_output, time_report_output]
                )
# # CPU validation event
# validate_btn.click(
# fn=validate_input,
# inputs=[text_input, model_dropdown],
# outputs=status_text
# )
# # Update status on input change
# text_input.change(
# fn=validate_input,
# inputs=[text_input, model_dropdown],
# outputs=status_text
# )
# Text examples
# gr.Markdown("### 📝 Text Examples:")
# examples = [
# "Hello! How are you today?",
# "Welcome to the world of artificial intelligence.",
# "This is a demonstration of neural text-to-speech synthesis.",
# "Zero GPU makes high-quality speech generation accessible to everyone!"
# ]
# gr.Examples(
# examples=examples,
# inputs=text_input,
# label="Click on an example to use it"
# )
# # Information section
# with gr.Accordion("ℹ️ Model Information", open=False):
# gr.Markdown("""
# **Available Models:**
# - **Base Model**: Default pre-trained model for general use
# - **Female Voice**: Optimized for female voice characteristics
# - **Male Voice**: Optimized for male voice characteristics
# **Features:**
# - Powered by NVIDIA NeMo Toolkit
# - High-quality 22kHz audio output
# - Zero GPU acceleration for fast inference
# - Support for long text sequences
# """)
if __name__ == "__main__":
    # Launch settings: server_name "0.0.0.0", port 7860, and show_error
    # enabled.
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )
    demo.launch(**launch_options)