# new-vision / app.py
# m3nnoun's picture
# Update app.py
# d5920d2 verified
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import soundfile as sf
import io
import tempfile
import os
# Identifier of the fine-tuned checkpoint handed to from_pretrained() in
# load_model(); presumably a Hugging Face Hub repo id ("user/repo") - confirm.
MODEL_NAME = "m3nnoun/lora_model_semantic"
def load_model():
    """Load the tokenizer and model identified by MODEL_NAME.

    Returns:
        tuple: ``(tokenizer, model)`` with the model switched to eval mode,
        or ``(None, None)`` if anything goes wrong while loading (the error
        is printed rather than raised so the app can still start).
    """
    try:
        # Adjust these classes to match the actual model architecture
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        loaded_model = AutoModel.from_pretrained(MODEL_NAME)
        # Inference only
        loaded_model.eval()
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None
    return loaded_tokenizer, loaded_model
# Initialize model once at import time. Both names are None when loading
# failed; text_to_speech() checks for that before using them.
tokenizer, model = load_model()
def text_to_speech(text, voice_speed=1.0, voice_pitch=1.0):
    """
    Convert text to speech using the module-level fine-tuned model.

    Args:
        text (str): Input text to convert to speech.
        voice_speed (float): Speed factor, must be positive. Implemented as
            plain resampling, so values above 1.0 shorten the audio and
            values below 1.0 lengthen it (pitch shifts along with it).
        voice_pitch (float): Accepted for interface compatibility only;
            no pitch processing is implemented, so the value is ignored.

    Returns:
        tuple | None: ``(sample_rate, audio_array)`` for Gradio audio output,
        where ``audio_array`` is float32 and peak-normalized. ``None`` is
        returned when the text is missing/blank, the model failed to load,
        or generation raised (the error is printed, not propagated).
    """
    # None can reach this function from callers/UI events; treat it as blank.
    if text is None or not text.strip():
        return None
    if tokenizer is None or model is None:
        return None
    try:
        # Raise inside the try so the failure is reported like any other
        # generation error instead of silently yielding empty audio.
        if voice_speed <= 0:
            raise ValueError(f"voice_speed must be positive, got {voice_speed}")

        # Tokenize input text
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        # Generate speech with the model
        with torch.no_grad():
            # This is a placeholder - adjust based on your model's actual interface
            # Different TTS models have different forward pass requirements
            outputs = model(**inputs)

        # Extract audio from model outputs
        # This part depends on your model's output format
        if hasattr(outputs, 'audio'):
            audio = outputs.audio
        elif hasattr(outputs, 'waveform'):
            audio = outputs.waveform
        else:
            # If output is different, extract the audio tensor
            audio = outputs.last_hidden_state  # Adjust based on your model

        # Convert to a float32 numpy array. detach()/float() keep .numpy()
        # from failing on grad-tracking or bfloat16 tensors.
        if torch.is_tensor(audio):
            audio = audio.detach().squeeze().float().cpu().numpy()
        # atleast_1d: a single-sample output squeezes to 0-d, which has no len().
        audio = np.atleast_1d(np.asarray(audio, dtype=np.float32))

        # Apply speed modification (basic implementation: linear resampling).
        # NOTE(review): np.interp needs a 1-D waveform; a multi-dimensional
        # model output makes it raise, which is reported as a failure below.
        if voice_speed != 1.0 and audio.size:
            sample_positions = np.arange(len(audio))
            resampled_positions = np.arange(0, len(audio), voice_speed)
            audio = np.interp(resampled_positions, sample_positions, audio)
            audio = audio.astype(np.float32)

        # Peak-normalize, skipping silent/empty/non-finite audio so we never
        # divide by zero and fill the buffer with NaN.
        peak = float(np.max(np.abs(audio))) if audio.size else 0.0
        if np.isfinite(peak) and peak > 0:
            audio = audio / peak

        # Return sample rate and audio array
        sample_rate = 22050  # Adjust based on your model's sample rate
        return sample_rate, audio
    except Exception as e:
        print(f"Error in text_to_speech: {e}")
        return None
def create_interface():
    """Create the Gradio interface.

    Builds a Blocks layout with a text box, speed and pitch sliders, a
    generate button, an audio player and a status box, and wires both the
    button click and the text box submit event to the same handler.

    Returns:
        gr.Blocks: The assembled (not yet launched) demo.
    """
    with gr.Blocks(title="TTS Model - Text to Speech", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            "# 🎙️ Text-to-Speech Generator\n"
            "Enter your text below and generate high-quality speech using our fine-tuned TTS model."
        )

        with gr.Row():
            with gr.Column(scale=2):
                # Text input
                text_input = gr.Textbox(
                    label="Enter Text",
                    placeholder="Type the text you want to convert to speech...",
                    lines=4,
                    max_lines=10,
                )

                # Voice controls
                with gr.Row():
                    speed_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speech Speed",
                    )
                    # NOTE: the value is forwarded to text_to_speech(), which
                    # currently does not apply any pitch processing.
                    pitch_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speech Pitch",
                    )

                # Generate button
                generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

            with gr.Column(scale=1):
                # Audio output
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    interactive=False,
                )

                # Status/Info
                status_text = gr.Textbox(
                    label="Status",
                    value="Ready to generate speech",
                    interactive=False,
                    lines=2,
                )

        # Example texts
        gr.Markdown("### 📝 Example Texts")
        gr.Examples(
            examples=[
                ["Hello! Welcome to our text-to-speech service."],
                ["The quick brown fox jumps over the lazy dog."],
                ["Artificial intelligence is revolutionizing how we interact with technology."],
                ["Thank you for using our TTS model. We hope you enjoy the generated speech!"],
            ],
            inputs=[text_input],
            label="Click on an example to try it",
        )

        # Event handlers
        def generate_and_update_status(text, speed, pitch):
            """Run synthesis and return ``(audio_or_None, status_message)``."""
            # Treat a missing value the same as blank text instead of crashing.
            if text is None or not text.strip():
                return None, "⚠️ Please enter some text to generate speech."
            try:
                result = text_to_speech(text, speed, pitch)
                if result is None:
                    return None, "❌ Error generating speech. Please try again."
                sample_rate, audio = result
                duration = len(audio) / sample_rate
                return (
                    (sample_rate, audio),
                    f"✅ Speech generated successfully! Duration: {duration:.2f} seconds",
                )
            except Exception as e:
                return None, f"❌ Error: {str(e)}"

        # The button click and pressing Enter in the text box share one
        # handler and one input/output wiring.
        for register_event in (generate_btn.click, text_input.submit):
            register_event(
                generate_and_update_status,
                inputs=[text_input, speed_slider, pitch_slider],
                outputs=[audio_output, status_text],
            )

    return demo
# Script entry point: build the interface and start the web server
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",  # Important for Hugging Face Spaces
        "server_port": 7860,  # Standard port for HF Spaces
        "share": False,  # Set to True if testing locally
        "show_error": True,
    }
    demo = create_interface()
    # Launch the app
    demo.launch(**launch_options)