Spaces:
Sleeping
Sleeping
import gradio as gr | |
import torch | |
from transformers import AutoTokenizer, AutoModel | |
import numpy as np | |
import soundfile as sf | |
import io | |
import tempfile | |
import os | |
# Hugging Face Hub repository id of the fine-tuned model this app loads.
MODEL_NAME: str = "m3nnoun/lora_model_semantic"
def load_model():
    """Load the tokenizer and model identified by ``MODEL_NAME``.

    Returns:
        tuple: ``(tokenizer, model)`` with the model switched to eval mode,
        or ``(None, None)`` when any step of loading fails (the error is
        printed rather than raised so the app can still start).
    """
    try:
        # NOTE: the Auto* classes are generic; swap them for the classes that
        # match the actual model architecture if needed.
        tok = AutoTokenizer.from_pretrained(MODEL_NAME)
        net = AutoModel.from_pretrained(MODEL_NAME)
        net.eval()
    except Exception as err:
        print(f"Error loading model: {err}")
        return None, None
    return tok, net
# Load once at import time so every request reuses the same objects.
# Both names are None when load_model() could not load the model.
tokenizer, model = load_model()
def text_to_speech(text, voice_speed=1.0, voice_pitch=1.0):
    """
    Convert text to speech using the module-level tokenizer and model.

    Args:
        text (str): Input text to convert to speech. ``None``, empty and
            whitespace-only values are rejected.
        voice_speed (float): Speed factor applied by linear-interpolation
            resampling (values above 1.0 shorten the audio). Must be
            positive.
        voice_pitch (float): Accepted for interface compatibility only;
            no pitch modification is currently applied.

    Returns:
        tuple | None: ``(sample_rate, audio_array)`` for Gradio audio output,
        where ``audio_array`` is float32 and peak-normalized, or ``None``
        when the input is rejected, the model is unavailable, or generation
        fails (the error is printed).
    """
    # Reject missing or blank input before touching the model.
    if not text or not text.strip():
        return None

    # A zero step would break np.arange and a negative one yields no samples.
    if voice_speed <= 0:
        print(f"Error in text_to_speech: voice_speed must be positive, got {voice_speed}")
        return None

    if tokenizer is None or model is None:
        return None

    try:
        # Tokenize input text
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        # Placeholder forward pass - adjust to the model's actual interface,
        # since different TTS models have different forward requirements.
        with torch.no_grad():
            outputs = model(**inputs)

        # Pick the audio tensor from whichever attribute the output exposes.
        if hasattr(outputs, "audio"):
            audio = outputs.audio
        elif hasattr(outputs, "waveform"):
            audio = outputs.waveform
        else:
            # Fallback - adjust based on the model's output format.
            audio = outputs.last_hidden_state

        if torch.is_tensor(audio):
            audio = audio.squeeze().cpu().numpy()

        # squeeze() can leave a 0-d array for a single sample; len() needs 1-D.
        audio = np.atleast_1d(np.asarray(audio))

        # Basic speed change: sample the waveform at a stride of voice_speed.
        if voice_speed != 1.0:
            positions = np.arange(0, len(audio), voice_speed)
            audio = np.interp(positions, np.arange(len(audio)), audio)

        audio = np.asarray(audio, dtype=np.float32)

        # Peak-normalize, skipping empty or silent audio so we never divide
        # by zero and fill the output with NaNs.
        peak = float(np.max(np.abs(audio))) if audio.size else 0.0
        if peak > 0.0:
            audio = audio / peak

        sample_rate = 22050  # Adjust based on the model's sample rate
        return sample_rate, audio
    except Exception as e:
        print(f"Error in text_to_speech: {e}")
        return None
def create_interface():
    """Create the Gradio interface.

    Builds a Blocks layout with a text box, speed and pitch sliders and a
    generate button on the left, and the audio player plus a status box on
    the right, followed by clickable example texts. Both the button click
    and pressing Enter in the text box run the same generation handler.

    Returns:
        gr.Blocks: The assembled (not yet launched) demo.
    """
    with gr.Blocks(title="TTS Model - Text to Speech", theme=gr.themes.Soft()) as demo:
        # Built from explicit lines so the rendered Markdown does not depend
        # on source indentation.
        gr.Markdown(
            "# 🎙️ Text-to-Speech Generator\n\n"
            "Enter your text below and generate high-quality speech "
            "using our fine-tuned TTS model."
        )

        with gr.Row():
            with gr.Column(scale=2):
                # Text input
                text_input = gr.Textbox(
                    label="Enter Text",
                    placeholder="Type the text you want to convert to speech...",
                    lines=4,
                    max_lines=10,
                )

                # Voice controls
                with gr.Row():
                    speed_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speech Speed",
                    )
                    pitch_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speech Pitch",
                    )

                # Generate button
                generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

            with gr.Column(scale=1):
                # Audio output
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    interactive=False,
                )

                # Status/Info
                status_text = gr.Textbox(
                    label="Status",
                    value="Ready to generate speech",
                    interactive=False,
                    lines=2,
                )

        # Example texts (clicking one fills the text box)
        gr.Markdown("### 📝 Example Texts")
        gr.Examples(
            examples=[
                ["Hello! Welcome to our text-to-speech service."],
                ["The quick brown fox jumps over the lazy dog."],
                ["Artificial intelligence is revolutionizing how we interact with technology."],
                ["Thank you for using our TTS model. We hope you enjoy the generated speech!"],
            ],
            inputs=[text_input],
            label="Click on an example to try it",
        )

        def generate_and_update_status(text, speed, pitch):
            """Run TTS and return ``(audio, status_message)`` for the UI."""
            # Guard against None as well as blank text so the handler never
            # raises before it can report a status.
            if not text or not text.strip():
                return None, "⚠️ Please enter some text to generate speech."
            try:
                result = text_to_speech(text, speed, pitch)
                if result is None:
                    return None, "❌ Error generating speech. Please try again."
                sample_rate, audio = result
                duration = len(audio) / sample_rate
                return (
                    (sample_rate, audio),
                    f"✅ Speech generated successfully! Duration: {duration:.2f} seconds",
                )
            except Exception as e:
                return None, f"❌ Error: {str(e)}"

        # The button click and Enter in the text box share one handler and
        # the same input/output wiring.
        for register in (generate_btn.click, text_input.submit):
            register(
                generate_and_update_status,
                inputs=[text_input, speed_slider, pitch_slider],
                outputs=[audio_output, status_text],
            )

    return demo
# Script entry point: build the interface and start the Gradio server.
if __name__ == "__main__":
    demo = create_interface()

    launch_options = {
        "server_name": "0.0.0.0",  # Important for Hugging Face Spaces
        "server_port": 7860,       # Standard port for HF Spaces
        "share": False,            # Set to True if testing locally
        "show_error": True,
    }
    demo.launch(**launch_options)