import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import soundfile as sf
import io
import tempfile
import os
# Load your fine-tuned model
MODEL_NAME = "m3nnoun/lora_model_semantic"
def load_model():
    """Load the TTS model and tokenizer."""
    try:
        # Adjust these based on your specific model architecture
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModel.from_pretrained(MODEL_NAME)
        model.eval()
        return tokenizer, model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None
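
# Note (assumption): since the repo name mentions "lora", the checkpoint may contain only
# LoRA adapter weights rather than a full model. If AutoModel.from_pretrained fails for that
# reason, a PEFT-based load along the lines sketched below may be needed instead. The use of
# AutoPeftModel here is illustrative, not a confirmed detail of this checkpoint.
#
# from peft import AutoPeftModel
#
# def load_lora_model():
#     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
#     # AutoPeftModel reads the adapter config to locate the base model,
#     # loads it, and attaches the LoRA weights on top.
#     model = AutoPeftModel.from_pretrained(MODEL_NAME)
#     model.eval()
#     return tokenizer, model
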
# Initialize model
tokenizer, model = load_model()
def text_to_speech(text, voice_speed=1.0, voice_pitch=1.0):
    """
    Convert text to speech using your fine-tuned model.

    Args:
        text (str): Input text to convert to speech
        voice_speed (float): Speed of the generated speech
        voice_pitch (float): Pitch of the generated speech

    Returns:
        tuple: (sample_rate, audio_array) for Gradio audio output
    """
    if not text.strip():
        return None
    if tokenizer is None or model is None:
        return None
    try:
        # Tokenize input text
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        # Generate speech with your model
        with torch.no_grad():
            # This is a placeholder - adjust based on your model's actual interface;
            # different TTS models have different forward-pass requirements
            # (see the hedged SpeechT5-style sketch after this function for one concrete example)
            outputs = model(**inputs)

        # Extract audio from the model outputs; this depends on your model's output format
        if hasattr(outputs, 'audio'):
            audio = outputs.audio
        elif hasattr(outputs, 'waveform'):
            audio = outputs.waveform
        else:
            # Fallback: treat the last hidden state as the audio tensor (adjust for your model)
            audio = outputs.last_hidden_state

        # Convert to a numpy array
        if torch.is_tensor(audio):
            audio = audio.squeeze().cpu().numpy()
        # Apply a speed modification (basic implementation: naive resampling,
        # which also shifts the pitch slightly)
        if voice_speed != 1.0:
            indices = np.arange(0, len(audio), voice_speed)
            audio = np.interp(indices, np.arange(len(audio)), audio)
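        # Hedged sketch (assumption): voice_pitch is not applied by the model itself, so a
        # simple post-hoc pitch shift via librosa is sketched here. librosa is an optional
        # extra dependency; if it is not installed, the pitch setting is silently skipped.
        if voice_pitch != 1.0:
            try:
                import librosa
                # Convert the pitch factor to semitones: a factor of 2.0 is +12 semitones.
                n_steps = 12.0 * np.log2(voice_pitch)
                audio = librosa.effects.pitch_shift(
                    np.asarray(audio, dtype=np.float32),
                    sr=22050,  # assumed sample rate, matching the value returned below
                    n_steps=n_steps
                )
            except ImportError:
                print("librosa not installed; skipping pitch adjustment")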
        # Ensure audio is in the right format
        audio = np.array(audio, dtype=np.float32)

        # Normalize audio (guard against division by zero for silent output)
        peak = np.max(np.abs(audio)) if len(audio) > 0 else 0.0
        if peak > 0:
            audio = audio / peak

        # Return sample rate and audio array
        sample_rate = 22050  # Adjust based on your model's sample rate
        return sample_rate, audio
    except Exception as e:
        print(f"Error in text_to_speech: {e}")
        return None
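
# Hedged sketch (assumption): the generic forward pass above is only a placeholder, and the
# actual interface of m3nnoun/lora_model_semantic has not been verified here. If the fine-tuned
# checkpoint follows the SpeechT5 text-to-speech API in transformers, generation would look
# roughly like the commented example below. The processor/vocoder checkpoints and the zero
# speaker embedding are illustrative placeholders, not confirmed details of this model.
#
# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
#
# def speecht5_text_to_speech(text):
#     processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
#     tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
#     vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
#     inputs = processor(text=text, return_tensors="pt")
#     # SpeechT5 conditions on a 512-dim speaker embedding; zeros are a neutral placeholder
#     # (a real x-vector speaker embedding gives better voice quality).
#     speaker_embeddings = torch.zeros((1, 512))
#     with torch.no_grad():
#         speech = tts_model.generate_speech(
#             inputs["input_ids"], speaker_embeddings, vocoder=vocoder
#         )
#     return 16000, speech.cpu().numpy()  # SpeechT5 + HiFi-GAN output at 16 kHz
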
def create_interface():
    """Create the Gradio interface."""
    with gr.Blocks(title="TTS Model - Text to Speech", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 🎙️ Text-to-Speech Generator
            Enter your text below and generate high-quality speech using our fine-tuned TTS model.
            """
        )
        with gr.Row():
            with gr.Column(scale=2):
                # Text input
                text_input = gr.Textbox(
                    label="Enter Text",
                    placeholder="Type the text you want to convert to speech...",
                    lines=4,
                    max_lines=10
                )
                # Voice controls
                with gr.Row():
                    speed_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speech Speed"
                    )
                    pitch_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speech Pitch"
                    )
                # Generate button
                generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
            with gr.Column(scale=1):
                # Audio output
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    interactive=False
                )
                # Status/Info
                status_text = gr.Textbox(
                    label="Status",
                    value="Ready to generate speech",
                    interactive=False,
                    lines=2
                )
        # Example texts
        gr.Markdown("### 📝 Example Texts")
        examples = gr.Examples(
            examples=[
                ["Hello! Welcome to our text-to-speech service."],
                ["The quick brown fox jumps over the lazy dog."],
                ["Artificial intelligence is revolutionizing how we interact with technology."],
                ["Thank you for using our TTS model. We hope you enjoy the generated speech!"]
            ],
            inputs=[text_input],
            label="Click on an example to try it"
        )
        # Event handlers
        def generate_and_update_status(text, speed, pitch):
            if not text.strip():
                return None, "⚠️ Please enter some text to generate speech."
            try:
                result = text_to_speech(text, speed, pitch)
                if result is None:
                    return None, "❌ Error generating speech. Please try again."
                sample_rate, audio = result
                return (sample_rate, audio), f"✅ Speech generated successfully! Duration: {len(audio)/sample_rate:.2f} seconds"
            except Exception as e:
                return None, f"❌ Error: {str(e)}"
        generate_btn.click(
            generate_and_update_status,
            inputs=[text_input, speed_slider, pitch_slider],
            outputs=[audio_output, status_text]
        )
        # Auto-generate on Enter key (optional)
        text_input.submit(
            generate_and_update_status,
            inputs=[text_input, speed_slider, pitch_slider],
            outputs=[audio_output, status_text]
        )
    return demo
# Create and launch the interface
if __name__ == "__main__":
    demo = create_interface()
    # Launch the app
    demo.launch(
        server_name="0.0.0.0",  # Important for Hugging Face Spaces
        server_port=7860,       # Standard port for HF Spaces
        share=False,            # Keep False on Spaces; set True locally for a public link
        show_error=True
    )