"""Gradio front end for a fine-tuned text-to-speech model.

The module loads a tokenizer/model pair from the Hugging Face Hub at import
time, exposes ``text_to_speech`` to turn text into a waveform, and builds a
small Gradio Blocks UI around it.
"""

import io
import os
import tempfile
from typing import Optional, Tuple

import gradio as gr
import numpy as np
import soundfile as sf
import torch
from transformers import AutoModel, AutoTokenizer

# Load your fine-tuned model
MODEL_NAME = "m3nnoun/lora_model_semantic"

# Sample rate reported alongside the waveform.
# Adjust based on your model's sample rate.
SAMPLE_RATE = 22050


def load_model():
    """Load the TTS tokenizer and model named by ``MODEL_NAME``.

    Returns:
        tuple: ``(tokenizer, model)`` on success, with the model switched to
        eval mode; ``(None, None)`` if loading raised any exception (the
        error is printed, not re-raised, so the UI can still start).
    """
    try:
        # Adjust these based on your specific model architecture
        loaded_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        loaded_model = AutoModel.from_pretrained(MODEL_NAME)
        loaded_model.eval()
        return loaded_tokenizer, loaded_model
    except Exception as e:
        # Top-level boundary: report and degrade to "no model" instead of
        # crashing the whole app at import time.
        print(f"Error loading model: {e}")
        return None, None


# Initialize model
tokenizer, model = load_model()


def _extract_audio(outputs):
    """Pick the audio-like field out of the model's output object.

    Tries ``outputs.audio``, then ``outputs.waveform``, and finally falls
    back to ``outputs.last_hidden_state``.
    """
    # This part depends on your model's output format
    if hasattr(outputs, 'audio'):
        return outputs.audio
    if hasattr(outputs, 'waveform'):
        return outputs.waveform
    # If output is different, extract the audio tensor
    return outputs.last_hidden_state  # Adjust based on your model


def _change_speed(audio, voice_speed):
    """Resample ``audio`` by linear interpolation to change playback speed.

    A ``voice_speed`` of exactly 1.0 returns ``audio`` untouched.

    Raises:
        ValueError: if ``voice_speed`` is not strictly positive (a zero or
            negative step cannot produce a meaningful resampling grid).
    """
    if voice_speed == 1.0:
        return audio
    if voice_speed <= 0:
        raise ValueError(f"voice_speed must be positive, got {voice_speed}")
    # Simple speed change by resampling: step through the original sample
    # positions `voice_speed` samples at a time.
    positions = np.arange(0, len(audio), voice_speed)
    return np.interp(positions, np.arange(len(audio)), audio)


def _normalize_peak(audio):
    """Scale ``audio`` so that its largest absolute sample is 1.0.

    Silent (all-zero) or non-finite-peak input is returned unchanged, since
    dividing by such a peak would fill the result with NaN/inf.
    """
    peak = np.max(np.abs(audio))
    if np.isfinite(peak) and peak > 0:
        return audio / peak
    return audio


def text_to_speech(text, voice_speed=1.0, voice_pitch=1.0):
    """Convert text to speech using your fine-tuned model.

    Args:
        text (str): Input text to convert to speech.
        voice_speed (float): Speed of the generated speech. Must be
            positive; 1.0 leaves the waveform unchanged.
        voice_pitch (float): Accepted for interface compatibility. Pitch
            shifting is not implemented, so this value currently has no
            effect on the output.

    Returns:
        tuple | None: ``(sample_rate, audio_array)`` for Gradio audio
        output, where ``audio_array`` is a peak-normalized float32 NumPy
        array. ``None`` if the text is empty/blank, the model is not
        loaded, the model produced no samples, or any step failed (the
        error is printed).
    """
    # `not text` also covers None, which would otherwise break on .strip().
    if not text or not text.strip():
        return None

    if tokenizer is None or model is None:
        return None

    try:
        # Tokenize input text
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        # Generate speech with your model
        with torch.no_grad():
            # This is a placeholder - adjust based on your model's actual interface
            # Different TTS models have different forward pass requirements
            outputs = model(**inputs)

        # Extract audio from model outputs
        audio = _extract_audio(outputs)

        # Convert to numpy array. detach() and float() keep the conversion
        # working for grad-tracking or half/bfloat16 tensors.
        if torch.is_tensor(audio):
            audio = audio.detach().squeeze().float().cpu().numpy()
        # squeeze() can collapse a single sample to 0-d; len() needs >= 1-d.
        audio = np.atleast_1d(np.asarray(audio))

        # Apply speed modification (basic implementation)
        audio = _change_speed(audio, voice_speed)

        # Ensure audio is in the right format
        audio = np.array(audio, dtype=np.float32)

        if audio.size == 0:
            # An empty clip is a failure, not a zero-second success.
            print("Error in text_to_speech: model produced no audio samples")
            return None

        # Normalize audio
        audio = _normalize_peak(audio)

        # Return sample rate and audio array
        return SAMPLE_RATE, audio

    except Exception as e:
        # UI boundary: report the failure and let the caller show an error.
        print(f"Error in text_to_speech: {e}")
        return None


def create_interface():
    """Create the Gradio interface.

    Returns:
        gr.Blocks: The assembled (not yet launched) demo, with a text box,
        speed/pitch sliders, a generate button, an audio player, a status
        box, and clickable example texts.
    """
    with gr.Blocks(title="TTS Model - Text to Speech", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 🎙️ Text-to-Speech Generator

            Enter your text below and generate high-quality speech using our fine-tuned TTS model.
            """
        )

        with gr.Row():
            with gr.Column(scale=2):
                # Text input
                text_input = gr.Textbox(
                    label="Enter Text",
                    placeholder="Type the text you want to convert to speech...",
                    lines=4,
                    max_lines=10,
                )

                # Voice controls
                with gr.Row():
                    speed_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speech Speed",
                    )
                    pitch_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speech Pitch",
                    )

                # Generate button
                generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

            with gr.Column(scale=1):
                # Audio output
                audio_output = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    interactive=False,
                )

                # Status/Info
                status_text = gr.Textbox(
                    label="Status",
                    value="Ready to generate speech",
                    interactive=False,
                    lines=2,
                )

        # Example texts
        gr.Markdown("### 📝 Example Texts")
        gr.Examples(
            examples=[
                ["Hello! Welcome to our text-to-speech service."],
                ["The quick brown fox jumps over the lazy dog."],
                ["Artificial intelligence is revolutionizing how we interact with technology."],
                ["Thank you for using our TTS model. We hope you enjoy the generated speech!"],
            ],
            inputs=[text_input],
            label="Click on an example to try it",
        )

        # Event handlers
        def generate_and_update_status(text, speed, pitch):
            """Run TTS and return ``(audio_value, status_message)`` for the UI."""
            # `not text` also covers None, which would otherwise break on .strip().
            if not text or not text.strip():
                return None, "⚠️ Please enter some text to generate speech."

            try:
                result = text_to_speech(text, speed, pitch)
                if result is None:
                    return None, "❌ Error generating speech. Please try again."

                sample_rate, audio = result
                return (sample_rate, audio), f"✅ Speech generated successfully! Duration: {len(audio)/sample_rate:.2f} seconds"

            except Exception as e:
                return None, f"❌ Error: {str(e)}"

        generate_btn.click(
            generate_and_update_status,
            inputs=[text_input, speed_slider, pitch_slider],
            outputs=[audio_output, status_text],
        )

        # Auto-generate on Enter key (optional)
        text_input.submit(
            generate_and_update_status,
            inputs=[text_input, speed_slider, pitch_slider],
            outputs=[audio_output, status_text],
        )

    return demo


# Create and launch the interface
if __name__ == "__main__":
    demo = create_interface()

    # Launch the app
    demo.launch(
        server_name="0.0.0.0",  # Important for Hugging Face Spaces
        server_port=7860,  # Standard port for HF Spaces
        share=False,  # Set to True if testing locally
        show_error=True,
    )