File size: 7,024 Bytes
e02a244
2875d75
 
 
 
 
b42ba1f
 
f6c8e89
d5920d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2875d75
d5920d2
 
2875d75
d5920d2
 
 
2875d75
d5920d2
 
 
 
 
2875d75
d5920d2
 
 
 
 
 
 
 
 
2875d75
d5920d2
 
 
2875d75
d5920d2
 
 
 
 
2875d75
d5920d2
 
2875d75
d5920d2
 
2875d75
 
d5920d2
 
 
 
 
 
 
2875d75
 
d5920d2
 
 
2875d75
 
 
d5920d2
2875d75
b42ba1f
2875d75
 
 
d5920d2
2875d75
 
 
d5920d2
2875d75
 
 
d5920d2
2875d75
d5920d2
 
2875d75
d5920d2
2875d75
d5920d2
2875d75
d5920d2
 
 
 
 
 
2875d75
 
d5920d2
2875d75
 
 
d5920d2
 
 
 
 
 
 
 
 
2875d75
d5920d2
2875d75
 
 
 
 
 
 
d5920d2
 
 
 
 
 
 
 
2875d75
 
 
d5920d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2875d75
d5920d2
 
 
2875d75
 
d5920d2
2875d75
d5920d2
 
 
2875d75
 
d5920d2
b42ba1f
d5920d2
e02a244
d5920d2
 
 
 
 
 
 
 
2875d75
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import soundfile as sf
import io
import tempfile
import os

# Load your fine-tuned model
MODEL_NAME = "m3nnoun/lora_model_semantic"

def load_model():
    """Load the fine-tuned TTS tokenizer and model from the Hub.

    Returns:
        tuple: (tokenizer, model) with the model switched to eval mode,
        or (None, None) if anything goes wrong during loading.
    """
    try:
        # NOTE(review): AutoTokenizer/AutoModel are generic loaders -- verify
        # they match this checkpoint's actual architecture.
        tok = AutoTokenizer.from_pretrained(MODEL_NAME)
        net = AutoModel.from_pretrained(MODEL_NAME)
        net.eval()  # inference only: disable dropout/batch-norm updates
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None
    return tok, net

# Initialize model/tokenizer once at import time.
# Both are None when loading failed; text_to_speech checks for that.
tokenizer, model = load_model()

def text_to_speech(text, voice_speed=1.0, voice_pitch=1.0):
    """
    Convert text to speech using the fine-tuned model.

    Args:
        text (str): Input text to convert to speech.
        voice_speed (float): Playback speed factor; values > 1.0 shorten the
            audio via linear-interpolation resampling.
        voice_pitch (float): Accepted for interface compatibility but NOT
            currently applied -- pitch shifting is not implemented below.

    Returns:
        tuple | None: (sample_rate, audio_array) suitable for a Gradio
        numpy-audio output, or None on empty input, missing model, or a
        generation error.
    """
    if not text.strip():
        return None

    # Model loading may have failed at import time.
    if tokenizer is None or model is None:
        return None

    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():
            # NOTE(review): placeholder forward pass -- the correct call and
            # output attribute depend on the actual model architecture; confirm
            # against the checkpoint before relying on this.
            outputs = model(**inputs)

            if hasattr(outputs, 'audio'):
                audio = outputs.audio
            elif hasattr(outputs, 'waveform'):
                audio = outputs.waveform
            else:
                # Fallback: presumably the hidden state is the waveform --
                # TODO confirm for this model.
                audio = outputs.last_hidden_state

        if torch.is_tensor(audio):
            audio = audio.squeeze().cpu().numpy()

        # Crude speed change: resample by stepping through the signal at
        # `voice_speed` and linearly interpolating the samples.
        if voice_speed != 1.0:
            indices = np.arange(0, len(audio), voice_speed)
            audio = np.interp(indices, np.arange(len(audio)), audio)

        audio = np.array(audio, dtype=np.float32)

        # Peak-normalize to [-1, 1]; guard against an all-zero (silent)
        # signal, which would otherwise divide by zero and produce NaNs.
        if len(audio) > 0:
            peak = np.max(np.abs(audio))
            if peak > 0:
                audio = audio / peak

        sample_rate = 22050  # presumably the model's native rate -- TODO confirm
        return sample_rate, audio

    except Exception as e:
        print(f"Error in text_to_speech: {e}")
        return None

def create_interface():
    """Build and return the Gradio Blocks UI for the TTS demo."""

    with gr.Blocks(title="TTS Model - Text to Speech", theme=gr.themes.Soft()) as demo:
        # Page header.
        gr.Markdown(
            """
            # πŸŽ™οΈ Text-to-Speech Generator
            Enter your text below and generate high-quality speech using our fine-tuned TTS model.
            """
        )

        # Left column: inputs and controls; right column: output + status.
        with gr.Row():
            with gr.Column(scale=2):
                txt_in = gr.Textbox(
                    label="Enter Text",
                    placeholder="Type the text you want to convert to speech...",
                    lines=4,
                    max_lines=10,
                )

                with gr.Row():
                    speed_ctl = gr.Slider(
                        minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                        label="Speech Speed",
                    )
                    pitch_ctl = gr.Slider(
                        minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                        label="Speech Pitch",
                    )

                run_btn = gr.Button("🎡 Generate Speech", variant="primary", size="lg")

            with gr.Column(scale=1):
                audio_out = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    interactive=False,
                )
                status_box = gr.Textbox(
                    label="Status",
                    value="Ready to generate speech",
                    interactive=False,
                    lines=2,
                )

        # Clickable example prompts.
        gr.Markdown("### πŸ“ Example Texts")
        gr.Examples(
            examples=[
                ["Hello! Welcome to our text-to-speech service."],
                ["The quick brown fox jumps over the lazy dog."],
                ["Artificial intelligence is revolutionizing how we interact with technology."],
                ["Thank you for using our TTS model. We hope you enjoy the generated speech!"],
            ],
            inputs=[txt_in],
            label="Click on an example to try it",
        )

        def _run_tts(text, speed, pitch):
            """Run TTS and return (audio, status message) for the UI."""
            if not text.strip():
                return None, "⚠️ Please enter some text to generate speech."

            try:
                result = text_to_speech(text, speed, pitch)
                if result is None:
                    return None, "❌ Error generating speech. Please try again."
                sr, wav = result
                return (sr, wav), f"βœ… Speech generated successfully! Duration: {len(wav)/sr:.2f} seconds"
            except Exception as e:
                return None, f"❌ Error: {str(e)}"

        # Both the button and pressing Enter in the textbox trigger generation.
        run_btn.click(
            _run_tts,
            inputs=[txt_in, speed_ctl, pitch_ctl],
            outputs=[audio_out, status_box],
        )
        txt_in.submit(
            _run_tts,
            inputs=[txt_in, speed_ctl, pitch_ctl],
            outputs=[audio_out, status_box],
        )

    return demo

# Create and launch the interface
if __name__ == "__main__":
    # Build the UI and serve it on the standard Hugging Face Spaces setup.
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",  # bind all interfaces (required on HF Spaces)
        server_port=7860,       # standard port for HF Spaces
        share=False,            # flip to True for a temporary public link locally
        show_error=True,
    )