File size: 7,024 Bytes
e02a244
2875d75
 
 
 
 
b42ba1f
 
f6c8e89
d5920d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2875d75
d5920d2
 
2875d75
d5920d2
 
 
2875d75
d5920d2
 
 
 
 
2875d75
d5920d2
 
 
 
 
 
 
 
 
2875d75
d5920d2
 
 
2875d75
d5920d2
 
 
 
 
2875d75
d5920d2
 
2875d75
d5920d2
 
2875d75
 
d5920d2
 
 
 
 
 
 
2875d75
 
d5920d2
 
 
2875d75
 
 
d5920d2
2875d75
b42ba1f
2875d75
 
 
d5920d2
2875d75
 
 
d5920d2
2875d75
 
 
d5920d2
2875d75
d5920d2
 
2875d75
d5920d2
2875d75
d5920d2
2875d75
d5920d2
 
 
 
 
 
2875d75
 
d5920d2
2875d75
 
 
d5920d2
 
 
 
 
 
 
 
 
2875d75
d5920d2
2875d75
 
 
 
 
 
 
d5920d2
 
 
 
 
 
 
 
2875d75
 
 
d5920d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2875d75
d5920d2
 
 
2875d75
 
d5920d2
2875d75
d5920d2
 
 
2875d75
 
d5920d2
b42ba1f
d5920d2
e02a244
d5920d2
 
 
 
 
 
 
 
2875d75
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import soundfile as sf
import io
import tempfile
import os

# Load your fine-tuned model
MODEL_NAME = "m3nnoun/lora_model_semantic"

def load_model():
    """Load the fine-tuned TTS tokenizer and model from the Hub.

    Returns:
        tuple: (tokenizer, model) with the model switched to eval mode,
        or (None, None) if anything goes wrong during loading.
    """
    try:
        # NOTE(review): AutoTokenizer/AutoModel are generic loaders -- verify
        # they match this checkpoint's actual architecture.
        tok = AutoTokenizer.from_pretrained(MODEL_NAME)
        net = AutoModel.from_pretrained(MODEL_NAME)
        net.eval()  # inference only: disable dropout/batch-norm updates
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None
    return tok, net

# Initialize model/tokenizer once at import time.
# Both are None when loading failed; text_to_speech checks for that.
tokenizer, model = load_model()

def text_to_speech(text, voice_speed=1.0, voice_pitch=1.0):
    """
    Convert text to speech using the fine-tuned model.

    Args:
        text (str): Input text to convert to speech.
        voice_speed (float): Playback speed factor; values > 1.0 shorten the
            audio via linear-interpolation resampling.
        voice_pitch (float): Accepted for interface compatibility but NOT
            currently applied -- pitch shifting is not implemented below.

    Returns:
        tuple | None: (sample_rate, audio_array) suitable for a Gradio
        numpy-audio output, or None on empty input, missing model, or a
        generation error.
    """
    if not text.strip():
        return None

    # Model loading may have failed at import time.
    if tokenizer is None or model is None:
        return None

    try:
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():
            # NOTE(review): placeholder forward pass -- the correct call and
            # output attribute depend on the actual model architecture; confirm
            # against the checkpoint before relying on this.
            outputs = model(**inputs)

            if hasattr(outputs, 'audio'):
                audio = outputs.audio
            elif hasattr(outputs, 'waveform'):
                audio = outputs.waveform
            else:
                # Fallback: presumably the hidden state is the waveform --
                # TODO confirm for this model.
                audio = outputs.last_hidden_state

        if torch.is_tensor(audio):
            audio = audio.squeeze().cpu().numpy()

        # Crude speed change: resample by stepping through the signal at
        # `voice_speed` and linearly interpolating the samples.
        if voice_speed != 1.0:
            indices = np.arange(0, len(audio), voice_speed)
            audio = np.interp(indices, np.arange(len(audio)), audio)

        audio = np.array(audio, dtype=np.float32)

        # Peak-normalize to [-1, 1]; guard against an all-zero (silent)
        # signal, which would otherwise divide by zero and produce NaNs.
        if len(audio) > 0:
            peak = np.max(np.abs(audio))
            if peak > 0:
                audio = audio / peak

        sample_rate = 22050  # presumably the model's native rate -- TODO confirm
        return sample_rate, audio

    except Exception as e:
        print(f"Error in text_to_speech: {e}")
        return None

def create_interface():
    """Build and return the Gradio Blocks UI for the TTS demo."""

    with gr.Blocks(title="TTS Model - Text to Speech", theme=gr.themes.Soft()) as demo:
        # Page header.
        gr.Markdown(
            """
            # πŸŽ™οΈ Text-to-Speech Generator
            Enter your text below and generate high-quality speech using our fine-tuned TTS model.
            """
        )

        # Left column: inputs and controls; right column: output + status.
        with gr.Row():
            with gr.Column(scale=2):
                txt_in = gr.Textbox(
                    label="Enter Text",
                    placeholder="Type the text you want to convert to speech...",
                    lines=4,
                    max_lines=10,
                )

                with gr.Row():
                    speed_ctl = gr.Slider(
                        minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                        label="Speech Speed",
                    )
                    pitch_ctl = gr.Slider(
                        minimum=0.5, maximum=2.0, value=1.0, step=0.1,
                        label="Speech Pitch",
                    )

                run_btn = gr.Button("🎡 Generate Speech", variant="primary", size="lg")

            with gr.Column(scale=1):
                audio_out = gr.Audio(
                    label="Generated Speech",
                    type="numpy",
                    interactive=False,
                )
                status_box = gr.Textbox(
                    label="Status",
                    value="Ready to generate speech",
                    interactive=False,
                    lines=2,
                )

        # Clickable example prompts.
        gr.Markdown("### πŸ“ Example Texts")
        gr.Examples(
            examples=[
                ["Hello! Welcome to our text-to-speech service."],
                ["The quick brown fox jumps over the lazy dog."],
                ["Artificial intelligence is revolutionizing how we interact with technology."],
                ["Thank you for using our TTS model. We hope you enjoy the generated speech!"],
            ],
            inputs=[txt_in],
            label="Click on an example to try it",
        )

        def _run_tts(text, speed, pitch):
            """Run TTS and return (audio, status message) for the UI."""
            if not text.strip():
                return None, "⚠️ Please enter some text to generate speech."

            try:
                result = text_to_speech(text, speed, pitch)
                if result is None:
                    return None, "❌ Error generating speech. Please try again."
                sr, wav = result
                return (sr, wav), f"βœ… Speech generated successfully! Duration: {len(wav)/sr:.2f} seconds"
            except Exception as e:
                return None, f"❌ Error: {str(e)}"

        # Both the button and pressing Enter in the textbox trigger generation.
        run_btn.click(
            _run_tts,
            inputs=[txt_in, speed_ctl, pitch_ctl],
            outputs=[audio_out, status_box],
        )
        txt_in.submit(
            _run_tts,
            inputs=[txt_in, speed_ctl, pitch_ctl],
            outputs=[audio_out, status_box],
        )

    return demo

# Create and launch the interface
if __name__ == "__main__":
    # Build the UI and serve it on the standard Hugging Face Spaces setup.
    app = create_interface()
    app.launch(
        server_name="0.0.0.0",  # bind all interfaces (required on HF Spaces)
        server_port=7860,       # standard port for HF Spaces
        share=False,            # flip to True for a temporary public link locally
        show_error=True,
    )