import os
import re

import gradio as gr
import soundfile as sf
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier  # moved to speechbrain.inference in SpeechBrain >= 1.0

# Define paths and device
model_path = "HAMMALE/speecht5-darija"  # Path to your model on the HF Hub
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the fine-tuned SpeechT5 model, its processor, and the HiFi-GAN vocoder
processor = SpeechT5Processor.from_pretrained(model_path)
model = SpeechT5ForTextToSpeech.from_pretrained(model_path).to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Load the x-vector speaker encoder (used to compute speaker embeddings)
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir=os.path.join("/tmp", "spkrec-xvect-voxceleb"),
)

# Load pre-computed speaker embeddings; fall back to random vectors (arbitrary voice) if missing
male_embedding = (
    torch.load("male_embedding.pt", map_location="cpu")
    if os.path.exists("male_embedding.pt")
    else torch.randn(1, 512)
)
female_embedding = (
    torch.load("female_embedding.pt", map_location="cpu")
    if os.path.exists("female_embedding.pt")
    else torch.randn(1, 512)
)


def normalize_text(text):
    """Normalize text for TTS processing."""
    text = text.lower()
    # Keep word characters, whitespace, apostrophes, and Arabic-block characters
    text = re.sub(r"[^\w\s'\u0600-\u06FF]", "", text)
    text = " ".join(text.split())
    return text


def synthesize_speech(text, voice_type="male", speed=1.0):
    """Generate speech from text using the specified voice type."""
    try:
        # Select the speaker embedding for the requested voice
        if voice_type == "male":
            speaker_embeddings = male_embedding.to(device)
        else:
            speaker_embeddings = female_embedding.to(device)

        # Normalize and tokenize the input text
        normalized_text = normalize_text(text)
        inputs = processor(text=normalized_text, return_tensors="pt").to(device)

        # Generate the waveform
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings,
                vocoder=vocoder,
            )

        speech_np = speech.cpu().numpy()

        # Apply speed adjustment by simple resampling; note this also shifts pitch,
        # so use a proper time-stretching library for production
        if speed != 1.0:
            from scipy import signal

            new_length = int(len(speech_np) / speed)
            speech_np = signal.resample(speech_np, new_length)

        # Save the audio at the model's 16 kHz sampling rate
        output_file = "output_speech.wav"
        sf.write(output_file, speech_np, 16000)

        return output_file, None
    except Exception as e:
        return None, f"Error generating speech: {str(e)}"
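
# The x-vector encoder loaded above is what produces the speaker embeddings that
# `synthesize_speech` consumes, but the script only reads pre-computed
# "male_embedding.pt" / "female_embedding.pt" files. The helper below is a minimal,
# illustrative sketch of how such a file could be created from a 16 kHz mono
# reference clip; the file name "reference_male.wav" is a hypothetical example,
# not part of the released model.
def compute_speaker_embedding(wav_path, output_path):
    """Encode a reference recording into a (1, 512) x-vector and save it."""
    waveform, sample_rate = sf.read(wav_path, dtype="float32")
    assert sample_rate == 16000, "The x-vector encoder expects 16 kHz audio"
    with torch.no_grad():
        embedding = speaker_model.encode_batch(torch.tensor(waveform).unsqueeze(0).to(device))
        embedding = torch.nn.functional.normalize(embedding, dim=2).squeeze(1).cpu()
    torch.save(embedding, output_path)
    return embedding

# Example usage (hypothetical reference file):
# compute_speaker_embedding("reference_male.wav", "male_embedding.pt")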

# Custom CSS for the Gradio interface
custom_css = """
.gradio-container { font-family: 'Poppins', 'Arial', sans-serif; max-width: 750px; margin: auto; }
.main-header { background: linear-gradient(90deg, #c31432, #240b36); color: white; padding: 1.5em; border-radius: 10px; text-align: center; margin-bottom: 1em; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); }
.main-header h1 { font-size: 2.2em; margin-bottom: 0.3em; }
.main-header p { font-size: 1.1em; opacity: 0.9; }
footer { text-align: center; margin-top: 2em; color: #555; font-size: 0.9em; }
.flag-icon { width: 24px; height: 24px; vertical-align: middle; margin-right: 8px; }
.example-header { font-weight: bold; color: #c31432; margin-top: 1em; }
.info-box { background-color: #f9f9f9; border-left: 4px solid #c31432; padding: 1em; margin: 1em 0; border-radius: 5px; }
.voice-selector { display: flex; justify-content: center; gap: 20px; margin: 10px 0; }
.voice-option { border: 2px solid #ddd; border-radius: 10px; padding: 10px 15px; transition: all 0.3s ease; cursor: pointer; }
.voice-option.selected { border-color: #c31432; background-color: #fff5f5; }
.slider-container { margin: 20px 0; }
"""

# Build the Gradio interface
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(
        """
        <div class="main-header">
            <h1>🇲🇦 Moroccan Darija Text-to-Speech 🎧</h1>
            <p>Convert Moroccan Arabic (Darija) text into natural-sounding speech</p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column():
            gr.HTML(
                """
                <div class="info-box">
                    This model was fine-tuned on the DODa audio dataset to produce
                    high-quality Darija speech from text input. You can adjust the
                    voice and speed below.
                </div>
                """
            )

            text_input = gr.Textbox(
                label="Enter Darija Text",
                placeholder="Kteb chi jomla b darija hna...",
                lines=3
            )

            with gr.Row():
                voice_type = gr.Radio(
                    ["male", "female"],
                    label="Voice Type",
                    value="male"
                )
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed"
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

            gr.HTML('<p class="example-header">Example phrases:</p>')

        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")
            error_output = gr.Textbox(label="Error (if any)", visible=False)

    gr.Examples(
        examples=[
            ["Ana Nadi Bezzaaf hhh", "male", 1.0],
            ["Lyoum ajwaa zwina bezzaf.", "female", 1.0],
            ["lmaghrib ahssan blad fi l3alam", "male", 1.0],
            ["Filistine hora mina lbar ila lbahr", "female", 0.8],
        ],
        inputs=[text_input, voice_type, speed],
        outputs=[audio_output, error_output],
        fn=synthesize_speech
    )

    # Footer placeholder (styled by the `footer` CSS rule above)
    gr.HTML("<footer></footer>")

    # Wire the button to the synthesis function
    generate_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_type, speed],
        outputs=[audio_output, error_output]
    )

# Launch the demo
if __name__ == "__main__":
    demo.launch()