import os
import re
import subprocess
import tempfile

import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForCausalLM

from yarngpt_utils import AudioTokenizer

# Sample rate (Hz) passed to torchaudio when writing the generated waveform.
SAMPLE_RATE = 24000


def download_if_not_exists(url, filename):
    """Download *url* to *filename* with wget unless the file already exists.

    The transfer is written to ``<filename>.part`` and only moved into place
    after wget exits successfully. ``wget -O`` creates its output file even
    when the transfer fails, so writing straight to *filename* would leave a
    stub that makes every later start skip the download.

    Args:
        url: Direct download URL of the file.
        filename: Local path the file is stored at.

    Raises:
        subprocess.CalledProcessError: If wget exits with a non-zero status.
        FileNotFoundError: If the ``wget`` executable cannot be found.
    """
    if os.path.exists(filename):
        return

    print(f"Downloading {filename}...")
    partial_path = f"{filename}.part"
    try:
        subprocess.run(["wget", url, "-O", partial_path], check=True)
        os.replace(partial_path, filename)
    finally:
        # After a successful os.replace the partial file is gone and this is a
        # no-op; after a failure it removes the incomplete download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded {filename}")


# Download necessary files
download_if_not_exists(
    "https://huggingface.co/novateur/WavTokenizer-medium-speech-75token/resolve/main/wavtokenizer_mediumdata_frame75_3s_nq1_code4096_dim512_kmeans200_attn.yaml",
    "wavtokenizer_config.yaml",
)
# NOTE: the URL must use ".../resolve/main/..."; the ".../blob/main/..." form
# serves the Hugging Face web page for the file instead of the file itself.
download_if_not_exists(
    "https://huggingface.co/novateur/WavTokenizer-large-speech-75token/resolve/main/wavtokenizer_large_speech_320_v2.ckpt",
    "wavtokenizer_model.ckpt",
)


# Initialize the model (this runs when the app starts)
def initialize_model():
    """Build the audio tokenizer and load the YarnGPT language model.

    Returns:
        A ``(model, audio_tokenizer)`` tuple. The model is moved to the device
        reported by ``audio_tokenizer.device``.
    """
    hf_path = "saheedniyi/YarnGPT"
    wav_tokenizer_config_path = "wavtokenizer_config.yaml"
    wav_tokenizer_model_path = "wavtokenizer_model.ckpt"

    audio_tokenizer = AudioTokenizer(
        hf_path, wav_tokenizer_model_path, wav_tokenizer_config_path
    )

    model = AutoModelForCausalLM.from_pretrained(hf_path, torch_dtype="auto").to(
        audio_tokenizer.device
    )

    return model, audio_tokenizer


# Generate audio from text
def generate_speech(text, speaker_name):
    """Synthesize *text* with the voice *speaker_name* and return a WAV path.

    Uses the module-level ``model`` and ``audio_tokenizer`` created at startup.

    Args:
        text: The text to speak.
        speaker_name: Name of the voice, passed to the tokenizer's prompt
            builder.

    Returns:
        Path of a newly written WAV file containing the generated audio.
    """
    prompt = audio_tokenizer.create_prompt(text, speaker_name)
    input_ids = audio_tokenizer.tokenize_prompt(prompt)

    output = model.generate(
        input_ids=input_ids,
        temperature=0.1,
        repetition_penalty=1.1,
        max_length=4000,
    )

    # Generated token ids -> audio codes -> waveform.
    codes = audio_tokenizer.get_codes(output)
    audio = audio_tokenizer.get_audio(codes)

    # Every call gets its own file: with one shared fixed name, overlapping
    # requests could overwrite each other's audio. The file is not deleted
    # here because the caller reads it after this function returns.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name
    torchaudio.save(output_path, audio, sample_rate=SAMPLE_RATE)

    return output_path


# Load model globally
print("Loading model...")
model, audio_tokenizer = initialize_model()
print("Model loaded!")

# Voices offered in the "Speaker" dropdown.
speakers = [
    "idera",
    "emma",
    "jude",
    "osagie",
    "tayo",
    "zainab",
    "joke",
    "regina",
    "remi",
    "umar",
    "chinenye",
]

# Create Gradio interface: a text box and a speaker selector feed
# generate_speech, whose returned file path is played back as audio.
demo = gr.Interface(
    fn=generate_speech,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter text here..."),
        gr.Dropdown(choices=speakers, label="Speaker", value="idera"),
    ],
    outputs=gr.Audio(type="filepath"),
    title="YarnGPT: Nigerian Accented Text-to-Speech",
    description="Generate natural-sounding Nigerian accented speech from text.",
)

demo.launch()