import os
import shutil

# Clear the Hugging Face cache at startup so the model checkpoint is
# downloaded fresh on each run.
hf_cache = os.path.expanduser("~/.cache/huggingface")
if os.path.exists(hf_cache):
    shutil.rmtree(hf_cache)

import gradio as gr
import numpy as np
import torchaudio
from transformers import pipeline

# Load the Nigerian Pidgin ASR model via the transformers pipeline.
transcriber = pipeline(
    "automatic-speech-recognition",
    model="asr-nigerian-pidgin/pidgin-wav2vec2-xlsr53",
)


def transcribe(audio_filepath):
    """Transcribe the audio file at the path handed over by gr.Audio."""
    if audio_filepath is None:
        return ""
    # Load the file, downmix stereo to mono, and peak-normalize.
    waveform, sr = torchaudio.load(audio_filepath)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    audio = waveform.squeeze().numpy().astype(np.float32)
    audio /= np.max(np.abs(audio)) + 1e-9  # epsilon guards against silent clips
    # The pipeline resamples raw input to the model's expected sampling
    # rate (16 kHz for wav2vec2) when it differs from sr.
    return transcriber({"sampling_rate": sr, "raw": audio})["text"]


# Define the Gradio UI components
with gr.Blocks() as demo:
    gr.Markdown("# 🗣️ Nigerian Pidgin ASR Demo")
    gr.Markdown(
        """Upload or record audio in Nigerian Pidgin to get a transcription.
This demo uses the Nigerian Pidgin ASR checkpoint
[Pidgin-Wav2Vec2-XLSR53](https://huggingface.co/asr-nigerian-pidgin/pidgin-wav2vec2-xlsr53)
and 🤗 Transformers to transcribe audio clips of up to 30 seconds.
"""
    )
    with gr.Column():
        audio_in = gr.Audio(
            label="🎤 Record or upload your audio",
            type="filepath",
            sources=["upload", "microphone"],
            interactive=True,
            min_length=1,
            max_length=31,  # advertised 30 s limit plus 1 s of slack
        )
        with gr.Row():
            submit_btn = gr.Button("Submit")
            clear_btn = gr.Button("Clear")
    with gr.Column():
        transcription_txt = gr.Textbox(
            label="📝 Transcription",
            interactive=False,
            show_label=True,
            show_copy_button=True,
        )
        with gr.Row():
            flag_btn = gr.Button("🚩 Flag this output as incorrect", size="sm")
            share_btn = gr.Button("🔗 Share", size="sm")  # not wired to any action yet

    # Button wiring:
    submit_btn.click(fn=transcribe, inputs=audio_in, outputs=transcription_txt)
    # Flagging replaces the transcription with a short acknowledgement.
    flag_btn.click(
        fn=lambda: "Thank you for your feedback.",
        inputs=None,
        outputs=transcription_txt,
    )
    clear_btn.click(
        fn=lambda: (None, ""),
        inputs=None,
        outputs=[audio_in, transcription_txt],
    )

demo.launch(share=True)
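# ---------------------------------------------------------------------------
# A minimal sketch of the dependencies this script implies via its imports,
# e.g. for a requirements.txt next to app.py on a Hugging Face Space. The
# list is an assumption inferred from the code above, not taken from the
# original source; torchaudio may additionally need an audio backend such
# as soundfile or ffmpeg for torchaudio.load to read uploaded files.
#
#   gradio
#   transformers
#   torch
#   torchaudio
#   numpy
# ---------------------------------------------------------------------------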