import gradio as gr
from transformers import pipeline
import torch
import torchaudio
# On SpeechBrain >= 1.0 this import has moved to speechbrain.inference
from speechbrain.pretrained import EncoderClassifier

# Set up pipe for Whisper ASR
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    torch_dtype=torch.float32,
    device="cpu",
)

# Set up pipes for 2 phonemic transcription models
american_phoneme_pipe = pipeline(
    "automatic-speech-recognition",
    model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme",
)
esl_phoneme_pipe = pipeline(
    "automatic-speech-recognition",
    model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme",
)

# Set up pipes for 2 accent classification models; loading both once at import
# time avoids re-instantiating a model on every request
classifier = EncoderClassifier.from_hparams(
    source="Jzuluaga/accent-id-commonaccent_ecapa",
    savedir="pretrained_models/accent-id-commonaccent_ecapa",
)
esl_accent_pipe = pipeline(
    "audio-classification",
    model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2",
)


def native_accent_classifier(file):
    """Classify a native English speaker's accent by country."""
    out_prob, score, index, text_lab = classifier.classify_file(file)
    # .item() converts the 0-dim tensor to a plain float so round() works
    return [{'accent': text_lab[0], 'score': round(score[0].item(), 2)}]


def esl_accent_classifier(file):
    """Classify the likely first language of a non-native English speaker."""
    audio, sr = torchaudio.load(file)  # Load audio from disk
    # Resample to the 16 kHz rate the model expects
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
    audio = audio.squeeze().numpy()
    result = esl_accent_pipe(audio, top_k=6)
    return [{'accent': result[0]['label'], 'score': round(result[0]['score'], 2)}]


def transcribe_and_classify_speech(audio):
    """Run ASR, both phonemic transcriptions, and both accent classifiers."""
    try:
        asr_output = asr_pipe(
            audio,
            max_new_tokens=256,
            chunk_length_s=30,
            batch_size=8,
        )["text"]
    except Exception as e:
        print(f"An error occurred with openai/whisper-base.en: {e}")
        asr_output = "Error, make sure your file is in mono format"

    try:
        american_phoneme_output = american_phoneme_pipe(audio)['text']
    except Exception as e:
        print(f"An error occurred with wav2vec2-xls-r-300m-timit-phoneme: {e}")
        american_phoneme_output = "Error, make sure your file is in mono format"

    try:
        esl_phoneme_output = esl_phoneme_pipe(audio)['text']
    except Exception as e:
        print(f"An error occurred with mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme: {e}")
        esl_phoneme_output = "Error, make sure your file is in mono format"

    try:
        native_accent_output = native_accent_classifier(audio)
    except Exception as e:
        print(f"An error occurred with Jzuluaga/accent-id-commonaccent_ecapa: {e}")
        native_accent_output = [{'accent': 'Unknown - please upload single-channel audio', 'score': 0.0}]

    try:
        esl_accent_output = esl_accent_classifier(audio)
    except Exception as e:
        print(f"An error occurred with kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2: {e}")
        esl_accent_output = [{'accent': 'Unknown - please upload single-channel audio', 'score': 0.0}]

    output = [
        {'transcription': asr_output},
        {'phonemes_native_eng': american_phoneme_output},
        {'phonemes_eng_second_lang': esl_phoneme_output},
        {'native_eng_country': native_accent_output},
        {'first_lang_if_not_eng': esl_accent_output},
    ]
    return output


demo = gr.Blocks()

examples = [
    ['chinese-american.wav'],
    ['mexican.wav'],
    ['vietnamese.wav'],
    ['indian.wav'],
    ['nigerian.wav'],
]

mic_transcribe = gr.Interface(
    fn=transcribe_and_classify_speech,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs=gr.components.Textbox(),
    examples=examples,
)

file_transcribe = gr.Interface(
    fn=transcribe_and_classify_speech,
    inputs=gr.Audio(sources="upload", type="filepath"),
    outputs=gr.components.Textbox(),
    examples=examples,
)

# Launch gradio app
with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

demo.launch(debug=True)
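# Quick smoke test without the UI (a minimal sketch, assuming one of the
# bundled example files such as 'mexican.wav' is present in the working
# directory); this returns the same list of dicts the Gradio textbox displays:
#
#   result = transcribe_and_classify_speech('mexican.wav')
#   print(result)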