import gradio as gr
import os
import subprocess
API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"
try:
    from transformers import pipeline
except ModuleNotFoundError:
    print("Installing transformers...")
    subprocess.check_call(["pip", "install", "transformers"])
    from transformers import pipeline  # Retry import
import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier
# Set up pipe for whisper asr
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    torch_dtype=torch.float32,
    device="cpu",
)
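# Illustrative only (not executed): the ASR pipe can be called directly on a mono .wav path,
# e.g. asr_pipe("sample.wav", chunk_length_s=30)["text"]; "sample.wav" is a placeholder file name.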
# Set up pipe for 2 phonemic transcription models
american_phoneme_pipe = pipeline("automatic-speech-recognition", model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
esl_phoneme_pipe = pipeline("automatic-speech-recognition", model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme")
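# Illustrative only (not executed): each phoneme pipe returns a dict whose 'text' field holds the
# phonemic transcription, e.g. american_phoneme_pipe("sample.wav")['text']; the file name is a placeholder.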
# Set up the first of 2 accent classification models (the ESL classifier is loaded inside esl_accent_classifier below)
classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa", savedir="pretrained_models/accent-id-commonaccent_ecapa")
def native_accent_classifier(file):
    out_prob, score, index, text_lab = classifier.classify_file(file)
    rounded_score = round(score.item(), 2)
    return [{'accent': text_lab[0], 'score': rounded_score}]
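# Example return shape (values are illustrative, not real model output): [{'accent': 'us', 'score': 0.87}]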
def esl_accent_classifier(file):
    esl_accent_pipe = pipeline(
        "audio-classification",
        model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2"
    )
    audio, sr = torchaudio.load(file)  # Load audio
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)  # Resample to 16 kHz
    audio = audio.squeeze().numpy()
    result = esl_accent_pipe(audio, top_k=6)  # Classify; only the top prediction is returned below
    return [{'accent': result[0]['label'], 'score': round(result[0]['score'], 2)}]
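# Example return shape (label and score are illustrative, not real model output): [{'accent': 'Vietnamese', 'score': 0.74}]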
def transcribe_and_classify_speech(file, key):
    if key != API_KEY:
        raise gr.Error("Invalid API key.")
    # Orthographic transcription with Whisper
    try:
        asr_output = asr_pipe(
            file,
            max_new_tokens=256,
            chunk_length_s=30,
            batch_size=8,
        )["text"]
    except Exception as e:
        print(f"An error occurred with openai/whisper-base.en: {e}")
        asr_output = "Error, make sure your file is in mono format"
    # Phonemic transcription from the model trained on native English speakers
    try:
        american_phoneme_output = american_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with vitouphy/wav2vec2-xls-r-300m-timit-phoneme: {e}")
        american_phoneme_output = "Error, make sure your file is in mono format"
    # Phonemic transcription from the model trained on speakers of English as a second language
    try:
        esl_phoneme_output = esl_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme: {e}")
        esl_phoneme_output = "Error"
    # Accent classification (native English accents)
    try:
        native_accent_output = native_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with Jzuluaga/accent-id-commonaccent_ecapa: {e}")
        native_accent_output = [{'accent': 'Error', 'score': 0.0}]
    # Accent classification (first language of ESL speakers)
    try:
        esl_accent_output = esl_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2: {e}")
        esl_accent_output = [{'accent': 'Unknown-please upload single channel audio', 'score': 0.0}]
    output = [
        {'transcription': asr_output},
        {'phonemes_native_eng': american_phoneme_output},
        {'phonemes_eng_second_lang': esl_phoneme_output},
        {'native_eng_country': native_accent_output},
        {'first_lang_if_not_eng': esl_accent_output}
    ]
    return output
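# Example of the combined output (illustrative values only), as rendered by the gr.JSON component below:
# [{'transcription': 'hello world'},
#  {'phonemes_native_eng': 'h ɛ l oʊ w ɝ l d'},
#  {'phonemes_eng_second_lang': 'h ɛ l oʊ w ɝ l d'},
#  {'native_eng_country': [{'accent': 'us', 'score': 0.87}]},
#  {'first_lang_if_not_eng': [{'accent': 'Vietnamese', 'score': 0.74}]}]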
## Set up gradio app
demo = gr.Blocks()
examples = [['chinese-american.wav'], ['mexican.wav'], ['vietnamese.wav'], ['indian.wav'], ['nigerian.wav'], ['irish.wav']]
# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
    with gr.Blocks() as interface:
        gr.Markdown("""
Use microphone, upload .wav file, or choose an example below. Output will include results from the following models:
- Transcription from OpenAI's Whisper [openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en)
- Phonemic transcription trained on native English speakers [vitouphy/wav2vec2-xls-r-300m-timit-phoneme](https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-timit-phoneme)
- Phonemic transcription trained on speakers of English as a second language [mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme](https://huggingface.co/mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme)
- Accent classification trained on native English speakers [Jzuluaga/accent-id-commonaccent_ecapa](https://huggingface.co/Jzuluaga/accent-id-commonaccent_ecapa)
- Accent classification trained on speakers of English as a second language [kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2](https://huggingface.co/kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2)
        """)
        with gr.Column():
            audio_input = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            output = gr.JSON(label="Results")
            api_key_input = gr.Textbox(label="API Key", type="password")
        audio_input.change(fn=transcribe_and_classify_speech, inputs=[audio_input, api_key_input], outputs=output)
        gr.Examples(examples=examples, inputs=[audio_input])
    return interface
# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")
demo = gr.TabbedInterface(
[mic_transcribe, file_transcribe],
["Microphone Input", "Upload .wav file"],
title="Speech Recognition and Accent Classification",
)
# demo.launch()
# demo.launch(debug=True)
# demo.launch(strict_cors=False, share=True) # works with front end but insecure
demo.launch(share=True)
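# A minimal client-side sketch, assuming the `gradio_client` package and a deployed copy of this
# Space at "<user>/<space-name>" (placeholder). The api_name and the need for handle_file() depend
# on the Gradio version, so treat this as an illustration rather than a guaranteed call signature:
#
#   from gradio_client import Client, handle_file
#   client = Client("<user>/<space-name>")
#   result = client.predict(handle_file("sample.wav"), "<your-api-key>",
#                           api_name="/transcribe_and_classify_speech")
#   print(result)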