import gradio as gr
import subprocess
import sys

API_KEY = "682d2362-894c-800c-af30-a4c56b7f074b"

try:
    from transformers import pipeline
except ModuleNotFoundError:
    print("Installing transformers...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])
    from transformers import pipeline  # Retry import

import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier

# Set up pipe for whisper asr
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    torch_dtype=torch.float32,
    device="cpu",
)
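
# Quick sanity check of the ASR pipe (illustrative only; 'mexican.wav' is one of
# the example files listed further down in this app):
# print(asr_pipe("mexican.wav")["text"])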

# Set up pipes for the two phonemic transcription models
american_phoneme_pipe = pipeline("automatic-speech-recognition", model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
esl_phoneme_pipe = pipeline("automatic-speech-recognition", model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme")

# Set up the 2 accent classification models (native English accents and L2 English accents)
classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa", savedir="pretrained_models/accent-id-commonaccent_ecapa")
esl_accent_pipe = pipeline(
    "audio-classification",
    model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2"
)

def native_accent_classifier(file):
  out_prob, score, index, text_lab = classifier.classify_file(file)
  rounded_score = round(score.item(), 2)
  return [{'accent': text_lab[0], 'score': rounded_score}]

def esl_accent_classifier(file):
  audio, sr = torchaudio.load(file)  # Load audio
  audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)  # Model expects 16 kHz
  audio = audio.mean(dim=0).numpy()  # Downmix to mono in case the file has more than one channel
  result = esl_accent_pipe(audio, top_k=6)
  return [{'accent': result[0]['label'], 'score': round(result[0]['score'],2)}]

def transcribe_and_classify_speech(file, key):
  if key != API_KEY:
     raise gr.Error("Invalid API key.")

  try:
      asr_output = asr_pipe(
        file,
        max_new_tokens=256,
        chunk_length_s=30,
        batch_size=8,
      )["text"]
  except Exception as e:
    print(f"An error occurred with openai/whisper-base.en: {e}")
    asr_output = "Error, make sure your file is in mono format"

  try:
    american_phoneme_output = american_phoneme_pipe(file)['text']
  except Exception as e:
    print(f"An error occurred with wav2vec2-xls-r-300m-timit-phoneme: {e}")
    american_phoneme_output = "Error, make sure your file is in mono format"

  try:
    esl_phoneme_output = esl_phoneme_pipe(file)['text']
  except Exception as e:
    print(f"An error occurred with mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme: {e}")
    esl_phoneme_output = "Error"

  try:
    native_accent_output = native_accent_classifier(file)
  except Exception as e:
    print(f"An error occurred with Jzuluaga/accent-id-commonaccent_ecapa: {e}")
    native_accent_output = [{'accent': 'Error', 'score': 0.0}]

  try:
    esl_accent_output = esl_accent_classifier(file)
  except Exception as e:
    print(f"An error occurred with kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2: {e}")
    esl_accent_output = [{'accent': 'Unknown-please upload single channel audio', 'score': 0.0}]

  output = [
      {'transcription': asr_output},
      {'phonemes_native_eng': american_phoneme_output},
      {'phonemes_eng_second_lang': esl_phoneme_output},
      {'native_eng_country': native_accent_output},
      {'first_lang_if_not_eng': esl_accent_output}
  ]
  return output
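
# For reference, the returned list has roughly this shape (values below are
# invented for illustration, not real model output):
# [
#     {'transcription': 'hello world'},
#     {'phonemes_native_eng': 'h ə l oʊ w ɝ l d'},
#     {'phonemes_eng_second_lang': 'h ə l oʊ w ɝ l d'},
#     {'native_eng_country': [{'accent': 'us', 'score': 0.97}]},
#     {'first_lang_if_not_eng': [{'accent': 'mandarin', 'score': 0.85}]}
# ]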

## Set up gradio app

examples = [['chinese-american.wav'], ['mexican.wav'], ['vietnamese.wav'], ['indian.wav'], ['nigerian.wav'], ['irish.wav']]

# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use the microphone, upload a .wav file, or choose an example below. The output includes results from the following models:
          - Transcription from OpenAI's Whisper [openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en)
          - Phonemic transcription trained on native English speakers [vitouphy/wav2vec2-xls-r-300m-timit-phoneme](https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-timit-phoneme)
          - Phonemic transcription trained on speakers of English as a second language [mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme](https://huggingface.co/mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme)
          - Accent classification trained on native English speakers [Jzuluaga/accent-id-commonaccent_ecapa](https://huggingface.co/Jzuluaga/accent-id-commonaccent_ecapa)
          - Accent classification trained on speakers of English as a second language [kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2](https://huggingface.co/kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2)
        """)
        with gr.Column():
            audio_input = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            output = gr.JSON(label="Results")
            api_key_input = gr.Textbox(label="API Key", type="password")
        audio_input.change(fn=transcribe_and_classify_speech, inputs=[audio_input, api_key_input], outputs=output)
        gr.Examples(examples=examples, inputs=[audio_input])
    return interface

# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")

demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Speech Recognition and Accent Classification",
)

# demo.launch()
# demo.launch(debug=True)
# demo.launch(strict_cors=False, share=True) # works with front end but insecure
demo.launch(share=True)
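
# Minimal client-side sketch (illustrative; assumes the gradio_client package and
# that this app is reachable at a public URL. The exact endpoint name and the
# handle_file helper depend on the Gradio version installed):
#
# from gradio_client import Client, handle_file
# client = Client("https://<your-space-url>")        # hypothetical URL
# result = client.predict(
#     handle_file("mexican.wav"),                    # one of the example files
#     API_KEY,                                       # same key the app checks
#     api_name="/transcribe_and_classify_speech",    # assumed endpoint name
# )
# print(result)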