import os
import shutil

# Remove Hugging Face cache at startup
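# (presumably to free disk space on the hosting environment; the pipeline
# below re-downloads the model weights on each fresh start)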
hf_cache = os.path.expanduser("~/.cache/huggingface")
if os.path.exists(hf_cache):
    shutil.rmtree(hf_cache)


import gradio as gr
import numpy as np
import torchaudio
from transformers import pipeline

# Load pidgin model via pipeline
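# Note: Wav2Vec2-XLSR-53 checkpoints expect 16 kHz mono audio; when a raw array
# is passed with a different "sampling_rate", the pipeline resamples it
# internally (using torchaudio).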
transcriber = pipeline("automatic-speech-recognition", model="asr-nigerian-pidgin/pidgin-wav2vec2-xlsr53")

# Transcription function: gr.Audio(type="filepath") passes the path of the
# recorded or uploaded file.
def transcribe(audio_filepath):
    if audio_filepath is None:
        return ""
    # Load the file: torchaudio returns a (channels, samples) float tensor and the sampling rate
    waveform, sr = torchaudio.load(audio_filepath)
    # Downmix multi-channel audio to mono
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    audio = waveform.squeeze().numpy().astype(np.float32)
    # Peak-normalize; the small epsilon avoids division by zero on silent input
    audio /= np.max(np.abs(audio)) + 1e-9

    return transcriber({"sampling_rate": sr, "raw": audio})["text"]

# Define the Gradio UI components
with gr.Blocks() as demo:
    gr.Markdown("# πŸ—£οΈ Nigerian Pidgin ASR Demo")
    gr.Markdown("""Upload or record audio in Nigerian Pidgin to get transcription. This Demo uses the 
    Nigerian pidgin ASR checkpoint -[Pidgin-Wav2Vec2-XLSR53](https://huggingface.co/asr-nigerian-pidgin/pidgin-wav2vec2-xlsr53)
     and πŸ€— Transformers to transcribe audio files of max 30s length.
    """)

    with gr.Column():
        audio_in = gr.Audio(
            label="🎤 Record or upload your audio",
            type="filepath",
            sources=["upload", "microphone"],
            interactive=True,
            min_length=1,   # minimum clip length, in seconds
            max_length=31,  # cap clips at roughly the 30 s mentioned above
        )
        with gr.Row():
            submit_btn = gr.Button("Submit")
            #clear_btn  = gr.Button("Clear")

  
    with gr.Column():
        transcription_txt = gr.Textbox(
            label="📝 Transcription",
            interactive=False,
            show_label=True,
            show_copy_button=True,
        )

        with gr.Row():
            flag_btn = gr.Button("🚩 Flag this output as incorrect", size="sm")
            share_btn = gr.Button("🔗 Share", size="sm")

    # Button wiring:
    submit_btn.click(fn=transcribe, inputs=audio_in, outputs=transcription_txt)
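    # The flag button is a stub: it only replaces the textbox contents with an
    # acknowledgement and does not persist any feedback; share_btn has no handler here.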
    flag_btn.click(fn=lambda: "Thank you for your feedback.", inputs=None, outputs=transcription_txt)
    #clear_btn.click(fn=lambda: (None, ""), inputs=None, outputs=[audio_in, transcription_txt])

demo.launch(share=True)  # share=True also requests a temporary public gradio.live link when run locally