import gradio as gr
import numpy as np
from difflib import Differ
import librosa
# import spaces #[uncomment to use ZeroGPU]
import torch

# ################ CHANGE THIS TO CHANGE THE LANGUAGE ###################### #
from TaiwaneseHokkien import TaiwaneseHokkien

device = "cuda" if torch.cuda.is_available() else "cpu"
model_repo_id = "emlinking/wav2vec2-large-xls-r-300m-tsm-asr-v6"

# Use half precision only when CUDA is available; fall back to float32 on CPU.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

language = TaiwaneseHokkien(device=device, torch_dtype=torch_dtype)
# ########################################################################## #

# Sample rate (Hz) that recordings are resampled to before being passed to
# ``language.asr``.
TARGET_SAMPLE_RATE = 16_000


def _prepare_waveform(sampling_rate, wav):
    """Convert a raw recording into a mono, peak-normalised float32 signal.

    Args:
        sampling_rate: Sample rate of ``wav`` in Hz.
        wav: Array of samples. Arrays with more than one dimension are
            averaged over axis 1 to obtain a single channel.

    Returns:
        The waveform resampled to ``TARGET_SAMPLE_RATE``, or ``None`` when the
        recording contains no samples.
    """
    wav = np.asarray(wav)
    if wav.ndim > 1:
        wav = wav.mean(axis=1)
    if wav.size == 0:
        # np.max() raises ValueError on an empty array; report "no audio".
        return None

    # astype() returns a copy, so the in-place division below never mutates
    # the array owned by the caller.
    wav = wav.astype(np.float32)

    # Peak-normalise. Skip the division for an all-zero (silent) recording:
    # dividing by a zero peak would fill the signal with NaNs.
    peak = np.max(np.abs(wav))
    if peak > 0:
        wav /= peak

    return librosa.resample(
        y=wav, orig_sr=sampling_rate, target_sr=TARGET_SAMPLE_RATE
    )


# @spaces.GPU #[uncomment to use ZeroGPU]
def infer(audio, target):
    """Transcribe a recording and compare it against the practice sentence.

    Args:
        audio: ``(sampling_rate, samples)`` tuple as delivered by the
            ``gr.Audio`` component; any other value (e.g. ``None`` when the
            input is cleared) is treated as "no input".
        target: The practice sentence typed by the user.

    Returns:
        A pair ``(user_pron, d_toks)``: the result of ``language.asr`` on the
        preprocessed audio, and the result of
        ``language.compare(target, user_pron)``. Both entries are ``None``
        when the inputs are missing, of the wrong type, or the recording is
        empty.
    """
    if not isinstance(audio, tuple) or not isinstance(target, str):
        return (None, None)

    sampling_rate, raw_wav = audio
    wav = _prepare_waveform(sampling_rate, raw_wav)
    if wav is None:
        return (None, None)

    user_pron = language.asr(wav)

    # Compare the target sentence with what was recognised; the result feeds
    # the HighlightedText component below.
    d_toks = language.compare(target, user_pron)

    return (user_pron, d_toks)


# NOTE(review): no component visible in this file sets
# elem_id="col-container", so this rule may currently have no effect - confirm.
css = """ #col-container { margin: 0 auto; max-width: 640px; } """

with gr.Blocks(css=css) as demo:
    gr.Markdown(" # PhonoLearn")

    target = gr.Textbox(label='Practice Sentence (Tâi-lô)')
    input_audio = gr.Audio(
        sources=["microphone", "upload"]
    )
    output = gr.Textbox(label='Your Pronunciation')
    diff = gr.HighlightedText(
        label='Comparison',
        combine_adjacent=True,
        show_legend=True,
        color_map=language.compare_colors
    )

    # Re-run inference whenever the user records or uploads new audio.
    input_audio.input(fn=infer, inputs=[input_audio, target], outputs=[output, diff])

if __name__ == "__main__":
    demo.launch()