import os
import gradio as gr
import torch
import numpy
import librosa
import languages_dic  # local module (not shown); expected to provide a LANGUAGES dict mapping Whisper language codes to names
from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline

title = "Multilanguage Transcription and Translation"

availableLang = "Afrikaans, Arabic, Armenian, Azerbaijani, Belarusian, Bosnian, Bulgarian, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, Galician, German, Greek, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Italian, Japanese, Kannada, Kazakh, Korean, Latvian, Lithuanian, Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh."

description1 = """

Transcribe an audio file containing speech in any of the languages listed below and translate it to English.

This demo uses the ASR system Whisper and runs on CPU, so responses might be slow.

\n """ + availableLang description2 ="""

Transcribe speech recorded with your microphone in any of the languages listed below and translate it to English.

This demo uses the ASR system Whisper and runs on CPU, so responses might be slow.

\n """ + availableLang device = "cuda:0" if torch.cuda.is_available() else "cpu" #modelType = "openai/whisper-small" class LM: model={} processor={} pipe={} #LMsizes = ["tiny", "base", "small", "medium", "large"] LMsizes = ["base", "small", "medium"] myLM = LM() for LMsize in myLM.LMsizes: modelType = "openai/whisper-"+LMsize myLM.model[LMsize] = WhisperForConditionalGeneration.from_pretrained(modelType).to(device) myLM.processor[LMsize] = WhisperProcessor.from_pretrained(modelType) myLM.pipe[LMsize] = pipeline(task="automatic-speech-recognition", model=modelType, device=device, chunk_length_s=29, stride_length_s=[5,0]) def detect_language(audio_path, model, processor, asr_pipe_whisper): #Is not possible to retrieve the predicted language directly or using a pipeline. Instead: # Loads and resample the audio file to 16kHz, convert to mono and control the duration of the audio input to 20sec speech_data, sampling_rate = librosa.load(audio_path, sr=16000, mono=True, duration=20) #get the input features using the feature extractor on the raw speech data input_features = processor.feature_extractor(speech_data, return_tensors="pt", sampling_rate=sampling_rate).input_features.to(device) #transcribe the input tensor of features obtained from function preAudioPath predicted_ids = model.generate(input_features, task="transcribe") #decode the second entry from the output array which conatins the detected language detected_lang = asr_pipe_whisper.tokenizer.decode(predicted_ids[0,1]) #looks up in the dictionary to retrieve the expanded language name. E.g. detected_lang = "<|ge|>" returns detected_lang = "german" detected_lang = languages_dic.LANGUAGES.get(detected_lang.strip("<|>")) return detected_lang # def transcribe(inputs): # # predicted_ids = model.generate(inputs, language="<|es|>", task="transcribe") # predicted_ids = model.generate(inputs, task="transcribe") # transcription = processor.tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0] # return transcription def getLM(modelsize): modelsize = modelsize.split(" ") if len(modelsize) > 0: modelsize = modelsize[0] return (myLM.model[modelsize], myLM.processor[modelsize], myLM.pipe[modelsize]) def processAudio(audio_path, modelsize): model, processor, asr_pipe_whisper = getLM(modelsize) translation = asr_pipe_whisper(audio_path, max_new_tokens=256, generate_kwargs={"task":"translate"}) transcription = asr_pipe_whisper(audio_path, generate_kwargs={"task":"transcribe"}) #transcription = transcribe(preprocessAudioPath(audio_path)) inputLang = detect_language(audio_path, model, processor, asr_pipe_whisper) return (inputLang, transcription["text"], translation["text"]) modelsizeInfo = "Try out the performance for different model sizes. Larger models are more robust and deliver better results but are also slower." 
app1 = gr.Interface(
    fn=processAudio,
    # inputs=[gr.Audio(source="upload", type="filepath", label="Audio Input"),
    #         gr.Radio(["tiny - 39M", "base - 74M", "small - 244M", "medium - 769M", "large - 1550M"],
    #                  label="Select the model size", info=modelsizeInfo, value="small - 244M")],
    inputs=[gr.Audio(source="upload", type="filepath", label="Audio Input"),
            gr.Radio(["base - 74M", "small - 244M", "medium - 769M"],
                     label="Select the model size", info=modelsizeInfo, value="small - 244M")],
    outputs=[gr.Textbox(label="Detected input language"),
             gr.Textbox(label="Transcription"),
             gr.Textbox(label="Translation to English")],
    title=title,
    description=description1,
)

app2 = gr.Interface(
    fn=processAudio,
    inputs=[gr.Audio(source="microphone", type="filepath", label="Audio Input"),
            gr.Radio(["base - 74M", "small - 244M", "medium - 769M"],
                     label="Select the model size", info=modelsizeInfo, value="small - 244M")],
    outputs=[gr.Textbox(label="Detected input language"),
             gr.Textbox(label="Transcription"),
             gr.Textbox(label="Translation to English")],
    title=title,
    description=description2,
)

demo = gr.TabbedInterface([app1, app2], ["Audio File", "Microphone"])

if __name__ == "__main__":
    demo.launch()
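
# Usage sketch (illustrative, not part of the original app): the processing function can
# also be exercised without the Gradio UI, e.g.
#
#   lang, transcription, translation = processAudio("sample.wav", "base - 74M")
#   print(lang, transcription, translation)
#
# where "sample.wav" is a hypothetical local audio file that librosa/ffmpeg can read.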