"""Gradio app that transcribes an uploaded audio file with Wav2Vec 2.0."""

import gradio as gr
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Hugging Face checkpoint used for both the processor and the CTC model.
MODEL_NAME = "facebook/wav2vec2-large-960h"

# Load the pre-trained Wav2Vec 2.0 processor and model once at start-up so
# every request reuses them.
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)


def speech_to_text(audio_file):
    """Transcribe the speech in an audio file.

    Args:
        audio_file: Path to the audio file, as supplied by
            ``gr.Audio(type="filepath")``. May be ``None`` or empty when the
            user submits without providing audio.

    Returns:
        The decoded transcription, or an empty string when there is no audio
        to transcribe.
    """
    # Nothing was uploaded/recorded: return an empty transcription instead of
    # letting torchaudio fail on a missing path.
    if not audio_file:
        return ""

    # waveform has shape (channels, frames); sample_rate is the file's rate.
    waveform, sample_rate = torchaudio.load(audio_file)
    if waveform.numel() == 0:
        return ""

    # The processor is given a single 1-D signal, so mix multi-channel
    # recordings down to mono first.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample to the rate the feature extractor is configured for; feeding
    # audio at a different rate yields meaningless transcriptions.
    target_rate = processor.feature_extractor.sampling_rate
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, target_rate
        )

    # Normalise the 1-D signal and wrap it into a (1, frames) batch tensor.
    input_values = processor(
        waveform.squeeze(0).numpy(),
        sampling_rate=target_rate,
        return_tensors="pt",
    ).input_values

    # Greedy CTC decoding: pick the most likely token at every time step.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Collapse the token ids of the single batch item into text.
    return processor.decode(predicted_ids[0])


# Set up the Gradio interface.
iface = gr.Interface(
    fn=speech_to_text,  # Called with the uploaded file's path
    inputs=gr.Audio(type="filepath"),  # Pass the audio to fn as a file path
    outputs=gr.Textbox(),  # Show the transcription in a text box
    title="Speech-to-Text Analyzer for Lecture Notes",
    description=(
        "Upload an audio file (e.g., lecture recording) to get the "
        "transcription of the speech."
    ),
)

# Launch the interface only when run as a script, so importing this module
# does not start a web server.
if __name__ == "__main__":
    iface.launch()