# NOTE(review): the original file began with Hugging Face Spaces page residue
# (runtime-error banner, file size, commit hashes, gutter line numbers) that
# was captured along with the source and made the file unparseable. It carried
# no program content and has been reduced to this comment.
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio
# Load the pre-trained Wav2Vec 2.0 model and processor from Hugging Face.
# The processor handles feature extraction (raw waveform -> input_values) and
# CTC token decoding; the model maps input_values to per-frame logits.
# NOTE: from_pretrained downloads the checkpoint on first run (then uses the
# local cache), so module import can take a while and requires network access.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
# Function to convert speech to text
def speech_to_text(audio_file):
    """Transcribe an audio file to text with the Wav2Vec 2.0 CTC model.

    Args:
        audio_file: Path to an audio file, as supplied by
            ``gr.Audio(type="filepath")``.

    Returns:
        str: The greedy CTC transcription of the audio.
    """
    # torchaudio.load returns a (channels, samples) tensor plus the file's
    # native sample rate.
    waveform, sample_rate = torchaudio.load(audio_file)

    # Downmix multi-channel audio to mono — the model expects one channel.
    # (The original code fed the raw 2-D tensor to the processor, which treats
    # each channel as a separate batch item and only channel 0 was decoded.)
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # This checkpoint was trained on 16 kHz audio; resample anything else.
    # (The original code skipped this, so non-16 kHz input decoded to garbage.)
    target_rate = 16000
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(waveform, sample_rate, target_rate)

    # Preprocess: the processor expects a 1-D waveform and its sampling rate.
    input_values = processor(
        waveform.squeeze(0), sampling_rate=target_rate, return_tensors="pt"
    ).input_values

    # Perform speech-to-text (greedy CTC decoding); no gradients needed.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Decode the predicted ids to text.
    transcription = processor.decode(predicted_ids[0])
    return transcription
# Set up the Gradio interface.
# NOTE(review): the source here contained a second, complete copy of the whole
# script pasted into the middle of the ``title`` string literal — a syntax
# error. This block is the deduplicated, single interface definition.
iface = gr.Interface(
    fn=speech_to_text,  # Function to be executed on each upload
    inputs=gr.Audio(type="filepath"),  # Upload widget; passes a file path
    outputs=gr.Textbox(),  # Display transcription in a text box
    title="Speech-to-Text Analyzer for Lecture Notes",
    description="Upload an audio file (e.g., lecture recording) to get the transcription of the speech.",
)

# Launch the interface
iface.launch()