import streamlit as st
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Load the pretrained Wav2Vec2 processor and model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
# Define a function to transcribe audio
def transcribe(audio_file):
    # torchaudio returns a (channels, frames) tensor plus the file's sample rate
    audio, sample_rate = torchaudio.load(audio_file)
    # Collapse to mono and resample to the 16 kHz rate the model was trained on
    audio = audio.mean(dim=0)
    if sample_rate != 16000:
        audio = torchaudio.functional.resample(audio, sample_rate, 16000)
    input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription
# Set up the Streamlit app
st.title("Speech Recognition with Wav2Vec2")
audio_file = st.file_uploader("Upload an audio file", type=["mp3", "wav"])
if audio_file is not None:
    st.audio(audio_file)
    transcript = transcribe(audio_file)
    st.write("Transcription: ", transcript)