import streamlit as st
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import librosa  # loads and resamples the uploaded audio
import tempfile
import os
# Load the Wav2Vec2 model and processor once; st.cache_resource keeps them in
# memory across Streamlit reruns instead of reloading on every interaction
@st.cache_resource
def load_model():
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    return processor, model

processor, model = load_model()
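# Note: wav2vec2-base-960h was trained on 16 kHz speech, so any input audio
# must be resampled to 16 kHz mono before it reaches the processor (done below)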
# Streamlit App
st.title("Phonics/Personalized Reading App")
st.write("Record your audio and we will transcribe it.")
# Audio recording using HTML5 (the recorder markup for st.markdown is omitted
# here; the empty block below is a placeholder for it)
record_button = st.button("Record Audio")
if record_button:
    st.markdown("""
    """, unsafe_allow_html=True)
# Display the transcription
if 'transcription' in st.session_state:
    st.write("Transcription:")
    st.write(st.session_state.transcription)
# Handle audio file upload (skip files already transcribed so the rerun at the
# end of this block does not trigger an endless transcribe/rerun loop)
uploaded_file = st.file_uploader("Or upload your audio file", type=["wav", "mp3"])
if uploaded_file is not None and st.session_state.get("last_file") != uploaded_file.name:
    # Persist the upload to a temporary file so librosa can read it from disk
    suffix = os.path.splitext(uploaded_file.name)[1]
    with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as temp_file:
        temp_file.write(uploaded_file.read())
        temp_path = temp_file.name
    try:
        # The processor expects a raw waveform, not a file path: load the audio
        # and resample it to the 16 kHz mono input the model was trained on
        speech, _ = librosa.load(temp_path, sr=16000, mono=True)
    finally:
        os.unlink(temp_path)
    audio_input = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(audio_input.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    st.session_state.transcription = transcription[0]  # Store transcription
    st.session_state.last_file = uploaded_file.name  # Mark this file as done
    st.rerun()  # st.experimental_rerun is deprecated; refresh to show the result
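# Usage (assuming this file is saved as app.py):
#   streamlit run app.py
# The base-960h checkpoint decodes to uppercase letters without punctuation,
# e.g. "THE CAT SAT ON THE MAT".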