import streamlit as st
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import numpy as np
import tempfile
import wave


# Load the Wav2Vec2 model and processor once; st.cache_resource keeps them
# in memory across Streamlit reruns instead of reloading on every interaction.
@st.cache_resource
def load_model():
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
    return processor, model


processor, model = load_model()

# Streamlit app
st.title("Phonics/Personalized Reading App")
st.write("Record your audio and we will transcribe it.")

# A bare st.button cannot capture microphone audio, so the "record" path is a
# placeholder; live recording would need a component such as streamlit-webrtc.
if st.button("Record Audio"):
    st.info("In-browser recording is not implemented yet; please upload a WAV file below.")

# Handle audio file upload. The standard-library `wave` module used below can
# only decode WAV files, so MP3 is not offered as an upload type.
uploaded_file = st.file_uploader("Or upload your audio file", type=["wav"])
if uploaded_file is not None:
    # Save the uploaded bytes to a temporary file so `wave` can open it by name
    with tempfile.NamedTemporaryFile(suffix=".wav") as temp_file:
        temp_file.write(uploaded_file.read())
        temp_file.flush()

        # Decode the WAV file into a float32 waveform in [-1, 1]. The processor
        # expects a raw sample array at 16 kHz, not a file path.
        with wave.open(temp_file.name, "rb") as wav:
            if wav.getsampwidth() != 2 or wav.getframerate() != 16000:
                st.error("Please upload 16-bit PCM WAV audio sampled at 16 kHz.")
                st.stop()
            frames = wav.readframes(wav.getnframes())
            audio = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
            if wav.getnchannels() > 1:
                audio = audio.reshape(-1, wav.getnchannels()).mean(axis=1)  # downmix to mono

    # Run the model and greedily decode the CTC output into text
    audio_input = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(audio_input.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    st.session_state.transcription = transcription[0]  # Store transcription

# Display the most recent transcription
if "transcription" in st.session_state:
    st.write("Transcription:")
    st.write(st.session_state.transcription)
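
# A minimal sketch of how to run this app locally, assuming the script is
# saved as app.py (the filename is an assumption, not part of the original):
#
#   pip install streamlit transformers torch numpy
#   streamlit run app.py
#
# Streamlit serves the script at http://localhost:8501 by default. The first
# run downloads the facebook/wav2vec2-base-960h weights from the Hugging Face
# Hub, so it needs network access and may take a while.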