File size: 2,716 Bytes
16e1a19
8a0fbba
 
09723b9
8a0fbba
 
 
 
 
1a76809
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a0fbba
 
 
09723b9
8a0fbba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16e1a19
8a0fbba
09723b9
8a0fbba
 
 
16e1a19
 
8a0fbba
16e1a19
1a76809
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio

# Load the pre-trained Wav2Vec 2.0 model and processor from Hugging Face.
# NOTE(review): from_pretrained downloads weights on first run (network +
# disk cache); this checkpoint expects 16 kHz mono audio — confirm inputs.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Function to convert speech to text
def speech_to_text(audio_file):
    """Transcribe an audio file to text with Wav2Vec 2.0 (greedy CTC).

    Parameters
    ----------
    audio_file : str
        Path to the uploaded audio file (as supplied by ``gr.Audio``).

    Returns
    -------
    str
        The decoded transcription.
    """
    # Load the waveform; torchaudio returns (channels, time) and the rate.
    waveform, sample_rate = torchaudio.load(audio_file)

    # Downmix multi-channel recordings to mono — the model takes one channel.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # wav2vec2-large-960h was trained on 16 kHz audio; resample when the
    # file uses any other rate, otherwise transcriptions are garbage.
    target_rate = 16000
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, target_rate
        )

    # The processor expects a 1-D waveform plus the sampling rate.
    input_values = processor(
        waveform.squeeze(0), sampling_rate=target_rate, return_tensors="pt"
    ).input_values

    # Greedy CTC decoding: most likely token per frame, no language model.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Collapse repeats / blanks and map ids back to characters.
    return processor.decode(predicted_ids[0])

# Set up the Gradio interface.
# NOTE: the original file contained a duplicate paste of the whole script
# spliced into the middle of the title string, which made it a SyntaxError;
# this is the single intended definition.
iface = gr.Interface(
    fn=speech_to_text,  # Function executed on each submission
    inputs=gr.Audio(type="filepath"),  # Upload widget; passes a file path
    outputs=gr.Textbox(),  # Transcription shown as plain text
    title="Speech-to-Text Analyzer for Lecture Notes",
    description="Upload an audio file (e.g., lecture recording) to get the transcription of the speech.",
)

# Launch the interface
iface.launch()