File size: 2,716 Bytes
16e1a19
8a0fbba
 
09723b9
8a0fbba
 
 
 
 
1a76809
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a0fbba
 
 
09723b9
8a0fbba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16e1a19
8a0fbba
09723b9
8a0fbba
 
 
16e1a19
 
8a0fbba
16e1a19
1a76809
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import gradio as gr
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio

# Load the pre-trained Wav2Vec 2.0 model and processor from Hugging Face.
# NOTE(review): from_pretrained downloads weights on first run (network +
# disk cache); this checkpoint expects 16 kHz mono audio — confirm inputs.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Function to convert speech to text
def speech_to_text(audio_file):
    """Transcribe an audio file to text with Wav2Vec 2.0 (greedy CTC).

    Parameters
    ----------
    audio_file : str
        Path to the uploaded audio file (as supplied by ``gr.Audio``).

    Returns
    -------
    str
        The decoded transcription.
    """
    # Load the waveform; torchaudio returns (channels, time) and the rate.
    waveform, sample_rate = torchaudio.load(audio_file)

    # Downmix multi-channel recordings to mono — the model takes one channel.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # wav2vec2-large-960h was trained on 16 kHz audio; resample when the
    # file uses any other rate, otherwise transcriptions are garbage.
    target_rate = 16000
    if sample_rate != target_rate:
        waveform = torchaudio.functional.resample(
            waveform, sample_rate, target_rate
        )

    # The processor expects a 1-D waveform plus the sampling rate.
    input_values = processor(
        waveform.squeeze(0), sampling_rate=target_rate, return_tensors="pt"
    ).input_values

    # Greedy CTC decoding: most likely token per frame, no language model.
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # Collapse repeats / blanks and map ids back to characters.
    return processor.decode(predicted_ids[0])

# Set up the Gradio interface.
# NOTE: the original file contained a duplicate paste of the whole script
# spliced into the middle of the title string, which made it a SyntaxError;
# this is the single intended definition.
iface = gr.Interface(
    fn=speech_to_text,  # Function executed on each submission
    inputs=gr.Audio(type="filepath"),  # Upload widget; passes a file path
    outputs=gr.Textbox(),  # Transcription shown as plain text
    title="Speech-to-Text Analyzer for Lecture Notes",
    description="Upload an audio file (e.g., lecture recording) to get the transcription of the speech.",
)

# Launch the interface
iface.launch()