import torch
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
from pydub import AudioSegment
import streamlit as st
import tempfile
import librosa

# Define available models
available_models = ['Yehor/w2v-bert-2.0-uk']

st.title("Voice Recognition App")

# Model selection dropdown
model_name = st.selectbox("Choose a model", available_models)

# Config
DEVICE = 'cpu'          # set to 'cuda:0' to run on a GPU
SAMPLING_RATE = 16_000  # the model expects 16 kHz audio

# Load the CTC model and its processor (feature extractor + tokenizer)
asr_model = AutoModelForCTC.from_pretrained(model_name).to(DEVICE)
processor = Wav2Vec2BertProcessor.from_pretrained(model_name)

def map_to_pred(file_path, sampling_rate=SAMPLING_RATE, device=DEVICE):
    """Transcribe a single audio file with the CTC model."""
    # Load the audio and resample it to the rate the model expects
    audio, _ = librosa.load(file_path, sr=sampling_rate)

    # Extract input features for the model
    inputs = processor([audio], sampling_rate=sampling_rate).input_features
    features = torch.tensor(inputs).to(device)

    with torch.no_grad():
        logits = asr_model(features).logits

    # Greedy CTC decoding: pick the most likely token at each frame, then collapse
    predicted_ids = torch.argmax(logits, dim=-1)
    predictions = processor.batch_decode(predicted_ids)

    return predictions
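# A minimal usage example outside Streamlit (assumes a local 16 kHz WAV file
# named 'short_1_16k.wav' exists next to this script):
#   predictions = map_to_pred('short_1_16k.wav')
#   print(predictions[0])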

uploaded_file = st.file_uploader("Choose file", type=["wav", "mp3"])

if uploaded_file is not None:
    # Save the uploaded file to a temporary location
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_file.write(uploaded_file.getbuffer())
        temp_file_path = temp_file.name

    # Convert the audio to WAV so the ASR model can read it (handles mp3 uploads)
    audio = AudioSegment.from_file(temp_file_path)
    temp_wav_path = temp_file_path + ".wav"
    audio.export(temp_wav_path, format="wav")

    st.audio(uploaded_file, format="audio/wav")

    # Transcribe the converted file
    text = map_to_pred(temp_wav_path)

    # Display results
    st.write('Input audio:', uploaded_file.name)
    st.write('Transcription:', text[0])
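
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py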