Spaces:
Running
Running
File size: 1,178 Bytes
2013214 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 |
import torch
from transformers import AutoProcessor, AutoModelForCTC
import librosa
# The processor and the CTC model are both loaded from the same checkpoint,
# once, when this module is imported; the functions below share them.
_CHECKPOINT = "nguyenvulebinh/wav2vec2-base-vietnamese-250h"
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForCTC.from_pretrained(_CHECKPOINT)
def transcribe(audio_data, sampling_rate):
    """Transcribe one waveform with the module-level processor and CTC model.

    Args:
        audio_data: Array-like waveform exposing ``.shape`` and ``.mean``
            (for example a NumPy array). Input with more than one
            dimension is averaged over axis 1 before being processed.
        sampling_rate: Sample rate of ``audio_data`` in Hz; must be 16000.

    Returns:
        The decoded text for the first (and only) item of the batch.

    Raises:
        ValueError: If ``sampling_rate`` is not 16000, or if ``audio_data``
            has a zero-length dimension (no samples to transcribe).
    """
    # Validate first so bad input is rejected before any array work is done.
    if sampling_rate != 16000:
        raise ValueError("Sampling rate phải là 16 kHz")
    # Without this guard an empty waveform would be handed to the processor
    # and model; reject it here with the same exception type as the rate check.
    if 0 in audio_data.shape:
        raise ValueError("Audio data không được rỗng")

    if len(audio_data.shape) > 1:
        # NOTE(review): axis=1 presumes a (samples, channels) layout —
        # confirm against the callers that pass multi-channel audio.
        audio_data = audio_data.mean(axis=1)

    inputs = processor(
        audio_data,
        sampling_rate=sampling_rate,
        return_tensors="pt",
        padding="longest",
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values=inputs.input_values).logits

    # Pick the highest-scoring token id at each step, then decode to text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return transcription[0]
def transcribe_file(file_path):
    """Load an audio file through librosa and return its transcription.

    Args:
        file_path: Path of the audio file, passed straight to
            ``librosa.load``.

    Returns:
        Whatever ``transcribe`` returns for the loaded waveform.
    """
    # Request 16000 Hz from librosa, the only rate transcribe() accepts.
    target_rate = 16000
    loaded = librosa.load(file_path, sr=target_rate)
    waveform, rate = loaded
    return transcribe(waveform, rate)
# if __name__ == "__main__":
#     file_path = "vn.wav"
#     result = transcribe_file(file_path)
#     print(result)