import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import numpy as np
from pydub import AudioSegment
from config import SPEECH_MODEL, TTS_MODEL
# Initialize the Wav2Vec2 speech-to-text processor and model once at import time
processor = Wav2Vec2Processor.from_pretrained(SPEECH_MODEL)
model = Wav2Vec2ForCTC.from_pretrained(SPEECH_MODEL)
def speech_to_text(audio):
    # Accept either a path to an audio file or a pre-loaded float32 waveform at 16 kHz,
    # so this can also be called directly from process_audio_chunk below
    if isinstance(audio, np.ndarray):
        audio_input = audio
    else:
        audio_input, _ = librosa.load(audio, sr=16000)
    input_values = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
def text_to_speech(text):
    # The silero_tts hub entrypoint returns a (model, example_text) tuple, so unpack it.
    # Note: loading the model on every call is slow; consider caching it at module level.
    tts_model, _ = torch.hub.load('snakers4/silero-models', 'silero_tts',
                                  model_name=TTS_MODEL)
    audio = tts_model.apply_tts(text=text, speaker='en_0', sample_rate=48000)
    # Convert the audio tensor to a numpy array
    audio_np = audio.numpy()
    # Scale the float waveform to the 16-bit PCM range
    audio_np = (audio_np * 32767).astype(np.int16)
    # Build an AudioSegment directly from the raw PCM bytes
    audio_segment = AudioSegment(
        audio_np.tobytes(),
        frame_rate=48000,
        sample_width=2,
        channels=1,
    )
    return audio_segment
def process_audio_chunk(chunk):
    # chunk is assumed to be raw 16-bit mono PCM audio sampled at 16 kHz
    audio_np = np.frombuffer(chunk, dtype=np.int16)
    # Convert to float32 in [-1.0, 1.0], the range expected by the Wav2Vec2 processor
    audio_float = audio_np.astype(np.float32) / 32768.0
    return speech_to_text(audio_float)
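

# Example usage: a minimal sketch of the round trip (speech -> text -> speech).
# "input.wav" and "reply.wav" are placeholder file names, not files shipped with
# this Space.
if __name__ == "__main__":
    transcription = speech_to_text("input.wav")
    print("Transcribed:", transcription)
    reply_audio = text_to_speech(transcription)
    reply_audio.export("reply.wav", format="wav")  # pydub writes the PCM data to a WAV file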