import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import numpy as np
from pydub import AudioSegment
from config import SPEECH_MODEL, TTS_MODEL

# Initialize the Wav2Vec2 speech-recognition model once at import time
processor = Wav2Vec2Processor.from_pretrained(SPEECH_MODEL)
model = Wav2Vec2ForCTC.from_pretrained(SPEECH_MODEL)

def speech_to_text(audio_file):
    # Load the file and resample to the 16 kHz rate Wav2Vec2 expects
    audio_input, _ = librosa.load(audio_file, sr=16000)
    return transcribe_array(audio_input)

def transcribe_array(audio_input):
    # Tokenize the raw waveform and greedily decode the CTC logits;
    # this is inference only, so skip gradient tracking
    input_values = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription
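
# A minimal usage sketch (the filename "sample.wav" is illustrative, not
# part of the original code):
#   print(speech_to_text("sample.wav"))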

def text_to_speech(text):
    # torch.hub returns (model, example_text) for silero_tts, so unpack the
    # model; note this performs a (cached) hub lookup on every call
    tts_model, _ = torch.hub.load('snakers4/silero-models', 'silero_tts', model_name=TTS_MODEL)
    audio = tts_model.apply_tts(text=text, speaker='en_0', sample_rate=48000)
    # Convert the audio tensor to a numpy array
    audio_np = audio.numpy()
    # Scale the float waveform to the 16-bit PCM range
    audio_np = (audio_np * 32767).astype(np.int16)
    # Create an AudioSegment directly from the raw PCM bytes
    audio_segment = AudioSegment(
        audio_np.tobytes(),
        frame_rate=48000,
        sample_width=2,
        channels=1
    )
    return audio_segment
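
# Usage sketch: the returned AudioSegment can be written out with pydub's
# export(); the output name and format below are illustrative assumptions:
#   segment = text_to_speech("Hello there")
#   segment.export("reply.mp3", format="mp3")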

def process_audio_chunk(chunk):
    # chunk is assumed to be raw 16-bit mono PCM sampled at 16 kHz
    audio_np = np.frombuffer(chunk, dtype=np.int16)
    audio_float = audio_np.astype(np.float32) / 32768.0
    # Feed the in-memory waveform straight to the model; speech_to_text
    # would try (and fail) to librosa.load the array as if it were a path
    return transcribe_array(audio_float)
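
# Usage sketch: raw PCM chunks could come from pydub itself. This assumes a
# WAV file named "mic_capture.wav" (illustrative) downmixed to the 16 kHz
# mono, 16-bit layout that process_audio_chunk expects:
#   seg = AudioSegment.from_wav("mic_capture.wav").set_frame_rate(16000)
#   seg = seg.set_channels(1).set_sample_width(2)
#   for i in range(0, len(seg.raw_data), 32000):  # 32000 bytes = 1 s of audio
#       print(process_audio_chunk(seg.raw_data[i:i + 32000]))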