import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import numpy as np
from pydub import AudioSegment

from config import SPEECH_MODEL, TTS_MODEL

# Initialize the speech-to-text models once at import time
processor = Wav2Vec2Processor.from_pretrained(SPEECH_MODEL)
model = Wav2Vec2ForCTC.from_pretrained(SPEECH_MODEL)


def speech_to_text(audio):
    """Transcribe audio with Wav2Vec2.

    Accepts either a path to an audio file or a 1-D float32 waveform
    already sampled at 16 kHz (as produced by process_audio_chunk).
    """
    if isinstance(audio, np.ndarray):
        audio_input = audio
    else:
        audio_input, _ = librosa.load(audio, sr=16000)

    input_values = processor(
        audio_input, return_tensors="pt", sampling_rate=16000
    ).input_values

    # Run inference without tracking gradients
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    return transcription


def text_to_speech(text):
    """Synthesize speech with Silero TTS and return a pydub AudioSegment."""
    # The silero_tts hub entry point returns (model, example_text);
    # TTS_MODEL is expected to be a Silero voice model id (e.g. 'v3_en')
    tts_model, _ = torch.hub.load(
        'snakers4/silero-models', 'silero_tts', language='en', speaker=TTS_MODEL
    )
    audio = tts_model.apply_tts(text=text, speaker='en_0', sample_rate=48000)

    # Convert the audio tensor to a numpy array
    audio_np = audio.numpy()

    # Scale the [-1, 1] float audio to the 16-bit PCM range
    audio_np = (audio_np * 32767).astype(np.int16)

    # Create an AudioSegment directly from the raw PCM bytes
    audio_segment = AudioSegment(
        audio_np.tobytes(),
        frame_rate=48000,
        sample_width=2,
        channels=1,
    )
    return audio_segment


def process_audio_chunk(chunk):
    # chunk is a byte string of raw 16-bit PCM audio sampled at 16 kHz
    audio_np = np.frombuffer(chunk, dtype=np.int16)
    # Convert to float32 in [-1, 1] before handing it to the Wav2Vec2 pipeline
    audio_float = audio_np.astype(np.float32) / 32768.0
    return speech_to_text(audio_float)
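

# Usage sketch. Assumptions not defined elsewhere in this module: a local
# 'sample.wav' recording exists, and ffmpeg is installed so pydub can export mp3.
if __name__ == "__main__":
    # Transcribe a recording from disk
    print(speech_to_text("sample.wav"))

    # Synthesize a reply and write it to an mp3 file
    reply = text_to_speech("Hello, how can I help you today?")
    reply.export("reply.mp3", format="mp3")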