from .base import MetaItem, BasePipe
from ..helpers.vadprocessor import FixedVADIterator
import numpy as np
import logging
# import noisereduce as nr


class VadPipe(BasePipe):
    vac = None
    sample_rate = 16000

    def __init__(self, in_queue=None, out_queue=None) -> None:
        super().__init__(in_queue, out_queue)
        self._offset = 0  # frame offset of the audio processed so far
        self._status = 'END'

    def reset(self):
        self._offset = 0
        self._status = 'END'
        self.vac.reset_states()

    @classmethod
    def init(cls):
        if cls.vac is None:
            cls.vac = FixedVADIterator(
                threshold=0.6,
                sampling_rate=cls.sample_rate,
                # speech_pad_ms=10
                min_silence_duration_ms=100,
                # speech_pad_ms=30,
            )
            cls.vac.reset_states()

    # def reduce_noise(self, data):
    #     return nr.reduce_noise(y=data, sr=self.sample_rate)

    def _process_speech_chunk(self, source_audio: np.ndarray):
        """Run VAD on one chunk and convert absolute frame indices into
        chunk-relative ones. Returns (start, end) where either may be None,
        or None when no speech boundary was detected in this chunk."""
        speech_dict = self.vac(source_audio, return_seconds=False)
        if speech_dict:
            relative_start_frame = None
            relative_end_frame = None
            start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
            # Compare against None explicitly: frame 0 is a valid boundary
            # but is falsy, so a bare truthiness check would drop it.
            if start_frame is not None:
                relative_start_frame = start_frame - self._offset
            if end_frame is not None:
                relative_end_frame = end_frame - self._offset
            return relative_start_frame, relative_end_frame

    def process(self, in_data: MetaItem) -> MetaItem:
        if self._offset == 0:
            self.vac.reset_states()
        # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
        speech_data = self._process_speech_chunk(source_audio)

        if speech_data:  # a speech boundary appeared in this chunk
            rel_start_frame, rel_end_frame = speech_data
            if rel_start_frame is not None and rel_end_frame is None:
                self._status = "START"  # speech started
                # keep up to 100 frames (~6 ms at 16 kHz) of pre-roll
                target_audio = source_audio[max(rel_start_frame - 100, 0):]
                logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
            elif rel_start_frame is None and rel_end_frame is not None:
                self._status = "END"  # speech ended
                target_audio = source_audio[:rel_end_frame]
                logging.debug("🫷 Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
            else:
                # both boundaries fall inside this chunk: a complete segment
                self._status = 'END'
                target_audio = source_audio[max(rel_start_frame - 100, 0):rel_end_frame]
                logging.debug("🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
            # logging.debug("❌ No valid speech segment detected, setting status to END")
        else:
            if self._status == 'START':
                # no boundary this chunk while speech is ongoing: pass it through
                target_audio = source_audio
                # logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
            else:
                # still silent: emit an empty chunk
                target_audio = np.array([], dtype=np.float32)
                # self._status = 'END'
                # logging.debug("❌ No speech detected, setting status to END")

        self._offset += len(source_audio)
        in_data.audio = target_audio.tobytes()
        in_data.source_audio = b''
        in_data.speech_status = self._status
        return in_data
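

# --- Usage sketch (illustrative only) ---
# A minimal sketch of driving VadPipe by hand, assuming MetaItem accepts a
# `source_audio` keyword holding float32 PCM bytes; only the attributes that
# `process` reads and writes above are relied on, and the queue wiring from
# __init__ is bypassed. Left commented out because the MetaItem constructor
# signature is not shown in this file.
#
# if __name__ == "__main__":
#     VadPipe.init()                            # create the shared FixedVADIterator
#     pipe = VadPipe()
#     chunk = np.zeros(512, dtype=np.float32)   # one 32 ms chunk at 16 kHz
#     item = MetaItem(source_audio=chunk.tobytes())
#     out = pipe.process(item)
#     print(out.speech_status, len(out.audio))  # expect 'END' and 0 for silence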