daihui.zhang
committed
Commit · aca5e0b · 1 Parent(s): 2d75b7c

add vad update_silence_ms adapter
transcribe/helpers/vadprocessor.py
CHANGED

@@ -9,6 +9,38 @@ import logging
 from datetime import timedelta
 import gc
 from pydub import AudioSegment
+from collections import deque
+
+class AdaptiveSilenceController:
+    def __init__(self, base_silence_ms=120, min_ms=50, max_ms=600):
+        self.base = base_silence_ms
+        self.min = min_ms
+        self.max = max_ms
+        self.recent_silences = deque(maxlen=20)
+        self.recent_speeches = deque(maxlen=20)
+
+    def update_silence(self, duration_ms):
+        self.recent_silences.append(duration_ms)
+
+    def update_speech(self, duration_ms):
+        self.recent_speeches.append(duration_ms)
+
+    def get_adaptive_silence_ms(self):
+        # 1. Fast-speech cue: average speech segments are short (e.g. < 250 ms)
+        avg_speech = np.mean(self.recent_speeches) if self.recent_speeches else self.base
+        avg_silence = np.mean(self.recent_silences) if self.recent_silences else self.base
+
+        # 2. For fast speech, shorten the silence threshold
+        speed_factor = 1.0
+        if avg_speech < 300:
+            speed_factor = 0.5
+        elif avg_speech < 600:
+            speed_factor = 0.8
+
+        # 3. Also factor in the recent trend of silence lengths
+        adaptive = self.base * speed_factor + 0.3 * avg_silence
+
+        return int(max(self.min, min(self.max, adaptive)))
 
 
 class OnnxWrapper():
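For orientation (not part of the commit), here is a minimal, self-contained sketch of how the new controller behaves: a run of short utterances drives the threshold down toward min_ms, while long utterances and long pauses pull it back up. It restates the class above in compact form and assumes numpy is importable as np, as the np.mean calls imply; all durations are made up.

```python
# Standalone sketch: exercising the AdaptiveSilenceController logic
# from the diff above. Values are illustrative, not from the commit.
import numpy as np
from collections import deque

class AdaptiveSilenceController:  # same logic as the class in the diff
    def __init__(self, base_silence_ms=120, min_ms=50, max_ms=600):
        self.base, self.min, self.max = base_silence_ms, min_ms, max_ms
        self.recent_silences = deque(maxlen=20)
        self.recent_speeches = deque(maxlen=20)

    def update_silence(self, ms):
        self.recent_silences.append(ms)

    def update_speech(self, ms):
        self.recent_speeches.append(ms)

    def get_adaptive_silence_ms(self):
        avg_speech = np.mean(self.recent_speeches) if self.recent_speeches else self.base
        avg_silence = np.mean(self.recent_silences) if self.recent_silences else self.base
        speed_factor = 0.5 if avg_speech < 300 else 0.8 if avg_speech < 600 else 1.0
        return int(max(self.min, min(self.max, self.base * speed_factor + 0.3 * avg_silence)))

ctrl = AdaptiveSilenceController()

# Rapid speech: 200 ms utterances with 80 ms pauses.
for _ in range(10):
    ctrl.update_speech(200)
    ctrl.update_silence(80)
print(ctrl.get_adaptive_silence_ms())  # 120*0.5 + 0.3*80 = 84 ms

# Slow speech: 900 ms utterances with 500 ms pauses (deque keeps the last 20).
for _ in range(20):
    ctrl.update_speech(900)
    ctrl.update_silence(500)
print(ctrl.get_adaptive_silence_ms())  # 120*1.0 + 0.3*500 = 270 ms
```

The clamp to [min_ms, max_ms] keeps a burst of unusual segment lengths from driving the VAD threshold to an unusable value.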
transcribe/pipelines/pipe_vad.py
CHANGED

@@ -1,12 +1,12 @@
 
 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import FixedVADIterator,
+from ..helpers.vadprocessor import FixedVADIterator, AdaptiveSilenceController
 
 import numpy as np
 from silero_vad import get_speech_timestamps
 from typing import List
 import logging
-
+import time
 # import noisereduce as nr
 
 
@@ -18,11 +18,16 @@ class VadPipe(BasePipe):
         super().__init__(in_queue, out_queue)
         self._offset = 0  # frame offset processed so far
         self._status = 'END'
-
+        self.last_state_change_offset = 0
+        self.adaptive_ctrl = AdaptiveSilenceController()
+
 
     def reset(self):
         self._offset = 0
         self._status = 'END'
+        self.last_state_change_offset = 0
+        self.adaptive_ctrl = AdaptiveSilenceController()
+        self.vac.reset_states()
 
     @classmethod
     def init(cls):
@@ -48,32 +53,53 @@ class VadPipe(BasePipe):
         relative_end_frame = None
         start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
         if start_frame:
-            relative_start_frame =
+            relative_start_frame = start_frame - self._offset
         if end_frame:
             relative_end_frame = max(0, end_frame - self._offset)
         return relative_start_frame, relative_end_frame
-
+
+    def update_silence_ms(self):
+        min_silence = self.adaptive_ctrl.get_adaptive_silence_ms()
+        logging.debug(f"🫠 update_silence_ms :{min_silence} ")
+        self.vac.min_silence_duration_ms = min_silence
+
     def process(self, in_data: MetaItem) -> MetaItem:
         if self._offset == 0:
            self.vac.reset_states()
+
         # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
         source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
         speech_data = self._process_speech_chunk(source_audio)
 
         if speech_data:  # a speech boundary (change point) was detected
-            rel_start_frame, rel_end_frame = speech_data
+            self.update_silence_ms()
+            rel_start_frame, rel_end_frame = speech_data
             if rel_start_frame is not None and rel_end_frame is None:
                 self._status = "START"  # speech started
                 target_audio = source_audio[rel_start_frame:]
+
+                # compute the length of the preceding silence
+                silence_len = (self._offset + rel_start_frame - self.last_state_change_offset) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_silence(silence_len)
+                self.last_state_change_offset = self._offset + rel_start_frame
+
                 logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
             elif rel_start_frame is None and rel_end_frame is not None:
                 self._status = "END"  # speech ended
                 target_audio = source_audio[:rel_end_frame]
+
+                speech_len = (rel_end_frame) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_speech(speech_len)
+                self.last_state_change_offset = self._offset + rel_end_frame
                 logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
             else:
                 self._status = 'END'
                 target_audio = source_audio[rel_start_frame:rel_end_frame]
                 logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
+
+                seg_len = (rel_end_frame - rel_start_frame) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_speech(seg_len)
+                self.last_state_change_offset = self._offset + rel_end_frame
             # logging.debug("❌ No valid speech segment detected, setting status to END")
         else:
             if self._status == 'START':
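To make the bookkeeping in process() concrete, here is a small worked example of the frame-to-milliseconds conversion that feeds the controller. The 16 kHz sample rate is an assumption (a common rate for silero-vad); the variable names mirror the diff but this snippet is not part of the commit.

```python
# Worked example of the frame -> millisecond bookkeeping in VadPipe.process().
# Assumes a 16 kHz sample rate; all frame indices are illustrative.
sample_rate = 16_000

_offset = 32_000                   # absolute frame index where this chunk starts
last_state_change_offset = 28_800  # absolute frame where speech last ended
rel_start_frame = 1_600            # speech restarts 1600 frames into the chunk

# Silence between the previous speech end and this speech start:
silence_len = (_offset + rel_start_frame - last_state_change_offset) / sample_rate * 1000
print(silence_len)  # (32000 + 1600 - 28800) / 16000 * 1000 = 300.0 ms
```

That 300 ms observation lands in recent_silences, so the next update_silence_ms() call nudges vac.min_silence_duration_ms accordingly; the END branches measure speech segments the same way before calling update_speech().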