daihui.zhang committed
Commit aca5e0b · 1 Parent(s): 2d75b7c

add vad update_silence_ms adapter
transcribe/helpers/vadprocessor.py CHANGED
@@ -9,6 +9,38 @@ import logging
 from datetime import timedelta
 import gc
 from pydub import AudioSegment
+from collections import deque
+
+class AdaptiveSilenceController:
+    def __init__(self, base_silence_ms=120, min_ms=50, max_ms=600):
+        self.base = base_silence_ms
+        self.min = min_ms
+        self.max = max_ms
+        self.recent_silences = deque(maxlen=20)
+        self.recent_speeches = deque(maxlen=20)
+
+    def update_silence(self, duration_ms):
+        self.recent_silences.append(duration_ms)
+
+    def update_speech(self, duration_ms):
+        self.recent_speeches.append(duration_ms)
+
+    def get_adaptive_silence_ms(self):
+        # 1. Fast-speech signal: the average speech segment is short (e.g. < 300 ms)
+        avg_speech = np.mean(self.recent_speeches) if self.recent_speeches else self.base
+        avg_silence = np.mean(self.recent_silences) if self.recent_silences else self.base
+
+        # 2. Faster speech shortens the silence threshold
+        speed_factor = 1.0
+        if avg_speech < 300:
+            speed_factor = 0.5
+        elif avg_speech < 600:
+            speed_factor = 0.8
+
+        # 3. Also fold in the recent silence trend
+        adaptive = self.base * speed_factor + 0.3 * avg_silence
+
+        return int(max(self.min, min(self.max, adaptive)))
 
 
 class OnnxWrapper():
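
For a sense of the thresholding, a minimal standalone exercise of the new controller; the import path comes from this repo, while the durations fed in and the expected output are illustrative assumptions, not part of the commit:

from transcribe.helpers.vadprocessor import AdaptiveSilenceController

ctrl = AdaptiveSilenceController(base_silence_ms=120, min_ms=50, max_ms=600)

# Simulate rapid back-and-forth speech: short utterances with short gaps
# (hypothetical values, chosen to trip the avg_speech < 300 branch).
for _ in range(5):
    ctrl.update_speech(200)   # 200 ms utterances
    ctrl.update_silence(100)  # 100 ms gaps

# avg_speech = 200 -> speed_factor = 0.5
# adaptive = 120 * 0.5 + 0.3 * 100 = 90, already within [50, 600]
print(ctrl.get_adaptive_silence_ms())  # 90

Note that with an empty history both averages fall back to base_silence_ms = 120, which itself trips the < 300 ms fast-speech branch, so a fresh controller returns int(120 * 0.5 + 0.3 * 120) = 96 ms rather than the base 120 ms.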
transcribe/pipelines/pipe_vad.py CHANGED
@@ -1,12 +1,12 @@
 
 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import FixedVADIterator, SileroVADProcessor
+from ..helpers.vadprocessor import FixedVADIterator, AdaptiveSilenceController
 
 import numpy as np
 from silero_vad import get_speech_timestamps
 from typing import List
 import logging
-
+import time
 # import noisereduce as nr
 
 
@@ -18,11 +18,16 @@ class VadPipe(BasePipe):
         super().__init__(in_queue, out_queue)
         self._offset = 0  # frame offset processed so far
         self._status = 'END'
-
+        self.last_state_change_offset = 0
+        self.adaptive_ctrl = AdaptiveSilenceController()
+
 
     def reset(self):
         self._offset = 0
         self._status = 'END'
+        self.last_state_change_offset = 0
+        self.adaptive_ctrl = AdaptiveSilenceController()
+        self.vac.reset_states()
 
     @classmethod
     def init(cls):
@@ -48,32 +53,53 @@ class VadPipe(BasePipe):
         relative_end_frame = None
         start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
         if start_frame:
             relative_start_frame = start_frame - self._offset
         if end_frame:
             relative_end_frame = max(0, end_frame - self._offset)
         return relative_start_frame, relative_end_frame
 
+    def update_silence_ms(self):
+        min_silence = self.adaptive_ctrl.get_adaptive_silence_ms()
+        logging.debug(f"🫠 update_silence_ms: {min_silence}")
+        self.vac.min_silence_duration_ms = min_silence
+
     def process(self, in_data: MetaItem) -> MetaItem:
         if self._offset == 0:
             self.vac.reset_states()
+
         # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
         source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
         speech_data = self._process_speech_chunk(source_audio)
 
         if speech_data:  # a speech change point (start or end) was detected
+            self.update_silence_ms()
             rel_start_frame, rel_end_frame = speech_data
             if rel_start_frame is not None and rel_end_frame is None:
                 self._status = "START"  # speech started
                 target_audio = source_audio[rel_start_frame:]
+
+                # measure the length of the silence segment that just ended
+                silence_len = (self._offset + rel_start_frame - self.last_state_change_offset) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_silence(silence_len)
+                self.last_state_change_offset = self._offset + rel_start_frame
+
                 logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
             elif rel_start_frame is None and rel_end_frame is not None:
                 self._status = "END"  # speech ended
                 target_audio = source_audio[:rel_end_frame]
+
+                speech_len = rel_end_frame / self.sample_rate * 1000
+                self.adaptive_ctrl.update_speech(speech_len)
+                self.last_state_change_offset = self._offset + rel_end_frame
                 logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
             else:
                 self._status = 'END'
                 target_audio = source_audio[rel_start_frame:rel_end_frame]
                 logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
+
+                seg_len = (rel_end_frame - rel_start_frame) / self.sample_rate * 1000
+                self.adaptive_ctrl.update_speech(seg_len)
+                self.last_state_change_offset = self._offset + rel_end_frame
             # logging.debug("❌ No valid speech segment detected, setting status to END")
         else:
             if self._status == 'START':
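
The durations fed to the controller are derived from absolute frame offsets kept by the pipe. A minimal sketch of that bookkeeping, with a hypothetical sample_rate of 16000 and made-up frame positions (the pipe's actual rate and offsets are runtime state, not shown in this commit):

sample_rate = 16000               # assumed for this sketch
offset = 32000                    # frames consumed before this chunk (2.0 s in)
last_state_change_offset = 28800  # absolute frame of the previous START/END boundary

rel_start_frame = 1600            # VAD reports speech starting 0.1 s into the chunk

# Length of the silence that just ended, as computed in process():
silence_len = (offset + rel_start_frame - last_state_change_offset) / sample_rate * 1000
print(silence_len)  # (32000 + 1600 - 28800) / 16000 * 1000 = 300.0 ms

Each boundary then advances last_state_change_offset to the absolute frame of the event, so the next silence or speech measurement starts from the latest boundary, and update_silence_ms() pushes the adaptive threshold into self.vac.min_silence_duration_ms before the next change point is handled.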