Xin Zhang committed on
Commit
e03f21e
·
1 Parent(s): 418e2a0

Update model and processor files

Browse files
moyoyo_asr_models/ggml-small.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f6ef171491de375b741059400ba9a0aead023122b7a7db731b4943f9baa0f97
3
  size 487601984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:951596a31b1c96a01b7a2b1bc511f665d900c679126134f6ec18db5ec4a485fe
3
  size 487601984
transcribe/helpers/vadprocessor.py CHANGED
@@ -137,7 +137,7 @@ class VADIteratorOnnx:
137
  return_seconds: bool (default - False)
138
  whether return timestamps in seconds (default - samples)
139
  """
140
-
141
  window_size_samples = 512 if self.sampling_rate == 16000 else 256
142
  x = x[:window_size_samples]
143
  if len(x) < window_size_samples:
@@ -156,7 +156,7 @@ class VADIteratorOnnx:
156
  speech_start = max(0, self.current_sample - window_size_samples)
157
  self.start = speech_start
158
  return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
159
-
160
  if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
161
  if self.temp_end:
162
  self.temp_end = 0
@@ -175,7 +175,7 @@ class VADIteratorOnnx:
175
  return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
176
 
177
  return None
178
-
179
 
180
 
181
  class VadV2:
@@ -267,15 +267,15 @@ class VadV2:
267
 
268
  return result
269
  return None
270
-
271
 
272
-
 
273
  class VadProcessor:
274
  def __init__(
275
  self,
276
  prob_threshold=0.5,
277
- silence_s=0.3,
278
- cache_s=0.25,
279
  sr=16000
280
  ):
281
  self.prob_thres = prob_threshold
@@ -284,7 +284,7 @@ class VadProcessor:
284
  self.silence_s = silence_s
285
 
286
  self.vad = VadV2(self.prob_thres, self.sr, self.silence_s * 1000, self.cache_s * 1000, max_speech_duration_s=15)
287
-
288
 
289
  def process_audio(self, audio_buffer: np.ndarray):
290
  audio = np.array([], np.float32)
 
137
  return_seconds: bool (default - False)
138
  whether return timestamps in seconds (default - samples)
139
  """
140
+
141
  window_size_samples = 512 if self.sampling_rate == 16000 else 256
142
  x = x[:window_size_samples]
143
  if len(x) < window_size_samples:
 
156
  speech_start = max(0, self.current_sample - window_size_samples)
157
  self.start = speech_start
158
  return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}
159
+
160
  if (speech_prob >= self.threshold) and self.current_sample - self.start >= self.max_speech_samples:
161
  if self.temp_end:
162
  self.temp_end = 0
 
175
  return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}
176
 
177
  return None
178
+
179
 
180
 
181
  class VadV2:
 
267
 
268
  return result
269
  return None
 
270
 
271
+
272
+
273
  class VadProcessor:
274
  def __init__(
275
  self,
276
  prob_threshold=0.5,
277
+ silence_s=0.2,
278
+ cache_s=0.15,
279
  sr=16000
280
  ):
281
  self.prob_thres = prob_threshold
 
284
  self.silence_s = silence_s
285
 
286
  self.vad = VadV2(self.prob_thres, self.sr, self.silence_s * 1000, self.cache_s * 1000, max_speech_duration_s=15)
287
+
288
 
289
  def process_audio(self, audio_buffer: np.ndarray):
290
  audio = np.array([], np.float32)