daihui.zhang committed on
Commit ca5d527 · 1 Parent(s): 484b9cf

update to vad streaming

transcribe/helpers/vadprocessor.py CHANGED
@@ -2,10 +2,15 @@ from copy import deepcopy
from queue import Queue, Empty
from time import time
from config import VAD_MODEL_PATH
-# from silero_vad import load_silero_vad
+from silero_vad import load_silero_vad
import numpy as np
import onnxruntime
+import logging
+from datetime import timedelta
+import gc
+from pydub import AudioSegment

+
class OnnxWrapper():

    def __init__(self, path, force_onnx_cpu=False):
 
@@ -178,6 +183,33 @@ class VADIteratorOnnx:



+
+class FixedVADIterator(VADIteratorOnnx):
+    '''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
+    If audio to be processed at once is long and multiple voiced segments detected,
+    then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
+    '''
+
+    def reset_states(self):
+        super().reset_states()
+        self.buffer = np.array([], dtype=np.float32)
+
+    def __call__(self, x, return_seconds=False):
+        self.buffer = np.append(self.buffer, x)
+        ret = None
+        while len(self.buffer) >= 512:
+            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
+            self.buffer = self.buffer[512:]
+            if ret is None:
+                ret = r
+            elif r is not None:
+                if 'end' in r:
+                    ret['end'] = r['end']  # the latter end
+                if 'start' in r and 'end' in ret:  # there is an earlier start
+                    # Remove end, merging this segment with the previous one.
+                    del ret['end']
+        return ret if ret != {} else None
+
class VadV2:
    def __init__(self,
                 threshold: float = 0.5,
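A minimal usage sketch for the streaming iterator added above (not part of the commit): chunks of any length are appended to an internal buffer and consumed 512 samples at a time, and the returned dict carries absolute sample offsets. The chunk source and sizes here are illustrative assumptions; the constructor arguments mirror the ones used in pipe_vad.py.

    import numpy as np
    from transcribe.helpers.vadprocessor import FixedVADIterator

    vad = FixedVADIterator(threshold=0.3, sampling_rate=16000,
                           min_silence_duration_ms=100, max_speech_duration_s=15)
    vad.reset_states()

    incoming_chunks = [np.zeros(1600, dtype=np.float32)] * 10  # placeholder: real chunks come from the client stream
    for chunk in incoming_chunks:
        event = vad(chunk, return_seconds=False)
        if event and 'start' in event:
            print("speech starts at sample", event['start'])
        if event and 'end' in event:
            print("speech ends at sample", event['end'])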
 
@@ -269,6 +301,235 @@ class VadV2:
        return None


+class SileroVADProcessor:
+    """
+    A class for processing audio files using Silero VAD to detect voice activity
+    and extract voice segments from audio files.
+    """
+
+    def __init__(self,
+                 activate_threshold=0.5,
+                 fusion_threshold=0.3,
+                 min_speech_duration=0.25,
+                 max_speech_duration=20,
+                 min_silence_duration=250,
+                 sample_rate=16000,
+                 ort_providers=None):
+        """
+        Initialize the SileroVADProcessor.
+        Args:
+            activate_threshold (float): Threshold for voice activity detection
+            fusion_threshold (float): Threshold for merging close speech segments (seconds)
+            min_speech_duration (float): Minimum duration of speech to be considered valid (seconds)
+            max_speech_duration (float): Maximum duration of speech (seconds)
+            min_silence_duration (int): Minimum silence duration (ms)
+            sample_rate (int): Sample rate of the audio (8000 or 16000 Hz)
+            ort_providers (list): ONNX Runtime providers for acceleration
+        """
+        # VAD parameters
+        self.activate_threshold = activate_threshold
+        self.fusion_threshold = fusion_threshold
+        self.min_speech_duration = min_speech_duration
+        self.max_speech_duration = max_speech_duration
+        self.min_silence_duration = min_silence_duration
+        self.sample_rate = sample_rate
+        self.ort_providers = ort_providers if ort_providers else []
+
+        # Initialize logger
+        self.logger = logging.getLogger(__name__)
+
+        # Load Silero VAD model
+        self._init_onnx_session()
+        self.silero_vad = load_silero_vad(onnx=True)
+
+    def _init_onnx_session(self):
+        """Initialize ONNX Runtime session with appropriate settings."""
+        session_opts = onnxruntime.SessionOptions()
+        session_opts.log_severity_level = 3
+        session_opts.inter_op_num_threads = 0
+        session_opts.intra_op_num_threads = 0
+        session_opts.enable_cpu_mem_arena = True
+        session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
+        session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
+        session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")
+        session_opts.add_session_config_entry("session.set_denormal_as_zero", "1")
+
+        # Set the session_opts to be used by silero_vad
+        # onnxruntime.capi._pybind_state.get_default_session_options(session_opts)
+
+    def load_audio(self, audio_path):
+        """
+        Load audio file and prepare it for VAD processing.
+        Args:
+            audio_path (str): Path to the audio file
+        Returns:
+            numpy.ndarray: Audio data as numpy array
+        """
+        self.logger.info(f"Loading audio from {audio_path}")
+        audio_segment = AudioSegment.from_file(audio_path)
+        audio_segment = audio_segment.set_channels(1).set_frame_rate(self.sample_rate)
+
+        # Convert to numpy array and normalize
+        dtype = np.float16 if self.use_gpu_fp16 else np.float32
+        audio_array = np.array(audio_segment.get_array_of_samples(), dtype=dtype) * 0.000030517578  # 1/32768
+
+        self.audio_segment = audio_segment  # Store for later use
+        return audio_array
+
+    @property
+    def model(self):
+        return self.silero_vad
+
+    def process_timestamps(self, timestamps):
+        """
+        Process VAD timestamps: filter short segments and merge close segments.
+        Args:
+            timestamps (list): List of (start, end) tuples
+        Returns:
+            list: Processed list of (start, end) tuples
+        """
+        # Filter out short durations
+        filtered_timestamps = [(start, end) for start, end in timestamps
+                               if (end - start) >= self.min_speech_duration]
+
+        # Fuse timestamps in two passes for better merging
+        fused_timestamps_1st = []
+        for start, end in filtered_timestamps:
+            if fused_timestamps_1st and (start - fused_timestamps_1st[-1][1] <= self.fusion_threshold):
+                fused_timestamps_1st[-1] = (fused_timestamps_1st[-1][0], end)
+            else:
+                fused_timestamps_1st.append((start, end))
+
+        fused_timestamps_2nd = []
+        for start, end in fused_timestamps_1st:
+            if fused_timestamps_2nd and (start - fused_timestamps_2nd[-1][1] <= self.fusion_threshold):
+                fused_timestamps_2nd[-1] = (fused_timestamps_2nd[-1][0], end)
+            else:
+                fused_timestamps_2nd.append((start, end))
+
+        return fused_timestamps_2nd
+
+    def format_time(self, seconds):
+        """
+        Convert seconds to VTT time format 'hh:mm:ss.mmm'.
+        Args:
+            seconds (float): Time in seconds
+        Returns:
+            str: Formatted time string
+        """
+        td = timedelta(seconds=seconds)
+        td_sec = td.total_seconds()
+        total_seconds = int(td_sec)
+        milliseconds = int((td_sec - total_seconds) * 1000)
+        hours = total_seconds // 3600
+        minutes = (total_seconds % 3600) // 60
+        seconds = total_seconds % 60
+        return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
+
+    def detect_speech(self, audio: np.array):
+        """
+        Run VAD on the audio file to detect speech segments.
+        Args:
+            audio_path (str): Path to the audio file
+        Returns:
+            list: List of processed timestamps as (start, end) tuples
+        """
+        self.logger.info("Starting VAD process")
+        start_time = time.time()
+        # Get speech timestamps
+        raw_timestamps = get_speech_timestamps(
+            audio,
+            model=self.silero_vad,
+            threshold=self.activate_threshold,
+            max_speech_duration_s=self.max_speech_duration,
+            min_speech_duration_ms=int(self.min_speech_duration * 1000),
+            min_silence_duration_ms=self.min_silence_duration,
+            return_seconds=True
+        )
+
+        # Convert to simple format and process
+        timestamps = [(item['start'], item['end']) for item in raw_timestamps]
+        processed_timestamps = self.process_timestamps(timestamps)
+
+        # Clean up
+        del audio
+        gc.collect()
+
+        self.logger.info(f"VAD completed in {time.time() - start_time:.3f} seconds")
+        return processed_timestamps
+
+    """
+    Save timestamps in both second and sample indices formats.
+    Args:
+        timestamps (list): List of (start, end) tuples
+        output_prefix (str): Prefix for output files
+    """
+    # Save timestamps in seconds (VTT format)
+    seconds_path = f"{output_prefix}_timestamps_second.txt"
+    with open(seconds_path, "w", encoding='UTF-8') as file:
+        self.logger.info("Saving timestamps in seconds format")
+        for start, end in timestamps:
+            s_time = self.format_time(start)
+            e_time = self.format_time(end)
+            line = f"{s_time} --> {e_time}\n"
+            file.write(line)
+
+    # Save timestamps in sample indices
+    indices_path = f"{output_prefix}_timestamps_indices.txt"
+    with open(indices_path, "w", encoding='UTF-8') as file:
+        self.logger.info("Saving timestamps in indices format")
+        for start, end in timestamps:
+            line = f"{int(start * self.sample_rate)} --> {int(end * self.sample_rate)}\n"
+            file.write(line)
+
+    self.logger.info(f"Timestamps saved to {seconds_path} and {indices_path}")
+
+    def extract_speech_segments(self, audio_segment, timestamps):
+        """
+        Extract speech segments from the audio and combine them into a single audio file.
+        Args:
+            timestamps (list): List of (start, end) tuples indicating speech segments
+        Returns:
+            AudioSegment: The combined speech segments
+        """
+        audio_segment = audio_segment.numpy()
+        combined_speech = np.array([], dtype=np.float32)
+
+        # Extract and combine each speech segment
+        for i, (start, end) in enumerate(timestamps):
+            # Convert seconds to milliseconds for pydub
+            start_ms = int(start * 1000)
+            end_ms = int(end * 1000)
+
+            # Ensure the end time does not exceed the length of the audio segment
+            if end_ms > len(audio_segment):
+                end_ms = len(audio_segment)
+
+            # Extract the segment
+            segment = audio_segment[start_ms:end_ms]
+
+            # Add to combined audio
+            combined_speech = np.append(combined_speech, segment)
+
+        return combined_speech
+
+    def process_audio(self, audio_array: np.array):
+        """
+        Complete processing pipeline: detect speech, save timestamps, and optionally extract speech.
+        Returns:
+            tuple: (timestamps, output_speech_path if extract_speech else None)
+        """
+
+        # Run VAD to detect speech
+        timestamps = self.detect_speech(audio_array)
+
+        combined_speech = self.extract_speech_segments(audio_array, timestamps)
+
+        return timestamps, combined_speech
+
+

class VadProcessor:
    def __init__(
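For orientation, a hedged sketch of the offline path SileroVADProcessor is meant to cover (file in, merged timestamps and speech-only samples out). The file name is a placeholder and the snippet assumes the class is importable from this module as shown; it is an illustration, not part of the commit.

    from transcribe.helpers.vadprocessor import SileroVADProcessor

    processor = SileroVADProcessor(activate_threshold=0.5, sample_rate=16000)
    audio = processor.load_audio("example.wav")            # placeholder path
    timestamps, speech = processor.process_audio(audio)    # [(start_s, end_s), ...], np.ndarray
    for start, end in timestamps:
        print(processor.format_time(start), "-->", processor.format_time(end))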
transcribe/pipelines/base.py CHANGED
@@ -22,6 +22,7 @@ class MetaItem:
    translate_content: str = ''
    source_language: str = 'zh'
    destination_language: str = 'en'
+    speech_status: str = 'END'  # "END", "START"


class BasePipe(Process):
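A small sketch of what the new field means for consumers, assuming MetaItem is the dataclass passed between pipeline stages and VadPipe is used in-process as below (illustrative only):

    import numpy as np
    from transcribe.pipelines import MetaItem, VadPipe

    VadPipe.init()                              # loads the shared FixedVADIterator
    vad_pipe = VadPipe()
    chunk = np.zeros(1600, dtype=np.float32)    # placeholder audio chunk
    item = vad_pipe.process(MetaItem(source_audio=chunk.tobytes()))
    if item.speech_status == "END" and item.audio:
        pass  # a complete utterance was just closed; item.audio holds its samples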
transcribe/pipelines/pipe_vad.py CHANGED
@@ -1,6 +1,7 @@

from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import VadV2
+from ..helpers.vadprocessor import FixedVADIterator, SileroVADProcessor
+
import numpy as np
from silero_vad import get_speech_timestamps
from typing import List
@@ -12,30 +13,83 @@ import logging
class VadPipe(BasePipe):
    vac = None
    sample_rate = 16000
-    window_size_samples = 512
-    chunk_size = 512
-    prob_threshold=0.5,
-    silence_s=0.5,
-    cache_s=0.25,
-

+    def __init__(self, in_queue=None, out_queue=None) -> None:
+        super().__init__(in_queue, out_queue)
+        self._offset = 0  # offset (in samples) of the frames processed so far
+        self._status = 'END'
+
+    def reset(self):
+        self._offset = 0
+        self._status = 'END'
+
    @classmethod
    def init(cls):
        if cls.vac is None:
-            cls.vac = VadV2(cls.prob_threshold, cls.sample_rate, cls.silence_s * 1000, cls.cache_s * 1000, max_speech_duration_s=15)
+            cls.vac = FixedVADIterator(
+                threshold=0.3,
+                sampling_rate=cls.sample_rate,
+                # speech_pad_ms=10
+                min_silence_duration_ms=100,
+                # speech_pad_ms=30,
+                max_speech_duration_s=15
+            )
+            cls.vac.reset_states()
+
+    # def reduce_noise(self, data):
+    #     return nr.reduce_noise(y=data, sr=self.sample_rate)
+
+    def _process_speech_chunk(self, source_audio: np.ndarray):
+        speech_dict = self.vac(source_audio, return_seconds=False)
+        if speech_dict:
+            relative_start_frame = None
+            relative_end_frame = None
+            start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
+            if start_frame:
+                relative_start_frame = start_frame - self._offset
+            if end_frame:
+                relative_end_frame = end_frame - self._offset
+            return relative_start_frame, relative_end_frame

    def process(self, in_data: MetaItem) -> MetaItem:
-        audio_buffer = np.frombuffer(in_data.source_audio)
-        vad_audio = self.vac(audio_buffer)
-        if vad_audio:
-            in_data.audio = vad_audio['audio']
+        if self._offset == 0:
+            self.vac.reset_states()
+        # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
+        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
+        speech_data = self._process_speech_chunk(source_audio)
+
+        if speech_data:  # a speech boundary (start and/or end) appeared in this chunk
+            rel_start_frame, rel_end_frame = speech_data
+            if rel_start_frame and not rel_end_frame:
+                self._status = "START"  # speech started
+                target_audio = source_audio[rel_start_frame:]
+                logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
+            elif not rel_start_frame and rel_end_frame:
+                self._status = "END"  # speech ended
+                target_audio = source_audio[:rel_end_frame]
+                logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
+            elif rel_start_frame and rel_end_frame:
+                self._status = 'END'
+                target_audio = source_audio[rel_start_frame:rel_end_frame]
+                logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
+            else:
+                self._status = 'END'
+                target_audio = np.array([], dtype=np.float32)
+                # logging.debug("❌ No valid speech segment detected, setting status to END")
        else:
-            in_data.audio = b""
-        return in_data
-
-    # def reduce_noise(self, data):
-    #     return nr.reduce_noise(y=data, sr=self.sample_rate)
-
+            if self._status == 'START':
+                target_audio = source_audio
+                # logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
+            else:  # end
+                target_audio = np.array([], dtype=np.float32)
+                # self._status = 'END'
+                # logging.debug("❌ No speech detected, setting status to END")

+        self._offset += len(source_audio)

+        in_data.audio = target_audio.tobytes()
+        in_data.source_audio = b''
+        in_data.speech_status = self._status
+        return in_data
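To make the frame bookkeeping above concrete, an illustrative calculation (all numbers invented): FixedVADIterator reports absolute sample positions counted from the last reset, so process() subtracts self._offset to index into the current chunk.

    import numpy as np

    source_audio = np.zeros(1600, dtype=np.float32)   # 5th chunk: 0.1 s at 16 kHz (invented)
    offset = 4 * 1600                                 # value of self._offset before this chunk: 6400
    event = {'start': 6720}                           # what the iterator might report (invented)
    relative_start = event['start'] - offset          # 320: speech starts 320 samples into this chunk
    target_audio = source_audio[relative_start:]      # keep audio from the detected onset onward
    offset += len(source_audio)                       # 8000, ready for the next chunk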
transcribe/translatepipes.py CHANGED
@@ -1,4 +1,4 @@
-from transcribe.pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe
+from transcribe.pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe, VadPipe


class TranslatePipes:
@@ -9,7 +9,7 @@ class TranslatePipes:
        self._process = []
        # whisper transcription
        self._whisper_pipe_en = self._launch_process(WhisperPipe())
-        self._whisper_pipe_zh = self._launch_process(WhisperChinese())
+        # self._whisper_pipe_zh = self._launch_process(WhisperChinese())
        self._funasr_pipe = self._launch_process(FunASRPipe())

        # llm translation
@@ -17,7 +17,7 @@ class TranslatePipes:

        self._translate_7b_pipe = self._launch_process(Translate7BPipe())
        # vad
-        # self._vad_pipe = self._launch_process(VadPipe())
+        self._vad_pipe = self._launch_process(VadPipe())

    # def reset(self):
    #     self._vad_pipe.reset()
transcribe/whisper_llm_serve.py CHANGED
@@ -8,14 +8,14 @@ from typing import List, Optional, Iterator, Tuple, Any
import asyncio
import numpy as np
import config
-
+import collections
from api_model import TransResult, Message, DebugResult

from .utils import log_block, save_to_wave, TestDataWriter
from .translatepipes import TranslatePipes
from .strategy import (
    TranscriptStabilityAnalyzer, TranscriptToken)
-from transcribe.helpers.vadprocessor import VadProcessor
+# from transcribe.helpers.vadprocessor import VadProcessor
from transcribe.pipelines import MetaItem

logger = getLogger("TranscriptionService")
@@ -43,13 +43,18 @@ class WhisperTranscriptionService:
        self.sample_rate = 16000

        self.lock = threading.Lock()
-        self._frame_queue = queue.Queue()
-        self._vad_frame_queue = queue.Queue()
+

        # text separator, chosen according to the language
        self.text_separator = self._get_text_separator(language)
        self.loop = asyncio.get_event_loop()
        # send ready status
+        # queue of raw audio frames
+        self._frame_queue = queue.Queue()
+        # buffer of VAD-filtered audio
+        self.frames_np = None
+        # queue of completed speech segments
+        self.segments_queue = collections.deque()

        self._transcrible_analysis = None
        # start the processing threads
@@ -58,10 +63,10 @@ class WhisperTranscriptionService:

        self.translate_thread = self._start_thread(self._transcription_processing_loop)
        self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
-        if language == "zh":
-            self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
-        else:
-            self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
+        # if language == "zh":
+        #     self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
+        # else:
+        #     self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
        self.row_number = 0
        # for test
        self._transcrible_time_cost = 0.
@@ -111,24 +116,94 @@ class WhisperTranscriptionService:
        """Add an audio frame to the processing queue"""
        self._frame_queue.put(frame_np)

+    def _apply_voice_activity_detection(self, frame_np: np.array):
+        """Apply voice activity detection to refine the audio buffer"""
+        processed_audio = self._translate_pipe.voice_detect(frame_np.tobytes())
+        speech_audio = np.frombuffer(processed_audio.audio, dtype=np.float32)
+        speech_status = processed_audio.speech_status
+        return speech_audio, speech_status
+
    def _frame_processing_loop(self) -> None:
        """Take audio frames from the queue and merge them into the buffer"""
        while not self._frame_processing_thread_stop.is_set():
            try:
-                audio = self._frame_queue.get(timeout=0.1)
-                # save_to_wave(f"{self._c}_before_vad.wav", audio)
-                processed_audio = self._vad.process_audio(audio)
-                if processed_audio.shape[0] > 0:
-                    # vad_processed_audio = processed_audio
-                    # save_to_wave(f"{self._c}_after_vad.wav", processed_audio)
-                    # vad_frame_obj = np.frombuffer(processed_audio.audio, dtype=np.float32)
-                    logger.debug(f"Vad frame: {processed_audio.shape[0]/self.sample_rate:.2f}")
-                    # apply vad speech check:
-                    self._vad_frame_queue.put(processed_audio)
+                frame_np = self._frame_queue.get(timeout=0.1)
+                frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
+                if frame_np is None:
+                    continue
+                with self.lock:
+                    if self.frames_np is None:
+                        self.frames_np = frame_np.copy()
+                    else:
+                        self.frames_np = np.append(self.frames_np, frame_np)
+                    if speech_status == "END" and len(self.frames_np) > 0:
+                        self.segments_queue.appendleft(self.frames_np.copy())
+                        self.frames_np = np.array([], dtype=np.float32)
            except queue.Empty:
                pass

+    def _process_transcription_results_2(self, segments: List[TranscriptToken], partial):
+        seg_text = self.text_separator.join(seg.text for seg in segments)
+        item = TransResult(
+            seg_id=self.row_number,
+            context=seg_text,
+            from_=self.source_language,
+            to=self.target_language,
+            tran_content=self._translate_text_large(seg_text),
+            partial=partial
+        )
+        if partial == False:
+            self.row_number += 1
+        return item
+
+    def _transcription_processing_loop(self) -> None:
+        """Main transcription processing loop"""
+        frame_epoch = 1
+        while not self._translate_thread_stop.is_set():
+
+            if self.frames_np is None:
+                time.sleep(0.2)
+                continue
+
+            with self.lock:
+                if len(self.segments_queue) > 0:
+                    audio_buffer = self.segments_queue.pop()
+                    partial = False
+                else:
+                    audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)]  # take 1.5 s * epoch of audio
+                    partial = True
+
+            if len(audio_buffer) == 0:
+                time.sleep(0.2)
+                continue
+
+            if len(audio_buffer) < int(self.sample_rate):
+                silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
+                silence_audio[-len(audio_buffer):] = audio_buffer
+                audio_buffer = silence_audio
+
+            logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
+            # try:
+            meta_item = self._transcribe_audio(audio_buffer)
+            segments = meta_item.segments
+            logger.debug(f"Segments: {segments}")
+            if len(segments):
+                result = self._process_transcription_results_2(segments, partial)
+                self._send_result_to_client(result)
+            time.sleep(0.1)
+
+            if partial == False:
+                frame_epoch = 1
+            else:
+                frame_epoch += 1
+            # process the transcription results and send them to the client
+            # for result in self._process_transcription_results(segments, audio_buffer):
+            #     self._send_result_to_client(result)
+
+            # except Exception as e:
+            #     logger.error(f"Error processing audio: {e}")

+
    def _transcribe_audio(self, audio_buffer: np.ndarray) -> MetaItem:
        """Transcribe the audio and return its transcript segments"""
        log_block("Audio buffer length", f"{audio_buffer.shape[0]/self.sample_rate:.2f}", "s")
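Taken together, the loop above implements a growing-window partial/final policy: while the VAD has not closed an utterance, ever-larger prefixes of the live buffer are transcribed and sent as partial results; once a segment arrives in segments_queue it is transcribed once more and sent as final. A short runnable sketch under assumed values (1.5 s step, 16 kHz, invented VAD states), not part of the commit:

    sample_rate = 16000
    frame_epoch = 1
    events = ["...", "...", "...", "END"]   # invented speech_status values for 4 loop ticks
    for speech_status in events:
        if speech_status == "END":
            # a finished segment is popped from segments_queue and sent with partial=False;
            # row_number advances and the partial window resets
            frame_epoch = 1
            print("final segment emitted, window reset")
        else:
            window = int(frame_epoch * 1.5 * sample_rate)   # 1.5 s, 3.0 s, 4.5 s of live audio
            print(f"partial pass: transcribe first {window / sample_rate:.1f}s, partial=True")
            frame_epoch += 1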
 
@@ -176,47 +251,7 @@ class WhisperTranscriptionService:
        self._translate_time_cost = round(time_diff, 3)
        return translated_text

-    def _transcription_processing_loop(self) -> None:
-        """Main transcription processing loop"""
-
-        while not self._translate_thread_stop.is_set():
-            audio_buffer = self._vad_frame_queue.get()
-            if audio_buffer is None:
-                time.sleep(0.2)
-                continue
-            if len(audio_buffer) < int(self.sample_rate):
-                silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
-                silence_audio[-len(audio_buffer):] = audio_buffer
-                audio_buffer = silence_audio
-
-            logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
-            # try:
-            meta_item = self._transcribe_audio(audio_buffer)
-            segments = meta_item.segments
-            logger.debug(f"Segments: {segments}")
-            if len(segments):
-                result = self._process_transcription_results_2(segments)
-                self._send_result_to_client(result)
-            time.sleep(0.1)
-            # process the transcription results and send them to the client
-            # for result in self._process_transcription_results(segments, audio_buffer):
-            #     self._send_result_to_client(result)
-
-            # except Exception as e:
-            #     logger.error(f"Error processing audio: {e}")
-
-    def _process_transcription_results_2(self, segments: List[TranscriptToken],):
-        seg_text = self.text_separator.join(seg.text for seg in segments)
-        item = TransResult(
-            seg_id=self.row_number,
-            context=seg_text,
-            from_=self.source_language,
-            to=self.target_language,
-            tran_content=self._translate_text_large(seg_text),
-            partial=False
-        )
-        self.row_number += 1
-        return item



    def _process_transcription_results(self, segments: List[TranscriptToken], audio_buffer: np.ndarray) -> Iterator[TransResult]:
        """