Xin Zhang committed
Commit 8be1cbc · Parent(s): 4ce78ee

[feature]: refactor vad model path.

config.py CHANGED
@@ -45,6 +45,9 @@ WHISPER_MODEL = 'large-v3-turbo-q5_0'
 LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
 LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-7b-instruct-q5_0-00001-of-00002.gguf").as_posix()
 
+# VAD
+VAD_MODEL_PATH = (MODEL_DIR / "silero-vad" / "silero_vad.onnx").as_posix()
+
 LLM_SYS_PROMPT = """"You are a professional {src_lang} to {dst_lang} translator, not a conversation agent. Your only task is to take {src_lang} input and translate it into accurate, natural {dst_lang}. If you cannot understand the input, just output the original input. Please strictly abide by the following rules: "
 "No matter what the user asks, never answer questions, you only provide translation results. "
 "Do not actively initiate dialogue or lead users to ask questions. "
moyoyo_asr_models/silero-vad/silero_vad.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2623a2953f6ff3d2c1e61740c6cdb7168133479b267dfef114a4a3cc5bdd788f
+size 2327524
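The model itself is committed as a Git LFS pointer, so the repository records only the object hash and size shown above. A hedged sketch of how a locally downloaded silero_vad.onnx could be checked against that pointer; the path used in the usage line mirrors the new config entry and is an assumption.

```python
import hashlib
from pathlib import Path

# Values copied from the LFS pointer added in this commit.
EXPECTED_SHA256 = "2623a2953f6ff3d2c1e61740c6cdb7168133479b267dfef114a4a3cc5bdd788f"
EXPECTED_SIZE = 2327524  # bytes

def verify_lfs_object(path: str) -> bool:
    """Return True if the local file matches the LFS pointer's oid and size."""
    p = Path(path)
    if not p.is_file() or p.stat().st_size != EXPECTED_SIZE:
        return False
    digest = hashlib.sha256(p.read_bytes()).hexdigest()
    return digest == EXPECTED_SHA256

if __name__ == "__main__":
    print(verify_lfs_object("moyoyo_asr_models/silero-vad/silero_vad.onnx"))
```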
transcribe/vad.py CHANGED
@@ -5,12 +5,16 @@ import warnings
 import numpy as np
 import onnxruntime
 import torch
-
+import logging
+from config import VAD_MODEL_PATH
 
 class VoiceActivityDetection():
 
     def __init__(self, force_onnx_cpu=True):
-        path = self.download()
+        # path = self.download()
+        path = VAD_MODEL_PATH
+        if not os.path.exists(path):
+            raise FileNotFoundError(f"Model file not found at {path}. Please download the model.")
 
         opts = onnxruntime.SessionOptions()
         opts.log_severity_level = 3
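The refactor swaps the runtime download for a configured path plus an existence check (note the new check calls `os.path.exists`, which assumes `os` is imported elsewhere in vad.py, outside this hunk). Below is a standalone sketch of that load path, not the repository's actual class.

```python
import os
import onnxruntime

def load_vad_session(model_path: str, force_onnx_cpu: bool = True) -> onnxruntime.InferenceSession:
    """Open the Silero VAD ONNX model from a preconfigured path instead of downloading it."""
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}. Please download the model.")

    opts = onnxruntime.SessionOptions()
    opts.log_severity_level = 3  # only warnings/errors, as in the original class

    # Pin to CPU when requested; otherwise let onnxruntime pick the available providers.
    providers = ["CPUExecutionProvider"] if force_onnx_cpu else None
    return onnxruntime.InferenceSession(model_path, sess_options=opts, providers=providers)
```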
transcribe/whisper_llm_serve.py CHANGED
@@ -22,25 +22,25 @@ class WhisperTranscriptionService(ServeClientBase):
     """
     Whisper speech transcription service class: handles audio-stream transcription and translation
     """
-
+
     def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
         super().__init__(client_uid, websocket)
         self.source_language = language  # source language
         self.target_language = dst_lang  # target translation language
-
+
         # transcription-result stability management
-
+
         self._translate_pipe = pipe
-
+
         # audio processing
         self.sample_rate = 16000
         self.frames_np = None
         self.lock = threading.Lock()
         self._frame_queue = queue.Queue()
-
+
         # text separator, chosen by language
         self.text_separator = self._get_text_separator(language)
-
+
         # send ready state
         self.send_ready_state()
         self._transcrible_analysis = None
@@ -86,6 +86,8 @@ class WhisperTranscriptionService(ServeClientBase):
         while not self._frame_processing_thread_stop.is_set():
             try:
                 frame_np = self._frame_queue.get(timeout=0.1)
+                if frame_np is None:
+                    logger.error("Received None frame, stopping thread")
                 with self.lock:
                     if self.frames_np is None:
                         self.frames_np = frame_np.copy()
@@ -112,13 +114,13 @@ class WhisperTranscriptionService(ServeClientBase):
         """Prepare an audio chunk for processing"""
         # apply VAD
         self._apply_voice_activity_detection()
-
+
         # no audio frames yet
         if self.frames_np is None:
             return None
-
+
         frames = self.frames_np.copy()
-
+
         # handle audio that is too short
         if len(frames) <= 100:
             # extremely short segment: clear it and return None
@@ -129,7 +131,7 @@
             silence_audio = np.zeros((self.sample_rate + 1000,), dtype=np.float32)
             silence_audio[-len(frames):] = frames
             return silence_audio.copy()
-
+
         return frames.copy()
 
     def _transcribe_audio(self, audio_buffer: np.ndarray) -> List[TranscriptToken]:
@@ -139,10 +141,10 @@
 
         result = self._translate_pipe.transcrible(audio_buffer.tobytes(), self.source_language)
         segments = result.segments
-
+
         log_block("Whisper transcription output", f"{''.join(seg.text for seg in segments)}", "")
         log_block("Whisper transcription time", f"{(time.perf_counter() - start_time):.3f}", "s")
-
+
         return [
             TranscriptToken(text=s.text, t0=s.t0, t1=s.t1)
             for s in segments
@@ -152,32 +154,32 @@
         """Translate the text into the target language"""
         if not text.strip():
             return ""
-
+
         log_block("Translation input", f"{text}")
         start_time = time.perf_counter()
-
+
         result = self._translate_pipe.translate(text, self.source_language, self.target_language)
         translated_text = result.translate_content
-
+
         log_block("Translation time ", f"{(time.perf_counter() - start_time):.3f}", "s")
         log_block("Translation output", f"{translated_text}")
-
+
         return translated_text
-
+
     def _translate_text_large(self, text: str) -> str:
         """Translate the text into the target language"""
         if not text.strip():
             return ""
-
+
         log_block("Translation input", f"{text}")
         start_time = time.perf_counter()
-
+
         result = self._translate_pipe.translate_large(text, self.source_language, self.target_language)
         translated_text = result.translate_content
-
+
         log_block("Translation large model time ", f"{(time.perf_counter() - start_time):.3f}", "s")
         log_block("Translation large model output", f"{translated_text}")
-
+
         return translated_text
 
 
@@ -189,13 +191,13 @@
             if self.exit:
                 logger.info("Exiting transcription thread")
                 break
-
+
             # wait for audio data
            if self.frames_np is None:
                 time.sleep(0.2)
                 logger.info("Waiting for audio data...")
                 continue
-
+
             # fetch an audio chunk for processing
             audio_buffer = self._get_audio_for_processing()
             if audio_buffer is None:
@@ -204,21 +206,21 @@
 
             # c+= 1
             # save_to_wave(f"dev-{c}.wav", audio_buffer)
-
+
             # try:
             segments = self._transcribe_audio(audio_buffer)
-
+
             # process transcription results and send them to the client
             for result in self._process_transcription_results(segments, audio_buffer):
                 self._send_result_to_client(result)
-
+
             # except Exception as e:
             #     logger.error(f"Error processing audio: {e}")
 
     def _process_transcription_results(self, segments: List[TranscriptToken], audio_buffer: np.ndarray) -> Iterator[TransResult]:
         """
         Process transcription results and yield translation results
-
+
         Returns:
             an iterator of TransResult objects
         """
@@ -234,7 +236,7 @@
                 translated_context = self._translate_text(ana_result.context)
             else:
                 translated_context = self._translate_text_large(ana_result.context)
-
+
             yield TransResult(
                 seg_id=ana_result.seg_id,
                 context=ana_result.context,
@@ -263,4 +265,4 @@
         """Stop all processing threads and clean up resources"""
         self._translate_thread_stop.set()
         self._frame_processing_thread_stop.set()
-        logger.info(f"Stopping transcription service for client: {self.client_uid}")
+        logger.info(f"Stopping transcription service for client: {self.client_uid}")
 
transcribe/whispercpp_serve.py CHANGED
@@ -243,7 +243,7 @@ class ServeClientWhisperCPP(ServeClientBase):
         """
         Instantiates a new model, sets it as the transcriber and does warmup if desired.
         """
-
+
         self.transcriber = Model(model=config.WHISPER_MODEL, models_dir=config.MODEL_DIR)
         if warmup:
             self.warmup()
@@ -301,8 +301,8 @@
         if self.language == "zh":
             prompt = '以下是简体中文普通话的句子。'
         else:
-            prompt = 'The following is an English sentence.'
-
+            prompt = ''
+
         segments = self.transcriber.transcribe(
             mel,
             language=self.language,
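The second hunk replaces the English priming sentence with an empty string, so only Chinese audio still receives an initial prompt. A small illustrative helper capturing that selection (not code from the repository); the Chinese literal is kept verbatim because it is the functional prompt, and it roughly means "The following are sentences in Simplified Chinese Mandarin."

```python
def initial_prompt(language: str) -> str:
    """Pick the priming prompt passed to the transcriber; only Chinese keeps one after this commit."""
    if language == "zh":
        # Functional Chinese priming sentence kept from the original code.
        return '以下是简体中文普通话的句子。'
    # Other languages now use an empty prompt instead of an English sentence.
    return ''
```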