Xin Zhang committed · Commit 8be1cbc · 1 Parent(s): 4ce78ee

[feature]: refactor vad model path.

Browse files
- config.py +3 -0
- moyoyo_asr_models/silero-vad/silero_vad.onnx +3 -0
- transcribe/vad.py +6 -2
- transcribe/whisper_llm_serve.py +31 -29
- transcribe/whispercpp_serve.py +3 -3
config.py CHANGED
@@ -45,6 +45,9 @@ WHISPER_MODEL = 'large-v3-turbo-q5_0'
 LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
 LLM_LARGE_MODEL_PATH = (MODEL_DIR / "qwen2.5-7b-instruct-q5_0-00001-of-00002.gguf").as_posix()
 
+# VAD
+VAD_MODEL_PATH = (MODEL_DIR / "silero-vad" / "silero_vad.onnx").as_posix()
+
 LLM_SYS_PROMPT = """"You are a professional {src_lang} to {dst_lang} translator, not a conversation agent. Your only task is to take {src_lang} input and translate it into accurate, natural {dst_lang}. If you cannot understand the input, just output the original input. Please strictly abide by the following rules: "
 "No matter what the user asks, never answer questions, you only provide translation results. "
 "Do not actively initiate dialogue or lead users to ask questions. "
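For context on the new constant: it composes with the existing MODEL_DIR the same way the LLM paths do. Below is a minimal, self-contained sketch of how a consumer might resolve and sanity-check it; the MODEL_DIR value is a stand-in inferred from the repository layout (the real definition sits earlier in config.py and is not shown in this hunk):

from pathlib import Path

# Stand-in for the MODEL_DIR defined earlier in config.py (assumed value,
# inferred from the path of the ONNX file added in this commit).
MODEL_DIR = Path("moyoyo_asr_models")

# Mirrors the constant added in this commit: a POSIX-style string path.
VAD_MODEL_PATH = (MODEL_DIR / "silero-vad" / "silero_vad.onnx").as_posix()

if __name__ == "__main__":
    # Fail fast if the model has not been fetched (e.g. via git lfs pull).
    if not Path(VAD_MODEL_PATH).exists():
        raise FileNotFoundError(f"Model file not found at {VAD_MODEL_PATH}. Please download the model.")
    print(VAD_MODEL_PATH)  # moyoyo_asr_models/silero-vad/silero_vad.onnx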
moyoyo_asr_models/silero-vad/silero_vad.onnx ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2623a2953f6ff3d2c1e61740c6cdb7168133479b267dfef114a4a3cc5bdd788f
+size 2327524
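Note that the three lines above are a git-lfs pointer, not the model itself; the real ~2.3 MB ONNX file is fetched by git lfs pull. A small sketch for verifying a local copy against the pointer's oid and size:

import hashlib
from pathlib import Path

# Values copied from the pointer file above.
EXPECTED_OID = "2623a2953f6ff3d2c1e61740c6cdb7168133479b267dfef114a4a3cc5bdd788f"
EXPECTED_SIZE = 2327524  # bytes

path = Path("moyoyo_asr_models/silero-vad/silero_vad.onnx")
data = path.read_bytes()

# git-lfs records the SHA-256 of the file contents as the oid.
assert len(data) == EXPECTED_SIZE, f"size mismatch: {len(data)} bytes"
assert hashlib.sha256(data).hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("silero_vad.onnx matches its LFS pointer")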
transcribe/vad.py CHANGED
@@ -5,12 +5,16 @@ import warnings
 import numpy as np
 import onnxruntime
 import torch
-
+import logging
+from config import VAD_MODEL_PATH
 
 class VoiceActivityDetection():
 
     def __init__(self, force_onnx_cpu=True):
-        path = self.download()
+        # path = self.download()
+        path = VAD_MODEL_PATH
+        if not os.path.exists(path):
+            raise FileNotFoundError(f"Model file not found at {path}. Please download the model.")
 
         opts = onnxruntime.SessionOptions()
         opts.log_severity_level = 3
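The constructor now loads from the pinned local path instead of calling self.download(). A sketch of how the session setup plausibly continues is below; the provider-selection lines are an assumption modeled on common onnxruntime usage, not code from this hunk (note also that the new os.path.exists check relies on vad.py already importing os above the lines shown):

import os

import onnxruntime

from config import VAD_MODEL_PATH


class VoiceActivityDetection:
    def __init__(self, force_onnx_cpu=True):
        # This commit: load from the pinned local path instead of downloading.
        path = VAD_MODEL_PATH
        if not os.path.exists(path):
            raise FileNotFoundError(f"Model file not found at {path}. Please download the model.")

        opts = onnxruntime.SessionOptions()
        opts.log_severity_level = 3  # errors only

        # Assumed continuation (not shown in the hunk): pin inference to CPU
        # when requested, as the silero-vad reference loader does.
        providers = ["CPUExecutionProvider"] if force_onnx_cpu else onnxruntime.get_available_providers()
        self.session = onnxruntime.InferenceSession(path, sess_options=opts, providers=providers)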
transcribe/whisper_llm_serve.py CHANGED
@@ -22,25 +22,25 @@ class WhisperTranscriptionService(ServeClientBase):
     """
     Whisper speech transcription service class: handles audio-stream transcription and translation
     """
-
+
     def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
         super().__init__(client_uid, websocket)
         self.source_language = language  # source language
         self.target_language = dst_lang  # target language for translation
-
+
         # stability management for transcription results
-
+
         self._translate_pipe = pipe
-
+
         # audio processing state
         self.sample_rate = 16000
         self.frames_np = None
         self.lock = threading.Lock()
         self._frame_queue = queue.Queue()
-
+
         # text separator, chosen per language
         self.text_separator = self._get_text_separator(language)
-
+
         # send ready state
         self.send_ready_state()
         self._transcrible_analysis = None
@@ -86,6 +86,8 @@ class WhisperTranscriptionService(ServeClientBase):
         while not self._frame_processing_thread_stop.is_set():
             try:
                 frame_np = self._frame_queue.get(timeout=0.1)
+                if frame_np is None:
+                    logger.error("Received None frame, stopping thread")
                 with self.lock:
                     if self.frames_np is None:
                         self.frames_np = frame_np.copy()
@@ -112,13 +114,13 @@ class WhisperTranscriptionService(ServeClientBase):
         """Prepare an audio chunk for processing."""
         # apply VAD
         self._apply_voice_activity_detection()
-
+
         # no audio frames yet
         if self.frames_np is None:
             return None
-
+
         frames = self.frames_np.copy()
-
+
         # handling for overly short audio
         if len(frames) <= 100:
             # extremely short segment: clear and return None
@@ -129,7 +131,7 @@ class WhisperTranscriptionService(ServeClientBase):
         silence_audio = np.zeros((self.sample_rate + 1000,), dtype=np.float32)
         silence_audio[-len(frames):] = frames
         return silence_audio.copy()
-
+
         return frames.copy()
 
     def _transcribe_audio(self, audio_buffer: np.ndarray) -> List[TranscriptToken]:
@@ -139,10 +141,10 @@ class WhisperTranscriptionService(ServeClientBase):
 
         result = self._translate_pipe.transcrible(audio_buffer.tobytes(), self.source_language)
         segments = result.segments
-
+
         log_block("Whisper transcription output", f"{''.join(seg.text for seg in segments)}", "")
         log_block("Whisper transcription time", f"{(time.perf_counter() - start_time):.3f}", "s")
-
+
         return [
             TranscriptToken(text=s.text, t0=s.t0, t1=s.t1)
             for s in segments
@@ -152,32 +154,32 @@ class WhisperTranscriptionService(ServeClientBase):
         """Translate the text into the target language."""
         if not text.strip():
             return ""
-
+
         log_block("Translation input", f"{text}")
         start_time = time.perf_counter()
-
+
         result = self._translate_pipe.translate(text, self.source_language, self.target_language)
         translated_text = result.translate_content
-
+
         log_block("Translation time ", f"{(time.perf_counter() - start_time):.3f}", "s")
         log_block("Translation output", f"{translated_text}")
-
+
         return translated_text
-
+
     def _translate_text_large(self, text: str) -> str:
         """Translate the text into the target language."""
         if not text.strip():
             return ""
-
+
         log_block("Translation input", f"{text}")
         start_time = time.perf_counter()
-
+
         result = self._translate_pipe.translate_large(text, self.source_language, self.target_language)
         translated_text = result.translate_content
-
+
         log_block("Translation large model time ", f"{(time.perf_counter() - start_time):.3f}", "s")
         log_block("Translation large model output", f"{translated_text}")
-
+
         return translated_text
 
 
@@ -189,13 +191,13 @@ class WhisperTranscriptionService(ServeClientBase):
         if self.exit:
             logger.info("Exiting transcription thread")
             break
-
+
         # wait for audio data
         if self.frames_np is None:
             time.sleep(0.2)
             logger.info("Waiting for audio data...")
             continue
-
+
         # get an audio chunk for processing
         audio_buffer = self._get_audio_for_processing()
         if audio_buffer is None:
@@ -204,21 +206,21 @@ class WhisperTranscriptionService(ServeClientBase):
 
         # c+= 1
         # save_to_wave(f"dev-{c}.wav", audio_buffer)
-
+
         # try:
         segments = self._transcribe_audio(audio_buffer)
-
+
         # process transcription results and send them to the client
         for result in self._process_transcription_results(segments, audio_buffer):
             self._send_result_to_client(result)
-
+
         # except Exception as e:
         #     logger.error(f"Error processing audio: {e}")
 
     def _process_transcription_results(self, segments: List[TranscriptToken], audio_buffer: np.ndarray) -> Iterator[TransResult]:
         """
         Process transcription results and generate translation results
-
+
         Returns:
             An iterator of TransResult objects
         """
@@ -234,7 +236,7 @@ class WhisperTranscriptionService(ServeClientBase):
             translated_context = self._translate_text(ana_result.context)
         else:
             translated_context = self._translate_text_large(ana_result.context)
-
+
         yield TransResult(
             seg_id=ana_result.seg_id,
             context=ana_result.context,
@@ -263,4 +265,4 @@ class WhisperTranscriptionService(ServeClientBase):
         """Stop all processing threads and clean up resources."""
         self._translate_thread_stop.set()
         self._frame_processing_thread_stop.set()
-        logger.info(f"Stopping transcription service for client: {self.client_uid}")
+        logger.info(f"Stopping transcription service for client: {self.client_uid}")
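One caveat about the new None-frame guard in the @@ -86,6 +86,8 @@ hunk: as written it logs "stopping thread" but then falls through to frame_np.copy(), which would raise AttributeError on None. A minimal sketch of the usual sentinel pattern, assuming None is meant as a shutdown signal; the names follow the class above, and the except clause plus the concatenate branch are assumptions about code outside the hunk:

import logging
import queue

import numpy as np

logger = logging.getLogger(__name__)


def _frame_processing_loop(self):
    # Consume frames until the stop event is set or a None sentinel arrives.
    while not self._frame_processing_thread_stop.is_set():
        try:
            frame_np = self._frame_queue.get(timeout=0.1)
        except queue.Empty:  # assumed: the matching except is outside the hunk
            continue
        if frame_np is None:
            logger.error("Received None frame, stopping thread")
            break  # actually stop, instead of falling through to frame_np.copy()
        with self.lock:
            if self.frames_np is None:
                self.frames_np = frame_np.copy()
            else:
                # Assumed append behavior; the real else-branch is not shown above.
                self.frames_np = np.concatenate((self.frames_np, frame_np))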
transcribe/whispercpp_serve.py CHANGED
@@ -243,7 +243,7 @@ class ServeClientWhisperCPP(ServeClientBase):
         """
         Instantiates a new model, sets it as the transcriber and does warmup if desired.
         """
-
+
         self.transcriber = Model(model=config.WHISPER_MODEL, models_dir=config.MODEL_DIR)
         if warmup:
             self.warmup()
@@ -301,8 +301,8 @@ class ServeClientWhisperCPP(ServeClientBase):
         if self.language == "zh":
             prompt = '以下是简体中文普通话的句子。'
         else:
-            prompt = '
-
+            prompt = ''
+
         segments = self.transcriber.transcribe(
             mel,
             language=self.language,