daihui.zhang committed
Commit · ca5d527
1 Parent(s): 484b9cf

update to vad streaming

Browse files:
- transcribe/helpers/vadprocessor.py (+262, -1)
- transcribe/pipelines/base.py (+1, -0)
- transcribe/pipelines/pipe_vad.py (+72, -18)
- transcribe/translatepipes.py (+3, -3)
- transcribe/whisper_llm_serve.py (+93, -58)
transcribe/helpers/vadprocessor.py
CHANGED
@@ -2,10 +2,15 @@ from copy import deepcopy
 from queue import Queue, Empty
 from time import time
 from config import VAD_MODEL_PATH
-
+from silero_vad import load_silero_vad
 import numpy as np
 import onnxruntime
+import logging
+from datetime import timedelta
+import gc
+from pydub import AudioSegment

+
 class OnnxWrapper():

     def __init__(self, path, force_onnx_cpu=False):
@@ -178,6 +183,33 @@ class VADIteratorOnnx:



+
+class FixedVADIterator(VADIteratorOnnx):
+    '''It fixes VADIterator by allowing to process any audio length, not only exactly 512 frames at once.
+    If audio to be processed at once is long and multiple voiced segments detected,
+    then __call__ returns the start of the first segment, and end (or middle, which means no end) of the last segment.
+    '''
+
+    def reset_states(self):
+        super().reset_states()
+        self.buffer = np.array([],dtype=np.float32)
+
+    def __call__(self, x, return_seconds=False):
+        self.buffer = np.append(self.buffer, x)
+        ret = None
+        while len(self.buffer) >= 512:
+            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
+            self.buffer = self.buffer[512:]
+            if ret is None:
+                ret = r
+            elif r is not None:
+                if 'end' in r:
+                    ret['end'] = r['end']  # the latter end
+                if 'start' in r and 'end' in ret:  # there is an earlier start.
+                    # Remove end, merging this segment with the previous one.
+                    del ret['end']
+        return ret if ret != {} else None
+
 class VadV2:
     def __init__(self,
                  threshold: float = 0.5,
@@ -269,6 +301,235 @@ class VadV2:
         return None


+class SileroVADProcessor:
+    """
+    A class for processing audio files using Silero VAD to detect voice activity
+    and extract voice segments from audio files.
+    """
+
+    def __init__(self,
+                 activate_threshold=0.5,
+                 fusion_threshold=0.3,
+                 min_speech_duration=0.25,
+                 max_speech_duration=20,
+                 min_silence_duration=250,
+                 sample_rate=16000,
+                 ort_providers=None):
+        """
+        Initialize the SileroVADProcessor.
+        Args:
+            activate_threshold (float): Threshold for voice activity detection
+            fusion_threshold (float): Threshold for merging close speech segments (seconds)
+            min_speech_duration (float): Minimum duration of speech to be considered valid (seconds)
+            max_speech_duration (float): Maximum duration of speech (seconds)
+            min_silence_duration (int): Minimum silence duration (ms)
+            sample_rate (int): Sample rate of the audio (8000 or 16000 Hz)
+            ort_providers (list): ONNX Runtime providers for acceleration
+        """
+        # VAD parameters
+        self.activate_threshold = activate_threshold
+        self.fusion_threshold = fusion_threshold
+        self.min_speech_duration = min_speech_duration
+        self.max_speech_duration = max_speech_duration
+        self.min_silence_duration = min_silence_duration
+        self.sample_rate = sample_rate
+        self.ort_providers = ort_providers if ort_providers else []
+
+        # Initialize logger
+        self.logger = logging.getLogger(__name__)
+
+        # Load Silero VAD model
+        self._init_onnx_session()
+        self.silero_vad = load_silero_vad(onnx=True)
+
+    def _init_onnx_session(self):
+        """Initialize ONNX Runtime session with appropriate settings."""
+        session_opts = onnxruntime.SessionOptions()
+        session_opts.log_severity_level = 3
+        session_opts.inter_op_num_threads = 0
+        session_opts.intra_op_num_threads = 0
+        session_opts.enable_cpu_mem_arena = True
+        session_opts.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
+        session_opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        session_opts.add_session_config_entry("session.intra_op.allow_spinning", "1")
+        session_opts.add_session_config_entry("session.inter_op.allow_spinning", "1")
+        session_opts.add_session_config_entry("session.set_denormal_as_zero", "1")
+
+        # Set the session_opts to be used by silero_vad
+        # onnxruntime.capi._pybind_state.get_default_session_options(session_opts)
+
+    def load_audio(self, audio_path):
+        """
+        Load audio file and prepare it for VAD processing.
+        Args:
+            audio_path (str): Path to the audio file
+        Returns:
+            numpy.ndarray: Audio data as numpy array
+        """
+        self.logger.info(f"Loading audio from {audio_path}")
+        audio_segment = AudioSegment.from_file(audio_path)
+        audio_segment = audio_segment.set_channels(1).set_frame_rate(self.sample_rate)
+
+        # Convert to numpy array and normalize
+        dtype = np.float16 if self.use_gpu_fp16 else np.float32
+        audio_array = np.array(audio_segment.get_array_of_samples(), dtype=dtype) * 0.000030517578  # 1/32768
+
+        self.audio_segment = audio_segment  # Store for later use
+        return audio_array
+
+    @property
+    def model(self):
+        return self.silero_vad
+
+    def process_timestamps(self, timestamps):
+        """
+        Process VAD timestamps: filter short segments and merge close segments.
+        Args:
+            timestamps (list): List of (start, end) tuples
+        Returns:
+            list: Processed list of (start, end) tuples
+        """
+        # Filter out short durations
+        filtered_timestamps = [(start, end) for start, end in timestamps
+                               if (end - start) >= self.min_speech_duration]
+
+        # Fuse timestamps in two passes for better merging
+        fused_timestamps_1st = []
+        for start, end in filtered_timestamps:
+            if fused_timestamps_1st and (start - fused_timestamps_1st[-1][1] <= self.fusion_threshold):
+                fused_timestamps_1st[-1] = (fused_timestamps_1st[-1][0], end)
+            else:
+                fused_timestamps_1st.append((start, end))
+
+        fused_timestamps_2nd = []
+        for start, end in fused_timestamps_1st:
+            if fused_timestamps_2nd and (start - fused_timestamps_2nd[-1][1] <= self.fusion_threshold):
+                fused_timestamps_2nd[-1] = (fused_timestamps_2nd[-1][0], end)
+            else:
+                fused_timestamps_2nd.append((start, end))
+
+        return fused_timestamps_2nd
+
+    def format_time(self, seconds):
+        """
+        Convert seconds to VTT time format 'hh:mm:ss.mmm'.
+        Args:
+            seconds (float): Time in seconds
+        Returns:
+            str: Formatted time string
+        """
+        td = timedelta(seconds=seconds)
+        td_sec = td.total_seconds()
+        total_seconds = int(td_sec)
+        milliseconds = int((td_sec - total_seconds) * 1000)
+        hours = total_seconds // 3600
+        minutes = (total_seconds % 3600) // 60
+        seconds = total_seconds % 60
+        return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
+
+    def detect_speech(self, audio:np.array):
+        """
+        Run VAD on the audio file to detect speech segments.
+        Args:
+            audio_path (str): Path to the audio file
+        Returns:
+            list: List of processed timestamps as (start, end) tuples
+        """
+        self.logger.info("Starting VAD process")
+        start_time = time.time()
+        # Get speech timestamps
+        raw_timestamps = get_speech_timestamps(
+            audio,
+            model=self.silero_vad,
+            threshold=self.activate_threshold,
+            max_speech_duration_s=self.max_speech_duration,
+            min_speech_duration_ms=int(self.min_speech_duration * 1000),
+            min_silence_duration_ms=self.min_silence_duration,
+            return_seconds=True
+        )
+
+        # Convert to simple format and process
+        timestamps = [(item['start'], item['end']) for item in raw_timestamps]
+        processed_timestamps = self.process_timestamps(timestamps)
+
+        # Clean up
+        del audio
+        gc.collect()
+
+        self.logger.info(f"VAD completed in {time.time() - start_time:.3f} seconds")
+        return processed_timestamps
+
+        """
+        Save timestamps in both second and sample indices formats.
+        Args:
+            timestamps (list): List of (start, end) tuples
+            output_prefix (str): Prefix for output files
+        """
+        # Save timestamps in seconds (VTT format)
+        seconds_path = f"{output_prefix}_timestamps_second.txt"
+        with open(seconds_path, "w", encoding='UTF-8') as file:
+            self.logger.info("Saving timestamps in seconds format")
+            for start, end in timestamps:
+                s_time = self.format_time(start)
+                e_time = self.format_time(end)
+                line = f"{s_time} --> {e_time}\n"
+                file.write(line)
+
+        # Save timestamps in sample indices
+        indices_path = f"{output_prefix}_timestamps_indices.txt"
+        with open(indices_path, "w", encoding='UTF-8') as file:
+            self.logger.info("Saving timestamps in indices format")
+            for start, end in timestamps:
+                line = f"{int(start * self.sample_rate)} --> {int(end * self.sample_rate)}\n"
+                file.write(line)
+
+        self.logger.info(f"Timestamps saved to {seconds_path} and {indices_path}")
+
+    def extract_speech_segments(self, audio_segment, timestamps):
+        """
+        Extract speech segments from the audio and combine them into a single audio file.
+        Args:
+            timestamps (list): List of (start, end) tuples indicating speech segments
+        Returns:
+            AudioSegment: The combined speech segments
+        """
+        audio_segment = audio_segment.numpy()
+        combined_speech = np.array([], dtype=np.float32)
+
+        # Extract and combine each speech segment
+        for i, (start, end) in enumerate(timestamps):
+            # Convert seconds to milliseconds for pydub
+            start_ms = int(start * 1000)
+            end_ms = int(end * 1000)
+
+            # Ensure the end time does not exceed the length of the audio segment
+            if end_ms > len(audio_segment):
+                end_ms = len(audio_segment)
+
+            # Extract the segment
+            segment = audio_segment[start_ms:end_ms]
+
+            # Add to combined audio
+            combined_speech = np.append(combined_speech, segment)
+
+        return combined_speech
+
+    def process_audio(self, audio_array:np.array):
+        """
+        Complete processing pipeline: detect speech, save timestamps, and optionally extract speech.
+        Returns:
+            tuple: (timestamps, output_speech_path if extract_speech else None)
+        """
+
+        # Run VAD to detect speech
+        timestamps = self.detect_speech(audio_array)
+
+        combined_speech = self.extract_speech_segments(audio_array, timestamps)
+
+        return timestamps, combined_speech
+
+

 class VadProcessor:
     def __init__(
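The new FixedVADIterator wraps VADIteratorOnnx so callers can push chunks of any length: samples accumulate in self.buffer, only exact 512-sample windows are forwarded to the underlying model, and start/end events from several windows are merged into a single dict. The streaming path in this commit uses FixedVADIterator rather than SileroVADProcessor, which as committed still references names it does not define or import here (self.use_gpu_fp16, get_speech_timestamps, time.time() despite "from time import time", and the timestamp-saving block after detect_speech has lost its def line). A minimal usage sketch, not part of the commit; the constructor arguments mirror the ones VadPipe.init() uses and the audio is synthetic:

import numpy as np
from transcribe.helpers.vadprocessor import FixedVADIterator

vad = FixedVADIterator(
    threshold=0.3,
    sampling_rate=16000,
    min_silence_duration_ms=100,
    max_speech_duration_s=15,
)
vad.reset_states()

offset = 0
# chunk sizes are deliberately not multiples of 512
for n in (160, 480, 1024, 700):
    chunk = (np.random.randn(n) * 0.01).astype(np.float32)
    event = vad(chunk, return_seconds=False)  # None, {'start': s}, {'end': e}, or both
    if event:
        # events carry absolute sample indices; subtracting the running offset gives
        # chunk-relative indices, which is what VadPipe._process_speech_chunk() does
        print({k: v - offset for k, v in event.items()})
    offset += n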
transcribe/pipelines/base.py
CHANGED
@@ -22,6 +22,7 @@ class MetaItem:
     translate_content: str = ''
     source_language: str = 'zh'
     destination_language: str = 'en'
+    speech_status: str = 'END' # "END", "START"


 class BasePipe(Process):
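The new speech_status field is how VadPipe tells downstream stages whether the chunk it just emitted leaves a speech segment open ("START") or closes it ("END"). A tiny sketch of the check a consumer performs, with the field semantics taken from pipe_vad.py and whisper_llm_serve.py in this same commit:

from transcribe.pipelines.base import MetaItem

def segment_closed(item: MetaItem) -> bool:
    """Return True when the VAD marked this chunk as closing a speech segment."""
    # VadPipe.process() sets 'START' while a segment is open and 'END' once it closes;
    # downstream, _frame_processing_loop() flushes its buffer on 'END'.
    return item.speech_status == 'END'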
transcribe/pipelines/pipe_vad.py
CHANGED
@@ -1,6 +1,7 @@

 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import
+from ..helpers.vadprocessor import FixedVADIterator, SileroVADProcessor
+
 import numpy as np
 from silero_vad import get_speech_timestamps
 from typing import List
@@ -12,30 +13,83 @@ import logging
 class VadPipe(BasePipe):
     vac = None
     sample_rate = 16000
-    window_size_samples = 512
-    chunk_size = 512
-    prob_threshold=0.5,
-    silence_s=0.5,
-    cache_s=0.25,
-

+    def __init__(self, in_queue=None, out_queue=None) -> None:
+        super().__init__(in_queue, out_queue)
+        self._offset = 0 # offset of processed frames
+        self._status = 'END'
+

+    def reset(self):
+        self._offset = 0
+        self._status = 'END'
+
     @classmethod
     def init(cls):
         if cls.vac is None:
-            cls.vac =
+            cls.vac = FixedVADIterator(
+                threshold=0.3,
+                sampling_rate=cls.sample_rate,
+                # speech_pad_ms=10
+                min_silence_duration_ms = 100,
+                # speech_pad_ms = 30,
+                max_speech_duration_s=15
+            )
+            cls.vac.reset_states()
+
+
+    # def reduce_noise(self, data):
+    #     return nr.reduce_noise(y=data, sr=self.sample_rate)
+
+    def _process_speech_chunk(self, source_audio:np.ndarray):
+        speech_dict = self.vac(source_audio, return_seconds=False)
+        if speech_dict:
+            relative_start_frame = None
+            relative_end_frame = None
+            start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
+            if start_frame:
+                relative_start_frame = start_frame - self._offset
+            if end_frame:
+                relative_end_frame = end_frame - self._offset
+            return relative_start_frame, relative_end_frame

     def process(self, in_data: MetaItem) -> MetaItem:
-
-
-
-
+        if self._offset == 0:
+            self.vac.reset_states()
+        # silence_audio_100ms = np.zeros(int(0.1*self.sample_rate))
+        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
+        speech_data = self._process_speech_chunk(source_audio)
+
+        if speech_data: # a speech boundary appeared in this chunk
+            rel_start_frame, rel_end_frame = speech_data
+            if rel_start_frame and not rel_end_frame:
+                self._status = "START" # speech started
+                target_audio = source_audio[rel_start_frame:]
+                logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
+            elif not rel_start_frame and rel_end_frame:
+                self._status = "END" # speech ended
+                target_audio = source_audio[:rel_end_frame]
+                logging.debug(" 🫷Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
+            elif rel_start_frame and rel_end_frame:
+                self._status = 'END'
+                target_audio = source_audio[rel_start_frame:rel_end_frame]
+                logging.debug(" 🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
+            else:
+                self._status = 'END'
+                target_audio = np.array([],dtype=np.float32)
+                # logging.debug("❌ No valid speech segment detected, setting status to END")
         else:
-
-
-
+            if self._status == 'START':
+                target_audio = source_audio
+                # logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
+            else: # end
+                target_audio = np.array([],dtype=np.float32)
+                # self._status = 'END'
+                # logging.debug("❌ No speech detected, setting status to END")

-    # def reduce_noise(self, data):
-    #     return nr.reduce_noise(y=data, sr=self.sample_rate)
+        self._offset += len(source_audio)

-
+        in_data.audio = target_audio.tobytes()
+        in_data.source_audio = b''
+        in_data.speech_status = self._status
+        return in_data
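VadPipe.process() keeps a running sample offset so the absolute indices returned by FixedVADIterator can be converted into indices inside the current chunk, and then slices the chunk according to which boundary fell inside it. A standalone sketch of that branch logic (not the pipeline code; it deliberately uses "is not None" checks, whereas the committed truthiness tests would treat a start at frame 0 as no start):

import numpy as np

def slice_speech(chunk: np.ndarray, rel_start, rel_end, status: str):
    """Mirror of the per-chunk slicing in VadPipe.process().

    rel_start / rel_end are chunk-relative sample indices (or None), as computed
    by _process_speech_chunk(); status is the previous speech_status.
    Returns (speech_audio, new_status).
    """
    empty = np.array([], dtype=np.float32)
    if rel_start is not None or rel_end is not None:
        if rel_start is not None and rel_end is None:
            return chunk[rel_start:], "START"        # speech opens and keeps going
        if rel_start is None and rel_end is not None:
            return chunk[:rel_end], "END"            # an open segment closes here
        return chunk[rel_start:rel_end], "END"       # a whole segment inside one chunk
    # no boundary reported for this chunk
    if status == "START":
        return chunk, "START"                        # stream the middle of an open segment
    return empty, "END"                              # silence between segments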
transcribe/translatepipes.py
CHANGED
@@ -1,4 +1,4 @@
-from transcribe.pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe
+from transcribe.pipelines import WhisperPipe, MetaItem, WhisperChinese, Translate7BPipe, FunASRPipe, VadPipe


 class TranslatePipes:
@@ -9,7 +9,7 @@ class TranslatePipes:
         self._process = []
         # whisper transcription
         self._whisper_pipe_en = self._launch_process(WhisperPipe())
-        self._whisper_pipe_zh = self._launch_process(WhisperChinese())
+        # self._whisper_pipe_zh = self._launch_process(WhisperChinese())
         self._funasr_pipe = self._launch_process(FunASRPipe())

         # LLM translation
@@ -17,7 +17,7 @@ class TranslatePipes:

         self._translate_7b_pipe = self._launch_process(Translate7BPipe())
         # vad
-
+        self._vad_pipe = self._launch_process(VadPipe())

     # def reset(self):
     #     self._vad_pipe.reset()
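whisper_llm_serve.py calls self._translate_pipe.voice_detect(...) and reads .audio and .speech_status off the result, but voice_detect itself is not part of this diff. A hypothetical sketch of a method on TranslatePipes that would fit the pattern of the other pipes; the in_queue / out_queue attribute names are assumptions, not confirmed by the source:

from transcribe.pipelines import MetaItem

def voice_detect(self, audio_bytes: bytes) -> MetaItem:
    # Hypothetical: push one raw-PCM chunk through the VAD pipe and wait for its result.
    item = MetaItem(source_audio=audio_bytes)   # source_audio field assumed from pipe_vad.py usage
    self._vad_pipe.in_queue.put(item)           # assumed queue attribute names
    return self._vad_pipe.out_queue.get()       # MetaItem with .audio and .speech_status set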
transcribe/whisper_llm_serve.py
CHANGED
@@ -8,14 +8,14 @@ from typing import List, Optional, Iterator, Tuple, Any
 import asyncio
 import numpy as np
 import config
-
+import collections
 from api_model import TransResult, Message, DebugResult

 from .utils import log_block, save_to_wave, TestDataWriter
 from .translatepipes import TranslatePipes
 from .strategy import (
     TranscriptStabilityAnalyzer, TranscriptToken)
-from transcribe.helpers.vadprocessor import VadProcessor
+# from transcribe.helpers.vadprocessor import VadProcessor
 from transcribe.pipelines import MetaItem

 logger = getLogger("TranscriptionService")
@@ -43,13 +43,18 @@ class WhisperTranscriptionService:
         self.sample_rate = 16000

         self.lock = threading.Lock()
-
-        self._vad_frame_queue = queue.Queue()
+

         # Text separator, set according to the language
         self.text_separator = self._get_text_separator(language)
         self.loop = asyncio.get_event_loop()
         # Send ready status
+        # Raw audio queue
+        self._frame_queue = queue.Queue()
+        # Audio queue buffer
+        self.frames_np = None
+        # Completed audio queue
+        self.segments_queue = collections.deque()

         self._transcrible_analysis = None
         # Start processing threads
@@ -58,10 +63,10 @@ class WhisperTranscriptionService:

         self.translate_thread = self._start_thread(self._transcription_processing_loop)
         self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
-        if language == "zh":
-            self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
-        else:
-            self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
+        # if language == "zh":
+        #     self._vad = VadProcessor(prob_threshold=0.8, silence_s=0.2, cache_s=0.15)
+        # else:
+        #     self._vad = VadProcessor(prob_threshold=0.7, silence_s=0.2, cache_s=0.15)
         self.row_number = 0
         # for test
         self._transcrible_time_cost = 0.
@@ -111,24 +116,94 @@ class WhisperTranscriptionService:
         """Add an audio frame to the processing queue"""
         self._frame_queue.put(frame_np)

+    def _apply_voice_activity_detection(self, frame_np:np.array):
+        """Apply voice activity detection to refine the audio buffer"""
+        processed_audio = self._translate_pipe.voice_detect(frame_np.tobytes())
+        speech_audio = np.frombuffer(processed_audio.audio, dtype=np.float32)
+        speech_status = processed_audio.speech_status
+        return speech_audio, speech_status
+
     def _frame_processing_loop(self) -> None:
         """Fetch audio frames from the queue and merge them into the buffer"""
         while not self._frame_processing_thread_stop.is_set():
             try:
-
-
-
-
-
-
-
-
-
-                self.
+                frame_np = self._frame_queue.get(timeout=0.1)
+                frame_np, speech_status = self._apply_voice_activity_detection(frame_np)
+                if frame_np is None:
+                    continue
+                with self.lock:
+                    if self.frames_np is None:
+                        self.frames_np = frame_np.copy()
+                    else:
+                        self.frames_np = np.append(self.frames_np, frame_np)
+                    if speech_status == "END" and len(self.frames_np) > 0:
+                        self.segments_queue.appendleft(self.frames_np.copy())
+                        self.frames_np = np.array([], dtype=np.float32)
             except queue.Empty:
                 pass

+    def _process_transcription_results_2(self, segments: List[TranscriptToken],partial):
+        seg_text = self.text_separator.join(seg.text for seg in segments)
+        item = TransResult(
+            seg_id=self.row_number,
+            context=seg_text,
+            from_=self.source_language,
+            to=self.target_language,
+            tran_content=self._translate_text_large(seg_text),
+            partial=partial
+        )
+        if partial == False:
+            self.row_number += 1
+        return item
+
+    def _transcription_processing_loop(self) -> None:
+        """Main transcription processing loop"""
+        frame_epoch = 1
+        while not self._translate_thread_stop.is_set():
+
+            if self.frames_np is None:
+                time.sleep(0.2)
+                continue
+
+            with self.lock:
+                if len(self.segments_queue) >0:
+                    audio_buffer = self.segments_queue.pop()
+                    partial = False
+                else:
+                    audio_buffer = self.frames_np[:int(frame_epoch * 1.5 * self.sample_rate)]  # take 1.5s * epoch worth of audio
+                    partial = True
+
+            if len(audio_buffer) ==0:
+                time.sleep(0.2)
+                continue
+
+            if len(audio_buffer) < int(self.sample_rate):
+                silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
+                silence_audio[-len(audio_buffer):] = audio_buffer
+                audio_buffer = silence_audio
+
+            logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
+            # try:
+            meta_item = self._transcribe_audio(audio_buffer)
+            segments = meta_item.segments
+            logger.debug(f"Segments: {segments}")
+            if len(segments):
+                result = self._process_transcription_results_2(segments, partial)
+                self._send_result_to_client(result)
+            time.sleep(0.1)
+
+            if partial == False:
+                frame_epoch = 1
+            else:
+                frame_epoch += 1
+            # Process transcription results and send them to the client
+            # for result in self._process_transcription_results(segments, audio_buffer):
+            #     self._send_result_to_client(result)
+
+            # except Exception as e:
+            #     logger.error(f"Error processing audio: {e}")

+
     def _transcribe_audio(self, audio_buffer: np.ndarray)->MetaItem:
         """Transcribe audio and return the transcribed segments"""
         log_block("Audio buffer length", f"{audio_buffer.shape[0]/self.sample_rate:.2f}", "s")
@@ -176,47 +251,7 @@
         self._translate_time_cost = round(time_diff, 3)
         return translated_text

-    def _transcription_processing_loop(self) -> None:
-        """Main transcription processing loop"""
-
-        while not self._translate_thread_stop.is_set():
-            audio_buffer = self._vad_frame_queue.get()
-            if audio_buffer is None:
-                time.sleep(0.2)
-                continue
-            if len(audio_buffer) < int(self.sample_rate):
-                silence_audio = np.zeros(self.sample_rate, dtype=np.float32)
-                silence_audio[-len(audio_buffer):] = audio_buffer
-                audio_buffer = silence_audio

-            logger.debug(f"audio buffer size: {len(audio_buffer) / self.sample_rate:.2f}s")
-            # try:
-            meta_item = self._transcribe_audio(audio_buffer)
-            segments = meta_item.segments
-            logger.debug(f"Segments: {segments}")
-            if len(segments):
-                result = self._process_transcription_results_2(segments)
-                self._send_result_to_client(result)
-            time.sleep(0.1)
-            # Process transcription results and send them to the client
-            # for result in self._process_transcription_results(segments, audio_buffer):
-            #     self._send_result_to_client(result)
-
-            # except Exception as e:
-            #     logger.error(f"Error processing audio: {e}")
-
-    def _process_transcription_results_2(self, segments: List[TranscriptToken],):
-        seg_text = self.text_separator.join(seg.text for seg in segments)
-        item = TransResult(
-            seg_id=self.row_number,
-            context=seg_text,
-            from_=self.source_language,
-            to=self.target_language,
-            tran_content=self._translate_text_large(seg_text),
-            partial=False
-        )
-        self.row_number += 1
-        return item

     def _process_transcription_results(self, segments: List[TranscriptToken], audio_buffer: np.ndarray) -> Iterator[TransResult]:
         """
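Taken together, the two loops now implement a simple producer/consumer scheme: VAD-gated audio accumulates in frames_np, the open segment is transcribed as "partial" from a window that grows by roughly 1.5 s per epoch, and whenever the VAD reports END the whole buffer moves into segments_queue and is transcribed once more as a final (non-partial) result that advances row_number. A toy, single-threaded sketch of that bookkeeping (locking, zero-padding of very short buffers, and the actual transcription and translation calls are omitted):

import collections
import numpy as np

SAMPLE_RATE = 16000

frames = np.array([], dtype=np.float32)   # open, still-growing segment (frames_np)
finished = collections.deque()            # closed segments awaiting a final pass (segments_queue)
frame_epoch = 1

def on_vad_output(speech_audio: np.ndarray, speech_status: str) -> None:
    """Producer side, mirroring _frame_processing_loop()."""
    global frames
    frames = np.append(frames, speech_audio)
    if speech_status == "END" and len(frames) > 0:
        finished.appendleft(frames.copy())
        frames = np.array([], dtype=np.float32)

def next_transcription_job():
    """Consumer side, mirroring the buffer selection in _transcription_processing_loop().

    Returns (audio, partial): partial=True for the growing window of the open segment,
    partial=False exactly once per closed segment.
    """
    global frame_epoch
    if finished:
        audio, partial = finished.pop(), False
        frame_epoch = 1                     # reset the window after a final result
    else:
        audio = frames[: int(frame_epoch * 1.5 * SAMPLE_RATE)]
        partial = True
        frame_epoch += 1                    # widen the partial window next time
    return audio, partial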