#!/usr/bin/python3
# -*- coding: utf-8 -*-
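"""
Streaming voice activity detection (VAD).

A ``RingVad`` instance feeds fixed-size frames to a pluggable per-frame
classifier (WebRTC, a Silero TorchScript model, or a zip-packaged CC model)
and turns the per-frame speech probabilities into [start, end] speech
segments via a ring-buffer trigger.
"""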
import argparse
import collections
from functools import lru_cache
import os
from pathlib import Path
import shutil
import tempfile
import zipfile
from typing import List
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import wavfile
import torch
import webrtcvad
from project_settings import project_path
from toolbox.torch.utils.data.vocabulary import Vocabulary
class FrameVoiceClassifier(object):
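    """Interface for per-frame classifiers: ``predict`` maps one chunk of int16 samples to a speech probability in [0.0, 1.0]."""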
def predict(self, chunk: np.ndarray) -> float:
raise NotImplementedError
class WebRTCVoiceClassifier(FrameVoiceClassifier):
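    """webrtcvad wrapper; ``agg`` is the aggressiveness mode (0-3) and predictions are hard 0.0 / 1.0 decisions."""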
def __init__(self,
agg: int = 3,
sample_rate: int = 8000
):
self.agg = agg
self.sample_rate = sample_rate
self.model = webrtcvad.Vad(mode=agg)
def predict(self, chunk: np.ndarray) -> float:
if chunk.dtype != np.int16:
raise AssertionError("signal dtype should be np.int16, instead of {}".format(chunk.dtype))
audio_bytes = bytes(chunk)
is_speech = self.model.is_speech(audio_bytes, self.sample_rate)
return 1.0 if is_speech else 0.0
class SileroVoiceClassifier(FrameVoiceClassifier):
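    """Silero VAD TorchScript wrapper; expects 32 ms chunks (256 samples at 8 kHz, 512 samples at 16 kHz)."""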
def __init__(self,
model_path: str,
sample_rate: int = 8000):
self.model_path = model_path
self.sample_rate = sample_rate
with open(self.model_path, "rb") as f:
model = torch.jit.load(f, map_location="cpu")
self.model = model
self.model.reset_states()
def predict(self, chunk: np.ndarray) -> float:
if self.sample_rate / len(chunk) > 31.25:
raise AssertionError("chunk samples number {} is less than {}".format(len(chunk), self.sample_rate / 31.25))
if chunk.dtype != np.int16:
raise AssertionError("signal dtype should be np.int16, instead of {}".format(chunk.dtype))
num_samples = len(chunk)
        if self.sample_rate == 8000 and num_samples != 256:
            raise AssertionError(f"silero vad requires 256-sample (32 ms) chunks at 8000 Hz, got {num_samples}")
        if self.sample_rate == 16000 and num_samples != 512:
            raise AssertionError(f"silero vad requires 512-sample (32 ms) chunks at 16000 Hz, got {num_samples}")
chunk = chunk / 32768
chunk = torch.tensor(chunk, dtype=torch.float32)
speech_prob = self.model(chunk, self.sample_rate).item()
return float(speech_prob)
class CCSoundsClassifier(FrameVoiceClassifier):
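    """Classifier packaged as a zip archive holding a traced TorchScript model and a vocabulary; ``predict`` returns the softmax probability of the "voice" label."""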
def __init__(self,
model_path: str,
sample_rate: int = 8000):
self.model_path = model_path
self.sample_rate = sample_rate
d = self.load_model(Path(model_path))
model = d["model"]
vocabulary = d["vocabulary"]
self.model = model
self.vocabulary = vocabulary
@staticmethod
@lru_cache(maxsize=100)
def load_model(model_file: Path):
with zipfile.ZipFile(model_file, "r") as f_zip:
out_root = Path(tempfile.gettempdir()) / "cc_audio_8"
if out_root.exists():
shutil.rmtree(out_root.as_posix())
out_root.mkdir(parents=True, exist_ok=True)
f_zip.extractall(path=out_root)
tgt_path = out_root / model_file.stem
jit_model_file = tgt_path / "trace_model.zip"
vocab_path = tgt_path / "vocabulary"
vocabulary = Vocabulary.from_files(vocab_path.as_posix())
with open(jit_model_file.as_posix(), "rb") as f:
model = torch.jit.load(f)
model.eval()
shutil.rmtree(tgt_path)
d = {
"model": model,
"vocabulary": vocabulary
}
return d
def predict(self, chunk: np.ndarray) -> float:
if chunk.dtype != np.int16:
raise AssertionError("signal dtype should be np.int16, instead of {}".format(chunk.dtype))
chunk = chunk / (1 << 15)
inputs = torch.tensor(chunk, dtype=torch.float32)
inputs = torch.unsqueeze(inputs, dim=0)
with torch.no_grad():
logits = self.model(inputs)
probs = torch.nn.functional.softmax(logits, dim=-1)
voice_idx = self.vocabulary.get_token_index(token="voice", namespace="labels")
probs = probs.cpu()
voice_prob = probs[0][voice_idx]
return float(voice_prob)
class Frame(object):
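    """One analysis frame: the raw int16 samples and the frame start time in seconds."""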
def __init__(self, signal: np.ndarray, timestamp_s: float):
self.signal = signal
self.timestamp_s = timestamp_s
class RingVad(object):
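    """
    Ring-buffer VAD over a stream of frames.

    While untriggered, scored frames accumulate in a deque of
    ``num_padding_frames`` entries; when the summed speech probability exceeds
    ``start_ring_rate * maxlen`` the collector triggers and starts a voiced
    segment.  While triggered, the segment ends once the summed probability
    drops below ``end_ring_rate * maxlen``.  ``vad_segments_generator`` then
    merges segments separated by short silences and splits or drops runs using
    ``max_speech_length_s`` / ``min_speech_length_s``.
    """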
def __init__(self,
model: FrameVoiceClassifier,
start_ring_rate: float = 0.5,
end_ring_rate: float = 0.5,
frame_size_ms: int = 30,
frame_step_ms: int = 30,
padding_length_ms: int = 300,
max_silence_length_ms: int = 300,
max_speech_length_s: float = 2.0,
min_speech_length_s: float = 0.3,
sample_rate: int = 8000
):
self.model = model
self.start_ring_rate = start_ring_rate
self.end_ring_rate = end_ring_rate
self.frame_size_ms = frame_size_ms
self.frame_step_ms = frame_step_ms
self.padding_length_ms = padding_length_ms
self.max_silence_length_ms = max_silence_length_ms
self.max_speech_length_s = max_speech_length_s
self.min_speech_length_s = min_speech_length_s
self.sample_rate = sample_rate
# frames
self.frame_size = int(sample_rate * (frame_size_ms / 1000.0))
self.frame_step = int(sample_rate * (frame_step_ms / 1000.0))
self.frame_timestamp_s = 0.0
self.signal_cache = np.zeros(shape=(self.frame_size,), dtype=np.int16)
# segments
self.num_padding_frames = int(padding_length_ms / frame_step_ms)
self.ring_buffer = collections.deque(maxlen=self.num_padding_frames)
self.triggered = False
self.voiced_frames: List[Frame] = list()
self.segments = list()
# vad segments
self.is_first_segment = True
self.timestamp_start_s = 0.0
self.timestamp_end_s = 0.0
# speech probs
self.speech_probs: List[float] = list()
def reset(self):
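        """Restore the initial streaming state so the instance can process a new signal."""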
# frames
self.frame_size = int(self.sample_rate * (self.frame_size_ms / 1000.0))
self.frame_step = int(self.sample_rate * (self.frame_step_ms / 1000.0))
self.frame_timestamp_s = 0.0
self.signal_cache = np.zeros(shape=(self.frame_size,), dtype=np.int16)
# segments
self.num_padding_frames = int(self.padding_length_ms / self.frame_step_ms)
self.ring_buffer = collections.deque(maxlen=self.num_padding_frames)
self.triggered = False
self.voiced_frames: List[Frame] = list()
self.segments = list()
# vad segments
self.is_first_segment = True
self.timestamp_start_s = 0.0
self.timestamp_end_s = 0.0
# speech probs
self.speech_probs: List[float] = list()
def signal_to_frames(self, signal: np.ndarray):
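        """Split ``signal`` into frames of ``frame_size`` samples taken every ``frame_step`` samples; the running timestamp advances by ``frame_step / sample_rate`` seconds per frame."""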
frames = list()
l = len(signal)
duration_s = float(self.frame_step) / self.sample_rate
for offset in range(0, l - self.frame_size + 1, self.frame_step):
sub_signal = signal[offset:offset+self.frame_size]
frame = Frame(sub_signal, self.frame_timestamp_s)
self.frame_timestamp_s += duration_s
frames.append(frame)
return frames
def segments_generator(self, signal: np.ndarray):
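        """Consume one chunk of the stream: prepend the cached remainder, frame and score it, and yield [samples, start_s, end_s] voiced segments; samples beyond the last frame boundary are cached for the next call."""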
# signal rounding
if self.signal_cache is not None:
signal = np.concatenate([self.signal_cache, signal])
# rest
rest = (len(signal) - self.frame_size) % self.frame_step
if rest == 0:
self.signal_cache = None
signal_ = signal
else:
self.signal_cache = signal[-rest:]
signal_ = signal[:-rest]
# frames
frames = self.signal_to_frames(signal_)
for frame in frames:
speech_prob = self.model.predict(frame.signal)
self.speech_probs.append(speech_prob)
if not self.triggered:
self.ring_buffer.append((frame, speech_prob))
num_voiced = sum([p for _, p in self.ring_buffer])
if num_voiced > self.start_ring_rate * self.ring_buffer.maxlen:
self.triggered = True
for f, _ in self.ring_buffer:
self.voiced_frames.append(f)
continue
self.voiced_frames.append(frame)
self.ring_buffer.append((frame, speech_prob))
num_voiced = sum([p for _, p in self.ring_buffer])
if num_voiced < self.end_ring_rate * self.ring_buffer.maxlen:
segment = [
np.concatenate([f.signal for f in self.voiced_frames]),
self.voiced_frames[0].timestamp_s,
self.voiced_frames[-1].timestamp_s,
]
yield segment
self.triggered = False
self.ring_buffer.clear()
self.voiced_frames = []
continue
def vad_segments_generator(self, segments_generator):
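        """Turn raw voiced segments into [start_s, end_s] pairs: merge gaps shorter than ``max_silence_length_ms``, split runs longer than ``max_speech_length_s``, and drop runs shorter than ``min_speech_length_s``."""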
segments = list(segments_generator)
for i, segment in enumerate(segments):
start = round(segment[1], 4)
end = round(segment[2], 4)
            if self.is_first_segment:
                # seed the running segment with the first voiced span
                self.is_first_segment = False
                self.timestamp_start_s = start
                self.timestamp_end_s = end
                continue
if self.timestamp_end_s - self.timestamp_start_s > self.max_speech_length_s:
end_ = self.timestamp_start_s + self.max_speech_length_s
vad_segment = [self.timestamp_start_s, end_]
yield vad_segment
self.timestamp_start_s = end_
silence_length_ms = (start - self.timestamp_end_s) * 1000
if silence_length_ms < self.max_silence_length_ms:
self.timestamp_end_s = end
continue
if self.timestamp_end_s - self.timestamp_start_s < self.min_speech_length_s:
self.timestamp_start_s = start
self.timestamp_end_s = end
continue
vad_segment = [self.timestamp_start_s, self.timestamp_end_s]
yield vad_segment
self.timestamp_start_s = start
self.timestamp_end_s = end
def vad(self, signal: np.ndarray) -> List[list]:
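        """Process one chunk of signal and return the [start_s, end_s] segments completed so far."""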
segments = self.segments_generator(signal)
vad_segments = self.vad_segments_generator(segments)
vad_segments = list(vad_segments)
return vad_segments
def last_vad_segments(self) -> List[list]:
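        """Flush at end of stream: wrap any pending voiced frames into a final segment and append the still-open [start_s, end_s] pair if it is non-empty."""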
# last segments
if len(self.voiced_frames) == 0:
segments = []
else:
segment = [
np.concatenate([f.signal for f in self.voiced_frames]),
self.voiced_frames[0].timestamp_s,
self.voiced_frames[-1].timestamp_s
]
segments = [segment]
# last vad segments
vad_segments = self.vad_segments_generator(segments)
vad_segments = list(vad_segments)
        if self.timestamp_end_s > 1e-5:
vad_segments = vad_segments + [[self.timestamp_start_s, self.timestamp_end_s]]
return vad_segments
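# A minimal chunked-streaming sketch (illustrative only; the 100 ms chunk size
# and the WebRTC model choice are assumptions, not requirements of RingVad):
#
#     ring_vad = RingVad(model=WebRTCVoiceClassifier(agg=3, sample_rate=8000), sample_rate=8000)
#     for begin in range(0, len(signal), 800):  # 800 samples = 100 ms at 8 kHz
#         for start_s, end_s in ring_vad.vad(signal[begin:begin + 800]):
#             print(start_s, end_s)
#     for start_s, end_s in ring_vad.last_vad_segments():
#         print(start_s, end_s)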
def process_speech_probs(signal: np.ndarray, speech_probs: List[float], frame_step: int) -> np.ndarray:
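    """Expand per-frame speech probabilities to per-sample values for plotting: repeat each probability (the first one is dropped) ``frame_step`` times, then zero-pad to the signal length."""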
speech_probs_ = list()
for p in speech_probs[1:]:
speech_probs_.extend([p] * frame_step)
pad = (signal.shape[0] - len(speech_probs_))
speech_probs_ = speech_probs_ + [0.0] * pad
speech_probs_ = np.array(speech_probs_, dtype=np.float32)
if len(speech_probs_) != len(signal):
raise AssertionError
return speech_probs_
def make_visualization(signal: np.ndarray, speech_probs, sample_rate: int, vad_segments: list):
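    """Plot the normalized waveform, the per-sample speech probability curve, and dashed vertical lines at each VAD segment start (green) and end (red)."""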
time = np.arange(0, len(signal)) / sample_rate
plt.figure(figsize=(12, 5))
plt.plot(time, signal / 32768, color='b')
plt.plot(time, speech_probs, color='gray')
for start, end in vad_segments:
        plt.axvline(x=start, ymin=0.15, ymax=0.85, color="g", linestyle="--", label="speech start")
        plt.axvline(x=end, ymin=0.15, ymax=0.85, color="r", linestyle="--", label="speech end")
plt.show()
return
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--wav_file",
# default=(project_path / "data/early_media/62/3300999628999191096.wav").as_posix(),
# default=r"D:/Users/tianx/HuggingDatasets/nx_noise/data/speech/nx-speech/en-PH/2025-05-28/active_media_w_1f650e5c-bd22-4803-bb88-d670b00fccda_30.wav",
default=r"D:/Users/tianx/HuggingDatasets/nx_noise/data/speech/en-PH/2025-05-15/active_media_r_0617d225-f396-4011-a86e-eaf68cdda5a8_3.wav",
type=str,
)
parser.add_argument(
"--model_path",
default=(project_path / "trained_models/silero_vad.jit").as_posix(),
type=str,
)
args = parser.parse_args()
return args
SAMPLE_RATE = 8000
def main():
args = get_args()
sample_rate, signal = wavfile.read(args.wav_file)
if SAMPLE_RATE != sample_rate:
raise AssertionError
# model = SileroVoiceClassifier(model_path=args.model_path, sample_rate=SAMPLE_RATE)
model = WebRTCVoiceClassifier(agg=3, sample_rate=SAMPLE_RATE)
    # model = CCSoundsClassifier(model_path=(project_path / "trained_models/cnn_voicemail_common_20231130").as_posix())
    # silero vad configuration (note: overridden by the webrtcvad configuration below)
ring_vad = RingVad(model=model,
start_ring_rate=0.2,
end_ring_rate=0.1,
frame_size_ms=32,
frame_step_ms=32,
padding_length_ms=320,
max_silence_length_ms=320,
max_speech_length_s=100,
min_speech_length_s=0.1,
sample_rate=SAMPLE_RATE,
)
# webrtcvad
ring_vad = RingVad(model=model,
start_ring_rate=0.9,
end_ring_rate=0.1,
frame_size_ms=30,
frame_step_ms=30,
padding_length_ms=30,
max_silence_length_ms=0,
max_speech_length_s=100,
min_speech_length_s=0.1,
sample_rate=SAMPLE_RATE,
)
print(ring_vad)
vad_segments = list()
segments = ring_vad.vad(signal)
vad_segments += segments
for segment in segments:
print(segment)
# last vad segment
segments = ring_vad.last_vad_segments()
vad_segments += segments
for segment in segments:
print(segment)
print(ring_vad.speech_probs)
print(len(ring_vad.speech_probs))
# speech_probs
speech_probs = process_speech_probs(
signal=signal,
speech_probs=ring_vad.speech_probs,
frame_step=ring_vad.frame_step,
)
# plot
make_visualization(signal, speech_probs, SAMPLE_RATE, vad_segments)
return
if __name__ == "__main__":
main()