# cc_vad / toolbox / pydub / volume.py (uploaded by HoneyTian, commit 6efeebe: "update")
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
from typing import List
import librosa
import numpy as np
from pydub import AudioSegment
from scipy.io import wavfile
from project_settings import project_path
def score_transform(x: float, stages: List[float], scores: List[float], ndigits: int = 4):
    """
    Map ``x`` onto a score by piecewise-linear interpolation.

    ``stages`` and ``scores`` are parallel sequences of breakpoints, listed from
    the largest stage down to the smallest. The breakpoints are scanned from the
    top and the first segment whose lower stage is ``<= x`` is used; an ``x``
    above ``stages[0]`` is therefore extrapolated along the first segment.

    Args:
        x: The value to transform.
        stages: Breakpoints on the input axis, in decreasing order.
        scores: The score at each breakpoint; same length as ``stages``.
        ndigits: Number of decimal places the result is rounded to.

    Returns:
        The interpolated score, rounded to ``ndigits`` decimal places.

    Raises:
        ValueError: If no segment matches, i.e. ``x`` is smaller than every
            stage after the first, or fewer than two breakpoints were given.
        IndexError: If ``stages`` or ``scores`` is empty.
    """
    upper_stage = stages[0]
    upper_score = scores[0]

    # Walk the remaining breakpoints without rebinding the parameters, so the
    # error message below can still report the lists exactly as they were passed.
    for lower_stage, lower_score in zip(stages[1:], scores[1:]):
        if x >= lower_stage:
            # The 1e-7 keeps the division defined when two neighbouring stages coincide.
            fraction = (x - lower_stage) / (upper_stage - lower_stage + 1e-7)
            result = lower_score + fraction * (upper_score - lower_score)
            return round(result, ndigits)
        upper_stage = lower_stage
        upper_score = lower_score

    raise ValueError(f"x is below the lowest stage, or stages and scores are not "
                     f"same-length, decreasing lists with at least two entries. "
                     f"x: {x}, stages: {stages}, scores: {scores}")
def set_volume(waveform: np.ndarray, sample_rate: int = 8000, volume: int = 0):
    """
    Rescale a mono waveform so its loudness matches a volume setting.

    The waveform is converted to 16-bit PCM, wrapped in a pydub ``AudioSegment``
    and gained so that its dBFS equals the target derived from ``volume``.
    The volume-to-dBFS mapping is piecewise linear through the breakpoints
    0 -> -150, 10 -> -60, 50 -> -35 and 100 -> -20; values above 100 are
    extrapolated along the 50-100 segment.

    Args:
        waveform: Float samples within ``[-1, 1]``, treated as a single channel.
        sample_rate: Frame rate of ``waveform`` in Hz.
        volume: Requested volume; must not be below 0.

    Returns:
        A ``float32`` array holding the gained samples, scaled back by the
        full-scale value of the segment's sample width.

    Raises:
        AssertionError: If ``waveform`` has samples outside ``[-1, 1]``, or the
            resulting segment has a sample width other than 2, 3 or 4 bytes.
        ValueError: Raised by ``score_transform`` when ``volume`` is below 0.
    """
    if np.min(waveform) < -1 or np.max(waveform) > 1:
        raise AssertionError(f"waveform type: {type(waveform)}, dtype: {waveform.dtype}")

    # A sample of exactly 1.0 scales to 32768, one past the int16 maximum, and
    # would not survive the cast; clip to the representable range first.
    int16_range = np.iinfo(np.int16)
    pcm = np.clip(waveform * (1 << 15), int16_range.min, int16_range.max).astype(np.int16)

    audio_segment = AudioSegment(
        data=pcm.tobytes(),
        sample_width=2,
        frame_rate=sample_rate,
        channels=1
    )

    # [volume, target dBFS] breakpoints, in increasing volume order.
    map_list = [
        [0, -150],
        [10, -60],
        [50, -35],
        [100, -20],
    ]
    stages = [a for a, b in map_list]
    scores = [b for a, b in map_list]

    # Compute the target dBFS; score_transform expects decreasing breakpoints.
    target_db = score_transform(
        x=volume,
        stages=list(reversed(stages)),
        scores=list(reversed(scores)),
    )

    # pydub reports -inf dBFS for pure silence, which would turn the gain into
    # +inf. Silence stays silence at any gain, so skip the gain in that case.
    current_db = audio_segment.dBFS
    if np.isfinite(current_db):
        audio_segment = audio_segment.apply_gain(target_db - current_db)

    samples = np.array(audio_segment.get_array_of_samples())
    sample_width = audio_segment.sample_width
    if sample_width not in (2, 3, 4):
        raise AssertionError
    # Divide by the full-scale magnitude of a signed integer of this width.
    return samples.astype(np.float32) / (1 << (8 * sample_width - 1))
def get_args():
    """
    Parse the command-line options of this script.

    Returns:
        The parsed namespace. Its ``filename`` attribute is the audio file to
        process and defaults to ``data/examples/ai_agent/chinese-1.wav`` under
        ``project_path``.
    """
    default_filename = (project_path / "data/examples/ai_agent/chinese-1.wav").as_posix()

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--filename", type=str, default=default_filename)
    return arg_parser.parse_args()
def main():
    """
    Load the audio file named on the command line, set it to volume 10 and
    write the result to ``temp.wav`` as 16-bit PCM at 8000 Hz.
    """
    target_rate = 8000
    cli_args = get_args()

    audio, loaded_rate = librosa.load(cli_args.filename, sr=target_rate)
    adjusted = set_volume(waveform=audio, sample_rate=loaded_rate, volume=10)

    # Convert the float samples back to 16-bit integers for the wav writer.
    pcm = np.array(adjusted * (1 << 15), dtype=np.int16)
    wavfile.write("temp.wav", rate=target_rate, data=pcm)
# Run the demo only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()