Spaces:

qgyd2021
/

cc_vad

Running

File size: 2,882 Bytes

6efeebe

#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
from typing import List

import librosa
import numpy as np
from pydub import AudioSegment
from scipy.io import wavfile

from project_settings import project_path


def score_transform(x: float, stages: List[float], scores: List[float], ndigits: int = 4):
    """
    Map ``x`` onto a score by piecewise-linear interpolation between breakpoints.

    ``stages`` lists the breakpoints in decreasing order and ``scores`` the score
    attached to each breakpoint. Scanning from the largest breakpoint downwards,
    the first segment whose lower breakpoint is ``<= x`` is used for the
    interpolation. There is no upper-bound check, so an ``x`` larger than
    ``stages[0]`` is extrapolated along the first segment.

    :param x: value to transform.
    :param stages: breakpoints, in decreasing order.
    :param scores: score at each breakpoint, paired positionally with ``stages``.
    :param ndigits: number of decimal digits the result is rounded to.
    :return: the interpolated score, rounded to ``ndigits``.
    :raises ValueError: if ``x`` is smaller than every breakpoint after the first
        (which is always the case when fewer than two breakpoints are given).
    """
    upper_stage, upper_score = stages[0], scores[0]

    # Iterate over the tails without rebinding ``stages`` / ``scores`` so that the
    # error message below reports the caller's complete lists.
    for stage, score in zip(stages[1:], scores[1:]):
        if x >= stage:
            # The 1e-7 term keeps the division defined when two consecutive
            # breakpoints are equal.
            ratio = (x - stage) / (upper_stage - stage + 1e-7)
            return round(score + ratio * (upper_score - score), ndigits)
        upper_stage, upper_score = stage, score

    raise ValueError(f"values of x, stages and scores should between 0 and 1, "
                     f"stages and scores should be same length and decreased. "
                     f"x: {x}, stages: {stages}, scores: {scores}")


def set_volume(waveform: np.ndarray, sample_rate: int = 8000, volume: int = 0):
    """
    Normalize a mono waveform to the loudness selected by ``volume``.

    ``volume`` is mapped to a target dBFS by linear interpolation over the table
    ``0 -> -150, 10 -> -60, 50 -> -35, 100 -> -20``. The waveform is converted to
    16-bit PCM, wrapped in a pydub ``AudioSegment`` and gained by the difference
    between the target and its current dBFS.

    :param waveform: samples with every value inside ``[-1, 1]``.
    :param sample_rate: frame rate handed to the ``AudioSegment``.
    :param volume: loudness setting; the mapping table covers 0 to 100.
    :return: float32 array of the adjusted samples, scaled back by ``2 ** 15``.
    :raises AssertionError: if any sample lies outside ``[-1, 1]``.
    :raises ValueError: propagated from ``score_transform`` when ``volume`` is
        below the smallest table entry (0).
    """
    if np.min(waveform) < -1 or np.max(waveform) > 1:
        raise AssertionError(f"waveform type: {type(waveform)}, dtype: {waveform.dtype}")

    # A sample of exactly +1.0 scales to 32768, which does not fit in int16.
    # Clip to the representable range before casting so it cannot overflow.
    int16_range = np.iinfo(np.int16)
    scaled = np.clip(waveform * (1 << 15), int16_range.min, int16_range.max)
    pcm = scaled.astype(np.int16)

    audio_segment = AudioSegment(
        data=pcm.tobytes(),
        sample_width=2,
        frame_rate=sample_rate,
        channels=1
    )

    # (volume, dBFS) breakpoints, in increasing volume order.
    map_list = [
        [0, -150],
        [10, -60],
        [50, -35],
        [100, -20],
    ]
    stages = [a for a, _ in map_list]
    scores = [b for _, b in map_list]

    # Compute the target dBFS; score_transform expects decreasing breakpoints.
    target_db = score_transform(
        x=volume,
        stages=list(reversed(stages)),
        scores=list(reversed(scores)),
    )

    # pydub reports -inf dBFS for pure silence, which would turn the requested
    # gain into +inf. Silence cannot be amplified, so leave it untouched.
    current_db = audio_segment.dBFS
    if np.isfinite(current_db):
        audio_segment = audio_segment.apply_gain(target_db - current_db)

    samples = np.array(audio_segment.get_array_of_samples())

    # Scale integer PCM back to floats using the full-scale value of its width.
    sample_width = audio_segment.sample_width
    if sample_width not in (2, 3, 4):
        raise AssertionError
    return samples.astype(np.float32) / (1 << (sample_width * 8 - 1))


def get_args():
    """
    Parse the command-line arguments of this script.

    :return: namespace with a single ``filename`` attribute; it defaults to the
        bundled example ``data/examples/ai_agent/chinese-1.wav`` under the
        project path.
    """
    default_filename = (project_path / "data/examples/ai_agent/chinese-1.wav").as_posix()

    parser = argparse.ArgumentParser()
    parser.add_argument("--filename", default=default_filename, type=str)
    return parser.parse_args()


def main():
    """
    Demo entry point: load the input file at 8 kHz, set its volume to 10 and
    write the result to ``temp.wav`` as 16-bit PCM.
    """
    args = get_args()

    audio, sr = librosa.load(args.filename, sr=8000)
    adjusted = set_volume(waveform=audio, sample_rate=sr, volume=10)

    # set_volume returns floats; convert back to 16-bit integers for the WAV file.
    pcm = np.array(adjusted * (1 << 15), dtype=np.int16)
    wavfile.write("temp.wav", rate=8000, data=pcm)


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()