/**
 * Sample rate of the input audio, in samples per second (Hz).
 * Coincidentally, this is the same for both models (Moonshine and Silero VAD).
 */
export const INPUT_SAMPLE_RATE = 16000;

/**
 * Number of samples per millisecond. Used to convert the `*_MS` durations
 * below into sample counts.
 */
const INPUT_SAMPLE_RATE_MS = INPUT_SAMPLE_RATE / 1000;

/**
 * Probabilities ABOVE this value are considered as SPEECH.
 */
export const SPEECH_THRESHOLD = 0.3;

/**
 * If the current state is SPEECH and the probability of the next state
 * is below this value, it is considered as NON-SPEECH.
 * Note that this is deliberately lower than SPEECH_THRESHOLD.
 */
export const EXIT_THRESHOLD = 0.1;

/**
 * After each speech chunk, wait for at least this amount of silence
 * (in milliseconds) before considering the next chunk as a new speech chunk.
 */
export const MIN_SILENCE_DURATION_MS = 400;

/**
 * MIN_SILENCE_DURATION_MS expressed as a number of samples.
 */
export const MIN_SILENCE_DURATION_SAMPLES =
  MIN_SILENCE_DURATION_MS * INPUT_SAMPLE_RATE_MS;

/**
 * Pad the speech chunk with this amount (in milliseconds) on each side.
 */
export const SPEECH_PAD_MS = 80;

/**
 * SPEECH_PAD_MS expressed as a number of samples.
 */
export const SPEECH_PAD_SAMPLES = SPEECH_PAD_MS * INPUT_SAMPLE_RATE_MS;

/**
 * Final speech chunks below this duration (in milliseconds) are discarded.
 */
export const MIN_SPEECH_DURATION_MS = 250;

/**
 * MIN_SPEECH_DURATION_MS expressed as a number of samples.
 */
export const MIN_SPEECH_DURATION_SAMPLES =
  MIN_SPEECH_DURATION_MS * INPUT_SAMPLE_RATE_MS;

/**
 * Maximum duration of audio that can be handled by Moonshine.
 * NOTE(review): the unit is presumably seconds — confirm against the consumer.
 */
export const MAX_BUFFER_DURATION = 30;

/**
 * Size of the incoming buffers, in samples (it is compared against a sample
 * count when deriving MAX_NUM_PREV_BUFFERS).
 */
export const NEW_BUFFER_SIZE = 512;

/**
 * The number of previous buffers to keep, to ensure the audio is padded
 * correctly. Rounded up so that the kept buffers always cover at least
 * SPEECH_PAD_SAMPLES samples.
 */
export const MAX_NUM_PREV_BUFFERS = Math.ceil(
  SPEECH_PAD_SAMPLES / NEW_BUFFER_SIZE,
);