Spaces:

NCTCMumbai
/

NCTC

Running

App Files Files Community

NCTC / models /research /audioset /vggish /vggish_input.py

NCTCMumbai

Upload 2571 files

0b8359d almost 2 years ago

raw

history blame contribute delete

3.54 kB

	# Copyright 2017 The TensorFlow Authors All Rights Reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# ==============================================================================

	"""Compute input examples for VGGish from audio waveform."""

	import numpy as np
	import resampy

	import mel_features
	import vggish_params

	# `wav_read` is bound once, at import time, depending on whether the
	# optional `soundfile` package can be imported.
	try:
	    import soundfile as sf
	except ImportError:

	    def wav_read(wav_file):
	        """Placeholder reader used when `soundfile` is unavailable.

	        Args:
	          wav_file: Ignored.

	        Raises:
	          NotImplementedError: Always.
	        """
	        raise NotImplementedError('WAV file reading requires soundfile package.')

	else:

	    def wav_read(wav_file):
	        """Reads a WAV file through `soundfile`, requesting int16 samples.

	        Args:
	          wav_file: Value handed unchanged to `sf.read`.

	        Returns:
	          The (samples, sample_rate) pair returned by `sf.read`.
	        """
	        samples, rate = sf.read(wav_file, dtype='int16')
	        return samples, rate


	def waveform_to_examples(data, sample_rate):
	    """Turns an audio waveform into a batch of VGGish input examples.

	    The waveform is mixed down to mono, resampled to
	    vggish_params.SAMPLE_RATE if necessary, converted to a log mel
	    spectrogram, and finally cut into fixed-length example windows.

	    Args:
	      data: np.array holding the waveform. Input with more than one dimension
	        is averaged over axis 1, so for multi-channel audio the channels are
	        expected along axis 1 (shape [num_samples, num_channels]). Samples
	        are generally expected to lie in [-1.0, +1.0], although this is not
	        required.
	      sample_rate: Sample rate of data.

	    Returns:
	      3-D np.array of shape [num_examples, num_frames, num_bands]: a sequence
	      of examples, each a patch of log mel spectrogram covering num_frames
	      frames of audio and num_bands mel frequency bands, where the frame
	      length is vggish_params.STFT_HOP_LENGTH_SECONDS.
	    """
	    target_rate = vggish_params.SAMPLE_RATE

	    # Mix down to mono by averaging across axis 1.
	    mono = data.mean(axis=1) if data.ndim > 1 else data

	    # Bring the signal to the sample rate VGGish assumes.
	    if sample_rate != target_rate:
	        mono = resampy.resample(mono, sample_rate, target_rate)

	    # Log mel spectrogram of the mono, rate-matched signal.
	    spectrogram_options = dict(
	        audio_sample_rate=target_rate,
	        log_offset=vggish_params.LOG_OFFSET,
	        window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
	        hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
	        num_mel_bins=vggish_params.NUM_MEL_BINS,
	        lower_edge_hertz=vggish_params.MEL_MIN_HZ,
	        upper_edge_hertz=vggish_params.MEL_MAX_HZ,
	    )
	    spectrogram = mel_features.log_mel_spectrogram(mono, **spectrogram_options)

	    # Express the example window and hop in spectrogram frames: the
	    # spectrogram has one frame per STFT hop.
	    frames_per_second = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
	    window_frames = int(
	        round(vggish_params.EXAMPLE_WINDOW_SECONDS * frames_per_second))
	    hop_frames = int(
	        round(vggish_params.EXAMPLE_HOP_SECONDS * frames_per_second))

	    # Slice the spectrogram into examples.
	    return mel_features.frame(
	        spectrogram, window_length=window_frames, hop_length=hop_frames)


	def wavfile_to_examples(wav_file):
	    """Reads a 16-bit PCM WAV file and converts it to VGGish examples.

	    Thin convenience wrapper: loads the audio with wav_read(), rescales the
	    integer samples to floating point, and delegates to
	    waveform_to_examples().

	    Args:
	      wav_file: String path to a file, or a file-like object. The file is
	        assumed to contain WAV audio data with signed 16-bit PCM samples.

	    Returns:
	      See waveform_to_examples.
	    """
	    pcm, rate = wav_read(wav_file)
	    # Sanity check on the reader's output before scaling.
	    assert pcm.dtype == np.int16, 'Bad sample type: %r' % pcm.dtype
	    # Dividing by 32768.0 maps int16 samples into [-1.0, +1.0].
	    return waveform_to_examples(pcm / 32768.0, rate)