# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
r"""A simple demonstration of running VGGish in training mode. | |
This is intended as a toy example that demonstrates how to use the VGGish model | |
definition within a larger model that adds more layers on top, and then train | |
the larger model. If you let VGGish train as well, then this allows you to | |
fine-tune the VGGish model parameters for your application. If you don't let | |
VGGish train, then you use VGGish as a feature extractor for the layers above | |
it. | |
For this toy task, we are training a classifier to distinguish between three | |
classes: sine waves, constant signals, and white noise. We generate synthetic | |
waveforms from each of these classes, convert into shuffled batches of log mel | |
spectrogram examples with associated labels, and feed the batches into a model | |
that includes VGGish at the bottom and a couple of additional layers on top. We | |
also plumb in labels that are associated with the examples, which feed a label | |
loss used for training. | |
Usage: | |
# Run training for 100 steps using a model checkpoint in the default | |
# location (vggish_model.ckpt in the current directory). Allow VGGish | |
# to get fine-tuned. | |
$ python vggish_train_demo.py --num_batches 100 | |
# Same as before but run for fewer steps and don't change VGGish parameters | |
# and use a checkpoint in a different location | |
$ python vggish_train_demo.py --num_batches 50 \ | |
--train_vggish=False \ | |
--checkpoint /path/to/model/checkpoint | |
""" | |

from __future__ import print_function

from random import shuffle

import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import tf_slim as slim

import vggish_input
import vggish_params
import vggish_slim

flags = tf.app.flags

flags.DEFINE_integer(
    'num_batches', 30,
    'Number of batches of examples to feed into the model. Each batch is of '
    'variable size and contains shuffled examples of each class of audio.')

flags.DEFINE_boolean(
    'train_vggish', True,
    'If True, allow VGGish parameters to change during training, thus '
    'fine-tuning VGGish. If False, VGGish parameters are fixed, thus using '
    'VGGish as a fixed feature extractor.')

flags.DEFINE_string(
    'checkpoint', 'vggish_model.ckpt',
    'Path to the VGGish checkpoint file.')

FLAGS = flags.FLAGS

_NUM_CLASSES = 3


def _get_examples_batch():
  """Returns a shuffled batch of examples of all audio classes.

  Note that this is just a toy function because this is a simple demo intended
  to illustrate how the training code might work.

  Returns:
    a tuple (features, labels) where features is a NumPy array of shape
    [batch_size, num_frames, num_bands] where the batch_size is variable and
    each row is a log mel spectrogram patch of shape [num_frames, num_bands]
    suitable for feeding VGGish, while labels is a NumPy array of shape
    [batch_size, num_classes] where each row is a multi-hot label vector that
    provides the labels for corresponding rows in features.
  """
  # Make a waveform for each class.
  num_seconds = 5
  sr = 44100  # Sampling rate.
  t = np.linspace(0, num_seconds, int(num_seconds * sr))  # Time axis.

  # Random sine wave.
  freq = np.random.uniform(100, 1000)
  sine = np.sin(2 * np.pi * freq * t)

  # Random constant signal (magnitude * t would be a ramp, not a constant).
  magnitude = np.random.uniform(-1, 1)
  const = magnitude * np.ones_like(t)

  # White noise, uniform in [-1, 1].
  noise = np.random.uniform(-1, 1, size=t.shape)

  # Make examples of each signal and corresponding labels.
  # Sine is class index 0, Const class index 1, Noise class index 2.
  sine_examples = vggish_input.waveform_to_examples(sine, sr)
  sine_labels = np.array([[1, 0, 0]] * sine_examples.shape[0])
  const_examples = vggish_input.waveform_to_examples(const, sr)
  const_labels = np.array([[0, 1, 0]] * const_examples.shape[0])
  noise_examples = vggish_input.waveform_to_examples(noise, sr)
  noise_labels = np.array([[0, 0, 1]] * noise_examples.shape[0])

  # Shuffle (example, label) pairs across all classes.
  all_examples = np.concatenate((sine_examples, const_examples,
                                 noise_examples))
  all_labels = np.concatenate((sine_labels, const_labels, noise_labels))
  labeled_examples = list(zip(all_examples, all_labels))
  shuffle(labeled_examples)

  # Separate and return the features and labels as NumPy arrays, matching the
  # shapes promised in the docstring.
  features = np.array([example for (example, _) in labeled_examples])
  labels = np.array([label for (_, label) in labeled_examples])
  return (features, labels)
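

# A small illustrative helper (an addition, not part of the original demo):
# real audio could be substituted for the synthetic waveforms above, since
# vggish_input also provides wavfile_to_examples(), which reads a WAV file and
# applies the same log mel framing. The wav_path and class_index arguments
# here are hypothetical inputs.
def _wav_examples_batch(wav_path, class_index):
  """Returns (features, labels) for one WAV file whose class is known."""
  examples = vggish_input.wavfile_to_examples(wav_path)
  # Build a one-hot label and repeat it for every example from the file.
  label = np.zeros(_NUM_CLASSES)
  label[class_index] = 1
  labels = np.tile(label, (examples.shape[0], 1))
  return (examples, labels)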


def main(_):
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define VGGish.
    embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

    # Define a shallow classification model and associated training ops on top
    # of VGGish.
    with tf.variable_scope('mymodel'):
      # Add a fully connected layer with 100 units.
      num_units = 100
      fc = slim.fully_connected(embeddings, num_units)

      # Add a classifier layer at the end, consisting of parallel logistic
      # classifiers, one per class. This allows for multi-class tasks.
      logits = slim.fully_connected(
          fc, _NUM_CLASSES, activation_fn=None, scope='logits')
      tf.sigmoid(logits, name='prediction')
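      # Note that the sigmoid op is added to the graph and named but not kept
      # in a Python variable; it can be fetched later for inference, e.g. via
      # sess.graph.get_tensor_by_name('mymodel/prediction:0').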

      # Add training ops.
      with tf.variable_scope('train'):
        global_step = tf.Variable(
            0, name='global_step', trainable=False,
            collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                         tf.GraphKeys.GLOBAL_STEP])

        # Labels are assumed to be fed as a batch of multi-hot vectors, with
        # a 1 in the position of each positive class label, and 0 elsewhere.
        labels = tf.placeholder(
            tf.float32, shape=(None, _NUM_CLASSES), name='labels')

        # Cross-entropy label loss.
        xent = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=labels, name='xent')
        loss = tf.reduce_mean(xent, name='loss_op')
        tf.summary.scalar('loss', loss)
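        # This scalar summary is defined, but no tf.summary.FileWriter is
        # created in this demo, so the loss is only reported via the print()
        # in the training loop below.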

        # We use the same optimizer and hyperparameters as used to train
        # VGGish.
        optimizer = tf.train.AdamOptimizer(
            learning_rate=vggish_params.LEARNING_RATE,
            epsilon=vggish_params.ADAM_EPSILON)
        optimizer.minimize(loss, global_step=global_step, name='train_op')

    # Initialize all variables in the model, and then load the pre-trained
    # VGGish checkpoint.
    sess.run(tf.global_variables_initializer())
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
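    # The checkpoint restore covers only the VGGish variables, so the
    # 'mymodel' layers keep their random initialization from
    # global_variables_initializer() above.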

    # Locate all the tensors and ops we need for the training loop.
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0')
    global_step_tensor = sess.graph.get_tensor_by_name(
        'mymodel/train/global_step:0')
    loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0')
    train_op = sess.graph.get_operation_by_name('mymodel/train/train_op')

    # The training loop.
    for _ in range(FLAGS.num_batches):
      (features, labels) = _get_examples_batch()
      [num_steps, loss, _] = sess.run(
          [global_step_tensor, loss_tensor, train_op],
          feed_dict={features_tensor: features, labels_tensor: labels})
      print('Step %d: loss %g' % (num_steps, loss))
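
    # As an illustrative addition to the demo, run one inference pass after
    # training: fetch the sigmoid prediction tensor named in 'mymodel' above
    # and score a fresh synthetic batch.
    prediction_tensor = sess.graph.get_tensor_by_name('mymodel/prediction:0')
    (eval_features, _) = _get_examples_batch()
    predictions = sess.run(
        prediction_tensor, feed_dict={features_tensor: eval_features})
    print('Mean prediction per class:', predictions.mean(axis=0))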


if __name__ == '__main__':
  tf.app.run()