# Copyright 2018 Google, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Closed-form linear regression.

Can be differentiated through.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections

import numpy as np
import sonnet as snt
import tensorflow as tf

from learning_unsupervised_learning import utils
from learning_unsupervised_learning import variable_replace


def solve_ridge(x, y, ridge_factor):
  """Closed-form ridge regression, with an unpenalized bias column appended to x."""
  with tf.name_scope("solve_ridge"):
    # Add a column of ones to the end of the feature matrix for the bias term.
    A = tf.concat([x, tf.ones((x.shape.as_list()[0], 1))], axis=1)

    # Analytic solution for the ridge regression loss.
    inv_target = tf.matmul(A, A, transpose_a=True)
    np_diag_penalty = ridge_factor * np.ones(
        A.shape.as_list()[1], dtype="float32")
    # Remove the penalty on the bias component of the weights.
    np_diag_penalty[-1] = 0.
    diag_penalty = tf.constant(np_diag_penalty)
    inv_target += tf.diag(diag_penalty)

    inv = tf.matrix_inverse(inv_target)
    w = tf.matmul(inv, tf.matmul(A, y, transpose_a=True))
    return w


class LinearRegressionMetaObjective(snt.AbstractModule):
  """A meta objective that trains ridge regression with an analytic solution.

  This is used to evaluate the performance of a given feature set trained in
  some other manner.
  """

  def __init__(self,
               local_device=None,
               remote_device=None,
               zero_one_labels=True,
               normalize_y_hat=True,
               normalize_act=False,
               averages=1,
               ridge_factor=0.1,
               center_y=True,
               hinge_loss=False,
               samples_per_class=10,
               test_train_scalar=1.0):
    self._local_device = local_device
    self._remote_device = remote_device
    self.zero_one_labels = zero_one_labels
    self.normalize_y_hat = normalize_y_hat
    self.normalize_act = normalize_act
    self.ridge_factor = ridge_factor
    self.averages = averages
    self.samples_per_class = samples_per_class
    self.center_y = center_y
    self.test_train_scalar = test_train_scalar
    self.hinge_loss = hinge_loss

    self.dataset_map = {}

    super(LinearRegressionMetaObjective,
          self).__init__(name="LinearRegressionMetaObjective")

  def _build(self, dataset, feature_transformer):
    if self.samples_per_class is not None:
      if dataset not in self.dataset_map:
        # Datasets must be constructed outside the frames of while loops.
        with tf.control_dependencies(None):
          self.dataset_map[dataset] = utils.sample_n_per_class(
              dataset, self.samples_per_class)

      dataset = self.dataset_map[dataset]

    stats = collections.defaultdict(list)
    losses = []
    # TODO(lmetz) move this to ingraph control flow?
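    # Average the meta-objective over several independent evaluations to
    # reduce variance in the loss estimate.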
    for _ in range(self.averages):
      loss, stat = self._build_once(dataset, feature_transformer)
      losses.append(loss)
      for k, v in stat.items():
        stats[k].append(v)
    stats = {k: tf.add_n(v) / float(len(v)) for k, v in stats.items()}

    summary_updates = []
    for k, v in stats.items():
      tf.summary.scalar(k, v)

    with tf.control_dependencies(summary_updates):
      return tf.add_n(losses) / float(len(losses))

  def _build_once(self, dataset, feature_transformer):
    with tf.device(self._local_device):
      batch = dataset()
      num_classes = batch.label_onehot.shape.as_list()[1]

      regression_mod = snt.Linear(num_classes)

      if self.normalize_act:

        def normalize_transformer(x):
          unnorm_x = feature_transformer(x)
          return tf.nn.l2_normalize(unnorm_x, 0)

        feature_transformer_wrap = normalize_transformer
      else:
        feature_transformer_wrap = feature_transformer

      # Construct the variables of the right shape in the sonnet module by
      # calling a forward pass through the regressor.
      with utils.assert_no_new_variables():
        dummy_features = feature_transformer_wrap(batch)
        regression_mod(dummy_features)
      reg_w = regression_mod.w
      reg_b = regression_mod.b

      # Grab a new batch of data from the dataset.
      batch_test = dataset()
      all_batch = utils.structure_map_multi(lambda x: tf.concat(x, 0),
                                            [batch, batch_test])
      features = feature_transformer_wrap(all_batch)
      features, features_test = utils.structure_map_split(
          lambda x: tf.split(x, 2, axis=0), features)

      def center_y(y):
        y -= tf.reduce_mean(y)
        y *= tf.rsqrt(
            tf.reduce_mean(tf.reduce_sum(y**2, axis=[1], keep_dims=True)))
        return y

      def get_y_vec(batch):
        y_pieces = []
        if hasattr(batch, "label_onehot"):
          if self.zero_one_labels:
            y_pieces += [batch.label_onehot]
          else:
            y_pieces += [2. * batch.label_onehot - 1.]
        if hasattr(batch, "regression_target"):
          y_pieces += [batch.regression_target]
        y = tf.concat(y_pieces, 1)
        if self.center_y:
          y = center_y(y)
        return y

      y_train = get_y_vec(batch)

      w = solve_ridge(features, y_train, self.ridge_factor)

      # Generate targets from the held-out batch to evaluate the loss on the
      # validation set. This provides a less overfit signal to the learned
      # optimizer.
      y_test = get_y_vec(batch_test)

      def compute_logit(features):
        # We have updated the classifier mod in previous steps; we need to
        # substitute out those variables to get new values.
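        # The ridge solution w stacks the weight matrix on top of a final bias
        # row, so w[:-1] rebinds the snt.Linear weights and w[-1] its bias.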
        replacement = collections.OrderedDict([(reg_w, w[:-1]), (reg_b, w[-1])])
        with variable_replace.variable_replace(replacement):
          logits = regression_mod(features)
        return logits

      batch_size = y_train.shape.as_list()[0]

      logit_train = compute_logit(features)
      logit_test_unnorm = compute_logit(features_test)
      if self.normalize_y_hat:
        logit_test = logit_test_unnorm / tf.sqrt(
            tf.reduce_sum(logit_test_unnorm**2, axis=[1], keep_dims=True))
      else:
        logit_test = logit_test_unnorm

      stats = {}

      if self.hinge_loss:
        # Slightly closer to the true classification loss: any distance
        # smaller than 1 is guaranteed to map to the correct class.
        mse_test = tf.reduce_sum(
            tf.nn.relu(
                tf.reduce_sum(tf.square(logit_test - y_test), axis=1) -
                1.)) / batch_size
      else:
        mse_test = tf.reduce_sum(tf.square(logit_test - y_test)) / batch_size
      stats["mse_test"] = mse_test

      mse_train = tf.reduce_sum(tf.square(logit_train - y_train)) / batch_size
      stats["mse_train"] = mse_train

      is_correct_test = tf.equal(tf.argmax(logit_test, 1), tf.argmax(y_test, 1))
      accuracy_test = tf.reduce_mean(tf.cast(is_correct_test, tf.float32))
      stats["accuracy_test"] = accuracy_test

      def test_confusion_fn():
        test_confusion = tf.confusion_matrix(
            tf.argmax(y_test, 1), tf.argmax(logit_test, 1))
        test_confusion = tf.to_float(test_confusion) / tf.constant(
            (logit_test.shape.as_list()[0] /
             float(logit_test.shape.as_list()[1])),
            dtype=tf.float32)
        test_confusion = tf.expand_dims(tf.expand_dims(test_confusion, 0), 3)
        return test_confusion

      tf.summary.image("test_confusion", test_confusion_fn())

      def train_confusion_fn():
        train_confusion = tf.confusion_matrix(
            tf.argmax(y_train, 1), tf.argmax(logit_train, 1))
        train_confusion = tf.to_float(train_confusion) / tf.constant(
            (logit_train.shape.as_list()[0] /
             float(logit_train.shape.as_list()[1])),
            dtype=tf.float32)
        train_confusion = tf.expand_dims(tf.expand_dims(train_confusion, 0), 3)
        return train_confusion

      tf.summary.image("train_confusion", train_confusion_fn())

      is_correct = tf.equal(tf.argmax(logit_train, 1), tf.argmax(y_train, 1))
      accuracy_train = tf.reduce_mean(tf.cast(is_correct, tf.float32))
      stats["accuracy_train"] = accuracy_train

      reg = self.ridge_factor * tf.reduce_sum(tf.square(w[:-1])) / batch_size
      stats["ridge_component"] = reg

      stats["total_loss"] = mse_test + reg

      loss_to_train_at = (reg + mse_test) * self.test_train_scalar + (
          mse_train + reg) * (1 - self.test_train_scalar)

      loss_to_train_at = tf.identity(loss_to_train_at)

      # Minimizing the test loss should not require regularization because the
      # meta-objective is solved for the training loss.
      return loss_to_train_at, stats

  def local_variables(self):
    """List of variables that need to be updated for each evaluation.

    These variables should not be stored on a parameter server and should be
    reset every computation of a meta_objective loss.

    Returns:
      vars: list of tf.Variable
    """
    return list(
        snt.get_variables_in_module(self, tf.GraphKeys.TRAINABLE_VARIABLES))

  def remote_variables(self):
    return []
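

# The block below is a minimal sanity-check sketch, not part of the original
# training pipeline: it compares solve_ridge against a NumPy implementation of
# the same closed-form solution, w = (A^T A + penalty)^-1 A^T y with an
# unpenalized bias column. It assumes a TF 1.x graph-mode runtime with
# tf.Session (consistent with the tf.diag / tf.matrix_inverse calls above);
# the shapes and the ridge value are arbitrary illustrative choices.
if __name__ == "__main__":
  np.random.seed(0)
  x_np = np.random.randn(32, 8).astype("float32")
  y_np = np.random.randn(32, 3).astype("float32")
  ridge = 0.1

  w_tf = solve_ridge(tf.constant(x_np), tf.constant(y_np), ridge)

  # NumPy reference: append a bias column and penalize every diagonal entry
  # except the one corresponding to the bias.
  a = np.concatenate([x_np, np.ones((32, 1), dtype="float32")], axis=1)
  penalty = ridge * np.eye(a.shape[1], dtype="float32")
  penalty[-1, -1] = 0.
  w_np = np.linalg.solve(a.T.dot(a) + penalty, a.T.dot(y_np))

  with tf.Session() as sess:
    print("max abs difference vs NumPy:", np.max(np.abs(sess.run(w_tf) - w_np)))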