# Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# This work is made available under the Nvidia Source Code License-NC.
# To view a copy of this license, check out LICENSE.md
"""Utils for the pix2pixHD model."""
import numpy as np
import torch

from imaginaire.utils.data import get_paired_input_label_channel_number
from imaginaire.utils.distributed import dist_all_gather_tensor, is_master
from imaginaire.utils.distributed import master_only_print as print
from imaginaire.utils.trainer import (get_optimizer, get_optimizer_for_params,
                                      wrap_model_and_optimizer)
from sklearn.cluster import KMeans


def cluster_features(cfg, train_data_loader, net_E, preprocess=None,
                     small_ratio=0.0625, is_cityscapes=True):
    r"""Use clustering to compute representative instance features.

    The cluster centers are written in place into the per-label
    ``cluster_%d`` buffers of ``net_E`` (on the master process only);
    nothing is returned.

    Args:
        cfg (obj): Global configuration file.
        train_data_loader (obj): Dataloader to iterate through the
            training set.
        net_E (nn.Module): Encoder network carrying one ``cluster_%d``
            buffer per semantic label.
        preprocess (function): Optional pre-processing function applied to
            each batch before encoding.
        small_ratio (float): We only consider instances that occupy at
            least this fraction of the image area.
        is_cityscapes (bool): Is this the Cityscapes dataset? In the
            Cityscapes dataset, the instance labels for car start with
            26001, 26002, ...
    """
    label_nc = get_paired_input_label_channel_number(cfg.data)
    feat_nc = cfg.gen.enc.num_feat_channels
    n_clusters = getattr(cfg.gen.enc, 'num_clusters', 10)

    # Collect one (feat_nc + 1)-dim vector per instance, grouped by label.
    features = {label: np.zeros((0, feat_nc + 1)) for label in range(label_nc)}
    for data in train_data_loader:
        if preprocess is not None:
            data = preprocess(data)
        feat = encode_features(net_E, feat_nc, label_nc, data['images'],
                               data['instance_maps'], is_cityscapes)
        # We only collect the feature vectors for the master GPU.
        if is_master():
            for label in range(label_nc):
                features[label] = np.append(features[label], feat[label],
                                            axis=0)

    # Clustering. We only perform clustering for the master GPU.
    if not is_master():
        return
    for label in range(label_nc):
        feat = features[label]
        # Only consider segments greater than the pre-set area threshold;
        # strip the trailing area column before clustering.
        feat = feat[feat[:, -1] > small_ratio, :-1]
        if feat.shape[0]:
            # BUGFIX: use a per-label cluster count. The original
            # reassigned ``n_clusters`` here, so one label with few
            # samples permanently shrank the cluster count for every
            # subsequent label.
            num_clusters = min(feat.shape[0], n_clusters)
            kmeans = KMeans(n_clusters=num_clusters,
                            random_state=0).fit(feat)
            n = kmeans.cluster_centers_.shape[0]
            this_cluster = getattr(net_E, 'cluster_%d' % label)
            this_cluster[0:n, :] = torch.Tensor(
                kmeans.cluster_centers_).float()


def encode_features(net_E, feat_nc, label_nc, image, inst,
                    is_cityscapes=True):
    r"""Compute feature embeddings for each instance in an image.

    TODO(Ting-Chun): To make this function dataset independent.

    Args:
        net_E (nn.Module): The encoder network.
        feat_nc (int): Feature dimensions.
        label_nc (int): Number of segmentation labels.
        image (tensor): Input image tensor.
        inst (tensor): Input instance map (assumed N x 1 x H x W — the
            middle-pixel lookup below indexes channel 0).
        is_cityscapes (bool): Is this the Cityscapes dataset? In the
            Cityscapes dataset, the instance labels for car start with
            26001, 26002, ...

    Returns:
        (dict of numpy arrays): One entry per label. Each array holds one
        row per instance with dimension $(feat_nc + 1), where the first
        $(feat_nc) dimensions are the representative feature of the
        instance and the last dimension is the fraction of the image it
        occupies. Non-master processes receive empty arrays.
    """
    feat_map = net_E(image, inst)
    feature_map_gathered = dist_all_gather_tensor(feat_map)
    inst_gathered = dist_all_gather_tensor(inst)

    # For each feature vector, dims 0:feat_nc hold the feature and dim
    # feat_nc records the area percentage of the instance.
    feature = {i: np.zeros((0, feat_nc + 1)) for i in range(label_nc)}
    if not is_master():
        return feature

    all_feat_map = torch.cat(feature_map_gathered, 0)
    all_inst_map = torch.cat(inst_gathered, 0)
    # Scan through the batches.
    for n in range(all_feat_map.size()[0]):
        feat_map = all_feat_map[n:(n + 1), :, :, :]
        inst = all_inst_map[n:(n + 1), :, :, :]
        fh, fw = feat_map.size()[2:]
        inst_np = inst.cpu().numpy().astype(int)
        for i in np.unique(inst_np):
            # Cityscapes encodes instance ids as label_id * 1000 + index.
            if is_cityscapes:
                label = i if i < 1000 else i // 1000
            else:
                label = i
            idx = (inst == int(i)).nonzero()
            num = idx.size()[0]
            # We will just pick the middle pixel as its representative
            # feature.
            idx = idx[num // 2, :]
            val = np.zeros((1, feat_nc + 1))
            for k in range(feat_nc):
                # We expect idx[0]=0 and idx[1]=0 as the number of samples
                # per processing is 1 (idx[0]=0) and the channel number of
                # the instance map is 1, so idx[1] + k addresses feature
                # channel k.
                val[0, k] = feat_map[
                    idx[0], idx[1] + k, idx[2], idx[3]].item()
            val[0, feat_nc] = float(num) / (fh * fw)
            feature[label] = np.append(feature[label], val, axis=0)
    return feature


def get_edges(t):
    r"""Compute edge maps for a given input instance map.

    A pixel is marked as an edge when its instance id differs from a
    horizontal or vertical neighbor.

    Args:
        t (4D tensor): Input instance map.

    Returns:
        (4D float tensor): Edge map on the same device as ``t`` (1 at
        instance boundaries, 0 elsewhere).
    """
    # BUGFIX: allocate on the input's device instead of hard-coding
    # torch.cuda.ByteTensor, so CPU tensors (and CUDA-less builds) work.
    edge = torch.zeros(t.size(), dtype=torch.uint8, device=t.device)
    # Hoist the neighbor comparisons: each difference mask marks both
    # pixels adjacent to the boundary.
    horiz = (t[:, :, :, 1:] != t[:, :, :, :-1]).byte()
    vert = (t[:, :, 1:, :] != t[:, :, :-1, :]).byte()
    edge[:, :, :, 1:] |= horiz
    edge[:, :, :, :-1] |= horiz
    edge[:, :, 1:, :] |= vert
    edge[:, :, :-1, :] |= vert
    return edge.float()


def get_train_params(net, param_names_start_with=None,
                     param_names_include=None):
    r"""Get train parameters.

    Args:
        net (obj): Network object.
        param_names_start_with (list of strings): Params whose names start
            with any of the strings will be trained.
        param_names_include (list of strings): Params whose names include
            any of the strings will be trained.

    Returns:
        (list): Parameter objects of ``net`` selected for training.
    """
    # BUGFIX: avoid mutable default arguments; None means "no filter".
    param_names_start_with = param_names_start_with or []
    param_names_include = param_names_include or []
    params_to_train = []
    params_dict = net.state_dict()
    list_of_param_names_to_train = set()
    # Iterate through all params in the network and check if we need to
    # train it.
    for key, value in params_dict.items():
        do_train = False
        # If the param name starts with the target string (excluding
        # the 'module' part etc), we will train this param.
        key_s = key.replace('module.', '').replace('averaged_model.', '')
        for param_name in param_names_start_with:
            if key_s.startswith(param_name):
                do_train = True
                list_of_param_names_to_train.add(param_name)
        # Otherwise, if the param name includes the target string,
        # we will also train it.
        if not do_train:
            for param_name in param_names_include:
                if param_name in key_s:
                    do_train = True
                    full_param_name = \
                        key_s[:(key_s.find(param_name) + len(param_name))]
                    list_of_param_names_to_train.add(full_param_name)
        # If we decide to train the param, walk the attribute path from
        # the state-dict key to fetch the actual parameter object.
        if do_train:
            module = net
            for k in key.split('.'):
                module = getattr(module, k)
            params_to_train += [module]
    print('Training layers: ', sorted(list_of_param_names_to_train))
    return params_to_train


def get_optimizer_with_params(cfg, net_G, net_D, param_names_start_with=None,
                              param_names_include=None):
    r"""Return the optimizer object.

    Args:
        cfg (obj): Global config.
        net_G (obj): Generator network.
        net_D (obj): Discriminator network.
        param_names_start_with (list of strings): Params whose names start
            with any of the strings will be trained.
        param_names_include (list of strings): Params whose names include
            any of the strings will be trained.

    Returns:
        (obj): Wrapped model and optimizer pair from
        ``wrap_model_and_optimizer``.
    """
    # If any of the param name lists is not empty, will only train
    # these params. Otherwise will train the entire network (all params).
    if param_names_start_with or param_names_include:
        params = get_train_params(net_G, param_names_start_with,
                                  param_names_include)
    else:
        params = net_G.parameters()
    opt_G = get_optimizer_for_params(cfg.gen_opt, params)
    opt_D = get_optimizer(cfg.dis_opt, net_D)
    return wrap_model_and_optimizer(cfg, net_G, net_D, opt_G, opt_D)