# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#

"""
Validation and classification
(train)            : inner-kfold classifier
(train, test)      : kfold classifier
(train, dev, test) : split classifier
"""
from __future__ import absolute_import, division, unicode_literals

import logging
import numpy as np

from senteval.tools.classifier import MLP

import sklearn
assert(sklearn.__version__ >= "0.18.0"), \
    "need to update sklearn to version >= 0.18.0"
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold


def get_classif_name(classifier_config, usepytorch):
    if not usepytorch:
        modelname = 'sklearn-LogReg'
    else:
        nhid = classifier_config['nhid']
        optim = 'adam' if 'optim' not in classifier_config else classifier_config['optim']
        bs = 64 if 'batch_size' not in classifier_config else classifier_config['batch_size']
        modelname = 'pytorch-MLP-nhid%s-%s-bs%s' % (nhid, optim, bs)
    return modelname


# Pytorch version
class InnerKFoldClassifier(object):
    """
    (train) split classifier : InnerKfold.
    """
    def __init__(self, X, y, config):
        self.X = X
        self.y = y
        self.featdim = X.shape[1]
        self.nclasses = config['nclasses']
        self.seed = config['seed']
        self.devresults = []
        self.testresults = []
        self.usepytorch = config['usepytorch']
        self.classifier_config = config['classifier']
        self.modelname = get_classif_name(self.classifier_config,
                                          self.usepytorch)
        self.k = 5 if 'kfold' not in config else config['kfold']

    def run(self):
        logging.info('Training {0} with (inner) {1}-fold cross-validation'
                     .format(self.modelname, self.k))

        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
               [2**t for t in range(-2, 4, 1)]
        skf = StratifiedKFold(n_splits=self.k, shuffle=True,
                              random_state=1111)
        innerskf = StratifiedKFold(n_splits=self.k, shuffle=True,
                                   random_state=1111)
        count = 0
        # Outer loop: each fold serves once as the held-out test set.
        for train_idx, test_idx in skf.split(self.X, self.y):
            count += 1
            X_train, X_test = self.X[train_idx], self.X[test_idx]
            y_train, y_test = self.y[train_idx], self.y[test_idx]
            scores = []
            # Inner loop: select the l2 regularization strength using only
            # the outer training folds.
            for reg in regs:
                regscores = []
                for inner_train_idx, inner_test_idx in innerskf.split(X_train, y_train):
                    X_in_train, X_in_test = X_train[inner_train_idx], X_train[inner_test_idx]
                    y_in_train, y_in_test = y_train[inner_train_idx], y_train[inner_test_idx]
                    if self.usepytorch:
                        clf = MLP(self.classifier_config, inputdim=self.featdim,
                                  nclasses=self.nclasses, l2reg=reg,
                                  seed=self.seed)
                        clf.fit(X_in_train, y_in_train,
                                validation_data=(X_in_test, y_in_test))
                    else:
                        clf = LogisticRegression(C=reg, random_state=self.seed)
                        clf.fit(X_in_train, y_in_train)
                    regscores.append(clf.score(X_in_test, y_in_test))
                scores.append(round(100*np.mean(regscores), 2))
            optreg = regs[np.argmax(scores)]
            logging.info('Best param found at split {0}: l2reg = {1} with score {2}'
                         .format(count, optreg, np.max(scores)))
            self.devresults.append(np.max(scores))

            # Retrain on the full outer training fold with the selected
            # regularization and evaluate on the held-out fold.
            if self.usepytorch:
                clf = MLP(self.classifier_config, inputdim=self.featdim,
                          nclasses=self.nclasses, l2reg=optreg,
                          seed=self.seed)
                clf.fit(X_train, y_train, validation_split=0.05)
            else:
                clf = LogisticRegression(C=optreg, random_state=self.seed)
                clf.fit(X_train, y_train)

            self.testresults.append(round(100*clf.score(X_test, y_test), 2))

        devaccuracy = round(np.mean(self.devresults), 2)
        testaccuracy = round(np.mean(self.testresults), 2)
        return devaccuracy, testaccuracy


class KFoldClassifier(object):
    """
    (train, test) split classifier : cross-validation on train.
""" def __init__(self, train, test, config): self.train = train self.test = test self.featdim = self.train['X'].shape[1] self.nclasses = config['nclasses'] self.seed = config['seed'] self.usepytorch = config['usepytorch'] self.classifier_config = config['classifier'] self.modelname = get_classif_name(self.classifier_config, self.usepytorch) self.k = 5 if 'kfold' not in config else config['kfold'] def run(self): # cross-validation logging.info('Training {0} with {1}-fold cross-validation' .format(self.modelname, self.k)) regs = [10**t for t in range(-5, -1)] if self.usepytorch else \ [2**t for t in range(-1, 6, 1)] skf = StratifiedKFold(n_splits=self.k, shuffle=True, random_state=self.seed) scores = [] for reg in regs: scanscores = [] for train_idx, test_idx in skf.split(self.train['X'], self.train['y']): # Split data X_train, y_train = self.train['X'][train_idx], self.train['y'][train_idx] X_test, y_test = self.train['X'][test_idx], self.train['y'][test_idx] # Train classifier if self.usepytorch: clf = MLP(self.classifier_config, inputdim=self.featdim, nclasses=self.nclasses, l2reg=reg, seed=self.seed) clf.fit(X_train, y_train, validation_data=(X_test, y_test)) else: clf = LogisticRegression(C=reg, random_state=self.seed) clf.fit(X_train, y_train) score = clf.score(X_test, y_test) scanscores.append(score) # Append mean score scores.append(round(100*np.mean(scanscores), 2)) # evaluation logging.info([('reg:' + str(regs[idx]), scores[idx]) for idx in range(len(scores))]) optreg = regs[np.argmax(scores)] devaccuracy = np.max(scores) logging.info('Cross-validation : best param found is reg = {0} \ with score {1}'.format(optreg, devaccuracy)) logging.info('Evaluating...') if self.usepytorch: clf = MLP(self.classifier_config, inputdim=self.featdim, nclasses=self.nclasses, l2reg=optreg, seed=self.seed) clf.fit(self.train['X'], self.train['y'], validation_split=0.05) else: clf = LogisticRegression(C=optreg, random_state=self.seed) clf.fit(self.train['X'], self.train['y']) yhat = clf.predict(self.test['X']) testaccuracy = clf.score(self.test['X'], self.test['y']) testaccuracy = round(100*testaccuracy, 2) return devaccuracy, testaccuracy, yhat class SplitClassifier(object): """ (train, valid, test) split classifier. """ def __init__(self, X, y, config): self.X = X self.y = y self.nclasses = config['nclasses'] self.featdim = self.X['train'].shape[1] self.seed = config['seed'] self.usepytorch = config['usepytorch'] self.classifier_config = config['classifier'] self.cudaEfficient = False if 'cudaEfficient' not in config else \ config['cudaEfficient'] self.modelname = get_classif_name(self.classifier_config, self.usepytorch) self.noreg = False if 'noreg' not in config else config['noreg'] self.config = config def run(self): logging.info('Training {0} with standard validation..' 
                     .format(self.modelname))
        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
               [2**t for t in range(-2, 4, 1)]
        if self.noreg:
            regs = [1e-9 if self.usepytorch else 1e9]
        scores = []
        for reg in regs:
            if self.usepytorch:
                clf = MLP(self.classifier_config, inputdim=self.featdim,
                          nclasses=self.nclasses, l2reg=reg,
                          seed=self.seed, cudaEfficient=self.cudaEfficient)
                # TODO: Find a hack for reducing the number of epochs on SNLI
                clf.fit(self.X['train'], self.y['train'],
                        validation_data=(self.X['valid'], self.y['valid']))
            else:
                clf = LogisticRegression(C=reg, random_state=self.seed)
                clf.fit(self.X['train'], self.y['train'])
            scores.append(round(100*clf.score(self.X['valid'],
                                              self.y['valid']), 2))
        logging.info([('reg:' + str(regs[idx]), scores[idx])
                      for idx in range(len(scores))])
        optreg = regs[np.argmax(scores)]
        devaccuracy = np.max(scores)
        logging.info('Validation: best param found is reg = {0} with score {1}'
                     .format(optreg, devaccuracy))

        # Retrain on train with the best regularization, evaluate on test.
        logging.info('Evaluating...')
        if self.usepytorch:
            clf = MLP(self.classifier_config, inputdim=self.featdim,
                      nclasses=self.nclasses, l2reg=optreg,
                      seed=self.seed, cudaEfficient=self.cudaEfficient)
            # TODO: Find a hack for reducing the number of epochs on SNLI
            clf.fit(self.X['train'], self.y['train'],
                    validation_data=(self.X['valid'], self.y['valid']))
        else:
            clf = LogisticRegression(C=optreg, random_state=self.seed)
            clf.fit(self.X['train'], self.y['train'])

        testaccuracy = clf.score(self.X['test'], self.y['test'])
        testaccuracy = round(100*testaccuracy, 2)
        return devaccuracy, testaccuracy
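

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the SentEval API): a minimal
# run of InnerKFoldClassifier on synthetic random features, assuming the
# scikit-learn path (usepytorch=False). The data, shapes and config values
# below are made up for demonstration; the config keys mirror those read in
# __init__ above.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    rng = np.random.RandomState(1111)
    X = rng.randn(200, 16)                  # 200 examples, 16-dim features
    y = rng.randint(0, 2, size=200)         # binary labels
    config = {
        'nclasses': 2,
        'seed': 1111,
        'usepytorch': False,                # use sklearn LogisticRegression
        'classifier': {'nhid': 0},          # only read when usepytorch=True
        'kfold': 5,
    }
    devacc, testacc = InnerKFoldClassifier(X, y, config).run()
    print('dev acc: {0}, test acc: {1}'.format(devacc, testacc))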