""" |
|
Validation and classification |
|
(train) : inner-kfold classifier |
|
(train, test) : kfold classifier |
|
(train, dev, test) : split classifier |
|
|
|
""" |
|
from __future__ import absolute_import, division, unicode_literals

import logging

import numpy as np

from senteval.tools.classifier import MLP

import sklearn
assert sklearn.__version__ >= "0.18.0", \
    "need to update sklearn to version >= 0.18.0"
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
|
|
def get_classif_name(classifier_config, usepytorch):
    """Build a human-readable model name for logging."""
    if not usepytorch:
        modelname = 'sklearn-LogReg'
    else:
        nhid = classifier_config['nhid']
        optim = classifier_config.get('optim', 'adam')
        bs = classifier_config.get('batch_size', 64)
        modelname = 'pytorch-MLP-nhid%s-%s-bs%s' % (nhid, optim, bs)
    return modelname
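
# A minimal sketch of the names this helper produces (the 'nhid' value is
# illustrative):
#
#   >>> get_classif_name({'nhid': 0}, usepytorch=False)
#   'sklearn-LogReg'
#   >>> get_classif_name({'nhid': 50}, usepytorch=True)
#   'pytorch-MLP-nhid50-adam-bs64'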
|
|
class InnerKFoldClassifier(object):
    """
    (train) split classifier : InnerKFold.
    """
    def __init__(self, X, y, config):
        self.X = X
        self.y = y
        self.featdim = X.shape[1]
        self.nclasses = config['nclasses']
        self.seed = config['seed']
        self.devresults = []
        self.testresults = []
        self.usepytorch = config['usepytorch']
        self.classifier_config = config['classifier']
        self.modelname = get_classif_name(self.classifier_config,
                                          self.usepytorch)
        self.k = config.get('kfold', 5)

    def run(self):
        logging.info('Training {0} with (inner) {1}-fold cross-validation'
                     .format(self.modelname, self.k))

        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
               [2**t for t in range(-2, 4, 1)]
        skf = StratifiedKFold(n_splits=self.k, shuffle=True,
                              random_state=1111)
        innerskf = StratifiedKFold(n_splits=self.k, shuffle=True,
                                   random_state=1111)
        count = 0
        for train_idx, test_idx in skf.split(self.X, self.y):
            count += 1
            X_train, X_test = self.X[train_idx], self.X[test_idx]
            y_train, y_test = self.y[train_idx], self.y[test_idx]

            # Inner loop: select the best regularization strength on the
            # inner folds of the outer training split.
            scores = []
            for reg in regs:
                regscores = []
                for inner_train_idx, inner_test_idx in innerskf.split(X_train, y_train):
                    X_in_train, X_in_test = X_train[inner_train_idx], X_train[inner_test_idx]
                    y_in_train, y_in_test = y_train[inner_train_idx], y_train[inner_test_idx]
                    if self.usepytorch:
                        clf = MLP(self.classifier_config, inputdim=self.featdim,
                                  nclasses=self.nclasses, l2reg=reg,
                                  seed=self.seed)
                        clf.fit(X_in_train, y_in_train,
                                validation_data=(X_in_test, y_in_test))
                    else:
                        clf = LogisticRegression(C=reg, random_state=self.seed)
                        clf.fit(X_in_train, y_in_train)
                    regscores.append(clf.score(X_in_test, y_in_test))
                scores.append(round(100 * np.mean(regscores), 2))
            optreg = regs[np.argmax(scores)]
            logging.info('Best param found at split {0}: l2reg = {1} '
                         'with score {2}'.format(count, optreg, np.max(scores)))
            self.devresults.append(np.max(scores))

            # Retrain on the full outer training split with the selected
            # regularization and evaluate on the held-out outer fold.
            if self.usepytorch:
                clf = MLP(self.classifier_config, inputdim=self.featdim,
                          nclasses=self.nclasses, l2reg=optreg,
                          seed=self.seed)
                clf.fit(X_train, y_train, validation_split=0.05)
            else:
                clf = LogisticRegression(C=optreg, random_state=self.seed)
                clf.fit(X_train, y_train)
            self.testresults.append(round(100 * clf.score(X_test, y_test), 2))

        devaccuracy = round(np.mean(self.devresults), 2)
        testaccuracy = round(np.mean(self.testresults), 2)
        return devaccuracy, testaccuracy
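
# Hedged usage sketch: running InnerKFoldClassifier on synthetic data via the
# sklearn path (usepytorch=False). The config keys mirror what __init__ reads
# above; the 'classifier' sub-config value is illustrative.
#
#   import numpy as np
#   rng = np.random.RandomState(1111)
#   X = rng.randn(200, 32)
#   y = rng.randint(0, 3, size=200)
#   config = {'nclasses': 3, 'seed': 1111, 'usepytorch': False,
#             'kfold': 5, 'classifier': {'nhid': 0}}
#   dev_acc, test_acc = InnerKFoldClassifier(X, y, config).run()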
|
|
class KFoldClassifier(object):
    """
    (train, test) split classifier : cross-validation on train.
    """
    def __init__(self, train, test, config):
        self.train = train
        self.test = test
        self.featdim = self.train['X'].shape[1]
        self.nclasses = config['nclasses']
        self.seed = config['seed']
        self.usepytorch = config['usepytorch']
        self.classifier_config = config['classifier']
        self.modelname = get_classif_name(self.classifier_config,
                                          self.usepytorch)
        self.k = config.get('kfold', 5)

    def run(self):
        logging.info('Training {0} with {1}-fold cross-validation'
                     .format(self.modelname, self.k))

        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
               [2**t for t in range(-1, 6, 1)]
        skf = StratifiedKFold(n_splits=self.k, shuffle=True,
                              random_state=self.seed)

        # Cross-validate each regularization strength on the training set.
        scores = []
        for reg in regs:
            scanscores = []
            for train_idx, test_idx in skf.split(self.train['X'],
                                                 self.train['y']):
                X_train, y_train = self.train['X'][train_idx], self.train['y'][train_idx]
                X_test, y_test = self.train['X'][test_idx], self.train['y'][test_idx]

                if self.usepytorch:
                    clf = MLP(self.classifier_config, inputdim=self.featdim,
                              nclasses=self.nclasses, l2reg=reg,
                              seed=self.seed)
                    clf.fit(X_train, y_train, validation_data=(X_test, y_test))
                else:
                    clf = LogisticRegression(C=reg, random_state=self.seed)
                    clf.fit(X_train, y_train)
                scanscores.append(clf.score(X_test, y_test))
            scores.append(round(100 * np.mean(scanscores), 2))

        logging.info([('reg:' + str(regs[idx]), scores[idx])
                      for idx in range(len(scores))])
        optreg = regs[np.argmax(scores)]
        devaccuracy = np.max(scores)
        logging.info('Cross-validation : best param found is reg = {0} '
                     'with score {1}'.format(optreg, devaccuracy))

        # Retrain on the full training set with the selected regularization
        # and evaluate on the held-out test set.
        logging.info('Evaluating...')
        if self.usepytorch:
            clf = MLP(self.classifier_config, inputdim=self.featdim,
                      nclasses=self.nclasses, l2reg=optreg,
                      seed=self.seed)
            clf.fit(self.train['X'], self.train['y'], validation_split=0.05)
        else:
            clf = LogisticRegression(C=optreg, random_state=self.seed)
            clf.fit(self.train['X'], self.train['y'])
        yhat = clf.predict(self.test['X'])

        testaccuracy = clf.score(self.test['X'], self.test['y'])
        testaccuracy = round(100 * testaccuracy, 2)
        return devaccuracy, testaccuracy, yhat
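
# Hedged usage sketch: KFoldClassifier expects explicit train/test dicts with
# 'X' and 'y' arrays, and returns test predictions alongside the accuracies.
# The array names below are placeholders.
#
#   train = {'X': X_train, 'y': y_train}
#   test = {'X': X_test, 'y': y_test}
#   config = {'nclasses': 2, 'seed': 1111, 'usepytorch': False,
#             'kfold': 5, 'classifier': {'nhid': 0}}
#   dev_acc, test_acc, yhat = KFoldClassifier(train, test, config).run()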
|
|
class SplitClassifier(object):
    """
    (train, valid, test) split classifier.
    """
    def __init__(self, X, y, config):
        self.X = X
        self.y = y
        self.nclasses = config['nclasses']
        self.featdim = self.X['train'].shape[1]
        self.seed = config['seed']
        self.usepytorch = config['usepytorch']
        self.classifier_config = config['classifier']
        self.cudaEfficient = config.get('cudaEfficient', False)
        self.modelname = get_classif_name(self.classifier_config,
                                          self.usepytorch)
        self.noreg = config.get('noreg', False)
        self.config = config

    def run(self):
        logging.info('Training {0} with standard validation..'
                     .format(self.modelname))
        regs = [10**t for t in range(-5, -1)] if self.usepytorch else \
               [2**t for t in range(-2, 4, 1)]
        if self.noreg:
            # Effectively disable regularization: a tiny l2reg for the MLP,
            # a huge C (inverse regularization strength) for LogisticRegression.
            regs = [1e-9 if self.usepytorch else 1e9]

        # Select the regularization strength on the validation set.
        scores = []
        for reg in regs:
            if self.usepytorch:
                clf = MLP(self.classifier_config, inputdim=self.featdim,
                          nclasses=self.nclasses, l2reg=reg,
                          seed=self.seed, cudaEfficient=self.cudaEfficient)
                clf.fit(self.X['train'], self.y['train'],
                        validation_data=(self.X['valid'], self.y['valid']))
            else:
                clf = LogisticRegression(C=reg, random_state=self.seed)
                clf.fit(self.X['train'], self.y['train'])
            scores.append(round(100 * clf.score(self.X['valid'],
                                                self.y['valid']), 2))
        logging.info([('reg:' + str(regs[idx]), scores[idx])
                      for idx in range(len(scores))])
        optreg = regs[np.argmax(scores)]
        devaccuracy = np.max(scores)
        logging.info('Validation : best param found is reg = {0} '
                     'with score {1}'.format(optreg, devaccuracy))

        # Retrain with the selected regularization and evaluate on test.
        logging.info('Evaluating...')
        if self.usepytorch:
            clf = MLP(self.classifier_config, inputdim=self.featdim,
                      nclasses=self.nclasses, l2reg=optreg,
                      seed=self.seed, cudaEfficient=self.cudaEfficient)
            clf.fit(self.X['train'], self.y['train'],
                    validation_data=(self.X['valid'], self.y['valid']))
        else:
            clf = LogisticRegression(C=optreg, random_state=self.seed)
            clf.fit(self.X['train'], self.y['train'])

        testaccuracy = clf.score(self.X['test'], self.y['test'])
        testaccuracy = round(100 * testaccuracy, 2)
        return devaccuracy, testaccuracy
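
# Hedged usage sketch: SplitClassifier takes dicts keyed by split name; run()
# selects the regularization on 'valid' and reports accuracy on 'test'. The
# array names below are placeholders.
#
#   X = {'train': X_tr, 'valid': X_va, 'test': X_te}
#   y = {'train': y_tr, 'valid': y_va, 'test': y_te}
#   config = {'nclasses': 2, 'seed': 1111, 'usepytorch': False,
#             'classifier': {'nhid': 0}}
#   dev_acc, test_acc = SplitClassifier(X, y, config).run()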