File size: 3,946 Bytes
cd5fcb4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 |
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
'''
SST - binary classification
'''
from __future__ import absolute_import, division, unicode_literals
import os
import io
import logging
import numpy as np
from senteval.tools.validation import SplitClassifier
class SSTEval(object):
def __init__(self, task_path, nclasses=2, seed=1111):
self.seed = seed
# binary of fine-grained
assert nclasses in [2, 5]
self.nclasses = nclasses
self.task_name = 'Binary' if self.nclasses == 2 else 'Fine-Grained'
logging.debug('***** Transfer task : SST %s classification *****\n\n', self.task_name)
train = self.loadFile(os.path.join(task_path, 'sentiment-train'))
dev = self.loadFile(os.path.join(task_path, 'sentiment-dev'))
test = self.loadFile(os.path.join(task_path, 'sentiment-test'))
self.sst_data = {'train': train, 'dev': dev, 'test': test}
def do_prepare(self, params, prepare):
samples = self.sst_data['train']['X'] + self.sst_data['dev']['X'] + \
self.sst_data['test']['X']
return prepare(params, samples)
def loadFile(self, fpath):
sst_data = {'X': [], 'y': []}
with io.open(fpath, 'r', encoding='utf-8') as f:
for line in f:
if self.nclasses == 2:
sample = line.strip().split('\t')
sst_data['y'].append(int(sample[1]))
sst_data['X'].append(sample[0].split())
elif self.nclasses == 5:
sample = line.strip().split(' ', 1)
sst_data['y'].append(int(sample[0]))
sst_data['X'].append(sample[1].split())
assert max(sst_data['y']) == self.nclasses - 1
return sst_data
def run(self, params, batcher):
sst_embed = {'train': {}, 'dev': {}, 'test': {}}
bsize = params.batch_size
for key in self.sst_data:
logging.info('Computing embedding for {0}'.format(key))
# Sort to reduce padding
sorted_data = sorted(zip(self.sst_data[key]['X'],
self.sst_data[key]['y']),
key=lambda z: (len(z[0]), z[1]))
self.sst_data[key]['X'], self.sst_data[key]['y'] = map(list, zip(*sorted_data))
sst_embed[key]['X'] = []
for ii in range(0, len(self.sst_data[key]['y']), bsize):
batch = self.sst_data[key]['X'][ii:ii + bsize]
embeddings = batcher(params, batch)
sst_embed[key]['X'].append(embeddings)
sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])
sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])
logging.info('Computed {0} embeddings'.format(key))
config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
'usepytorch': params.usepytorch,
'classifier': params.classifier}
clf = SplitClassifier(X={'train': sst_embed['train']['X'],
'valid': sst_embed['dev']['X'],
'test': sst_embed['test']['X']},
y={'train': sst_embed['train']['y'],
'valid': sst_embed['dev']['y'],
'test': sst_embed['test']['y']},
config=config_classifier)
devacc, testacc = clf.run()
logging.debug('\nDev acc : {0} Test acc : {1} for \
SST {2} classification\n'.format(devacc, testacc, self.task_name))
return {'devacc': devacc, 'acc': testacc,
'ndev': len(sst_embed['dev']['X']),
'ntest': len(sst_embed['test']['X'])}
|