|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
''' |
|
MRPC : Microsoft Research Paraphrase (detection) Corpus |
|
''' |
|
from __future__ import absolute_import, division, unicode_literals |
|
|
|
import os |
|
import logging |
|
import numpy as np |
|
import io |
|
|
|
from senteval.tools.validation import KFoldClassifier |
|
|
|
from sklearn.metrics import f1_score |
|
|
|
|
|
class MRPCEval(object):
    """SentEval transfer task for MRPC (Microsoft Research Paraphrase Corpus).

    Loads tab-separated sentence pairs with binary paraphrase labels,
    embeds both sentences of each pair with a user-supplied batcher, and
    trains/evaluates a k-fold classifier on the pairwise features
    [|u - v| ; u * v].
    """

    def __init__(self, task_path, seed=1111):
        logging.info('***** Transfer task : MRPC *****\n\n')
        self.seed = seed
        train = self.loadFile(os.path.join(task_path,
                              'msr_paraphrase_train.txt'))
        test = self.loadFile(os.path.join(task_path,
                             'msr_paraphrase_test.txt'))
        self.mrpc_data = {'train': train, 'test': test}

    def do_prepare(self, params, prepare):
        """Feed every sentence (both sides, both splits) to the caller's
        vocabulary/preparation hook."""
        samples = self.mrpc_data['train']['X_A'] + \
                  self.mrpc_data['train']['X_B'] + \
                  self.mrpc_data['test']['X_A'] + self.mrpc_data['test']['X_B']
        return prepare(params, samples)

    def loadFile(self, fpath):
        """Parse one MRPC TSV file into tokenized sentence pairs and labels.

        Expected columns: label \\t id1 \\t id2 \\t sentence1 \\t sentence2.
        The first line is a column header and is skipped.

        Returns a dict with keys 'X_A' and 'X_B' (lists of token lists)
        and 'y' (list of int labels).
        """
        mrpc_data = {'X_A': [], 'X_B': [], 'y': []}
        with io.open(fpath, 'r', encoding='utf-8') as f:
            # Skip the header row up front rather than reading it into the
            # data lists and slicing it off afterwards.
            next(f, None)
            for line in f:
                text = line.strip().split('\t')
                mrpc_data['X_A'].append(text[3].split())
                mrpc_data['X_B'].append(text[4].split())
                mrpc_data['y'].append(int(text[0]))
        return mrpc_data

    def run(self, params, batcher):
        """Embed all sentence pairs and evaluate a paraphrase classifier.

        Returns a dict with dev accuracy, test accuracy, test F1, and the
        train/test split sizes ('ndev'/'ntest').
        """
        mrpc_embed = {'train': {}, 'test': {}}

        for key in self.mrpc_data:
            logging.info('Computing embedding for {0}'.format(key))

            # Sort pairs by sentence lengths so each batch holds
            # similarly-sized sentences (less padding work in the batcher).
            text_data = {}
            sorted_corpus = sorted(zip(self.mrpc_data[key]['X_A'],
                                       self.mrpc_data[key]['X_B'],
                                       self.mrpc_data[key]['y']),
                                   key=lambda z: (len(z[0]), len(z[1]), z[2]))

            text_data['A'] = [x for (x, y, z) in sorted_corpus]
            text_data['B'] = [y for (x, y, z) in sorted_corpus]
            text_data['y'] = [z for (x, y, z) in sorted_corpus]

            for txt_type in ['A', 'B']:
                mrpc_embed[key][txt_type] = []
                for ii in range(0, len(text_data['y']), params.batch_size):
                    batch = text_data[txt_type][ii:ii + params.batch_size]
                    embeddings = batcher(params, batch)
                    mrpc_embed[key][txt_type].append(embeddings)
                mrpc_embed[key][txt_type] = np.vstack(mrpc_embed[key][txt_type])
            mrpc_embed[key]['y'] = np.array(text_data['y'])
            logging.info('Computed {0} embeddings'.format(key))

        # Pairwise features: elementwise |u - v| concatenated with u * v.
        trainA = mrpc_embed['train']['A']
        trainB = mrpc_embed['train']['B']
        trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
        trainY = mrpc_embed['train']['y']

        testA = mrpc_embed['test']['A']
        testB = mrpc_embed['test']['B']
        testF = np.c_[np.abs(testA - testB), testA * testB]
        testY = mrpc_embed['test']['y']

        config = {'nclasses': 2, 'seed': self.seed,
                  'usepytorch': params.usepytorch,
                  'classifier': params.classifier,
                  'nhid': params.nhid, 'kfold': params.kfold}
        clf = KFoldClassifier(train={'X': trainF, 'y': trainY},
                              test={'X': testF, 'y': testY}, config=config)

        devacc, testacc, yhat = clf.run()
        testf1 = round(100*f1_score(testY, yhat), 2)
        logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n'
                      .format(devacc, testacc, testf1))
        return {'devacc': devacc, 'acc': testacc, 'f1': testf1,
                'ndev': len(trainA), 'ntest': len(testA)}
|
|