import pandas as pd
import matplotlib.pyplot as plt  # used by the commented-out EDA plots below
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification  # model class used by the commented-out hidden-state extraction
from source.services.predicting_effective_arguments.train.model import TransformersSequenceClassifier

class CFG:
    TARGET = 'discourse_effectiveness'
    TEXT = "discourse_text"
    MODEL_CHECKPOINT = "distilbert-base-uncased"
    MODEL_OUTPUT_DIR = 'source/services/predicting_effective_arguments/model/hf_textclassification/predicting_effective_arguments_distilbert'
    # Training hyperparameters. Note that model_name is a stale label (it does not
    # match MODEL_CHECKPOINT); only TARGET, MODEL_CHECKPOINT, MODEL_OUTPUT_DIR and
    # random_seed are consumed by the live code below.
    model_name = "debertav3base"
    learning_rate = 1.5e-5
    weight_decay = 0.02
    hidden_dropout_prob = 0.007
    attention_probs_dropout_prob = 0.007
    num_train_epochs = 10
    n_splits = 4
    batch_size = 12
    random_seed = 42
    save_steps = 100
    max_length = 512


def seed_everything(seed: int):
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False  # benchmark=True lets cuDNN pick non-deterministic kernels, defeating the seeding above


def prepare_input_text(df, sep_token):
    """Build the model input as '<discourse_type> <sep> <discourse_text>', lower-cased."""
    df = df.copy()  # avoid SettingWithCopyWarning on slices returned by train_test_split
    df['inputs'] = df.discourse_type.str.lower() + ' ' + sep_token + ' ' + df.discourse_text.str.lower()
    return df
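# Illustration only (made-up row; '[SEP]' assumes the DistilBERT tokenizer):
#   discourse_type='Claim', discourse_text='Cars are useful.'
#   -> inputs: 'claim [SEP] cars are useful.'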


if __name__ == '__main__':

    config = CFG()
    seed_everything(config.random_seed)  # seed python/numpy/torch for reproducibility
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_CHECKPOINT)

    data = pd.read_csv("data/raw_data/train.csv")[:100]  # 100-row slice for a quick smoke run
    #score_df = pd.read_csv("data/raw_data/test.csv")

    """
    data[TARGET].value_counts(ascending=True).plot.barh()
    plt.title("Frequency of Classes")
    plt.show()

    data['discourse_type'].value_counts(ascending=True).plot.barh()
    plt.title("Frequency of discourse_type")
    plt.show()

    data["Words Per text"] = data[TEXT].str.split().apply(len)
    data.boxplot("Words Per text", by=TARGET, grid=False, showfliers=False,
            color="black")
    plt.suptitle("")
    plt.xlabel("")
    plt.show()
    """

    train_size = 0.7
    valid_size = 0.2
    test_size = 0.1

    # First split: hold out 30% of the data for validation + test
    train_df, temp_df = train_test_split(data, test_size=1 - train_size, random_state=5600)

    # Second split: carve the 30% remainder into validation and test;
    # test_size = 0.1 / (0.1 + 0.2) = 1/3 of the remainder, i.e. 10% overall
    valid_df, test_df = train_test_split(temp_df, test_size=test_size / (test_size + valid_size), random_state=5600)
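    # Note: discourse_effectiveness is typically imbalanced, so passing
    # stratify=data[config.TARGET] (and stratify=temp_df[config.TARGET] in the
    # second split) would keep class proportions consistent across all three splits.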


    train_df = prepare_input_text(train_df, sep_token=tokenizer.sep_token)
    valid_df = prepare_input_text(valid_df, sep_token=tokenizer.sep_token)
    test_df = prepare_input_text(test_df, sep_token=tokenizer.sep_token)
    
    # preserve_index=False keeps the original DataFrame index from leaking in as an
    # '__index_level_0__' column. Each split is encoded independently, which assumes
    # every split contains all classes (otherwise their label ids would diverge).
    train_dataset = (Dataset.from_pandas(train_df[['inputs', config.TARGET]], preserve_index=False)
                     .rename_column(config.TARGET, 'label')
                     .class_encode_column("label"))
    val_dataset = (Dataset.from_pandas(valid_df[['inputs', config.TARGET]], preserve_index=False)
                   .rename_column(config.TARGET, 'label')
                   .class_encode_column("label"))
    test_dataset = (Dataset.from_pandas(test_df[['inputs', config.TARGET]], preserve_index=False)
                    .rename_column(config.TARGET, 'label')
                    .class_encode_column("label"))

    # class_encode_column sorts the label values, so take the names from the encoded
    # dataset rather than from DataFrame order to keep ids and names aligned.
    label_names = train_dataset.features['label'].names
    id2label = {i: label for i, label in enumerate(label_names)}
    label2id = {v: k for k, v in id2label.items()}
    seq_classifier = TransformersSequenceClassifier(model_output_dir=config.MODEL_OUTPUT_DIR,
                                                    tokenizer=tokenizer,
                                                    model_checkpoint=config.MODEL_CHECKPOINT,
                                                    num_labels=len(label_names),
                                                    id2label=id2label,
                                                    label2id=label2id)
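    # For example, if the encoded classes come out as ['Adequate', 'Effective',
    # 'Ineffective'] (alphabetical), id2label is {0: 'Adequate', 1: 'Effective',
    # 2: 'Ineffective'} and label2id is its inverse.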

    train_tok_dataset = seq_classifier.tokenize_dataset(dataset=train_dataset)
    val_tok_dataset = seq_classifier.tokenize_dataset(dataset=val_dataset)
    test_tok_dataset = seq_classifier.tokenize_dataset(dataset=test_dataset)

    # epochs/batch_size here override the fuller CFG settings (num_train_epochs=10,
    # batch_size=12), in keeping with the 100-row smoke run.
    seq_classifier.train(train_dataset=train_tok_dataset, eval_dataset=val_tok_dataset, epochs=1, batch_size=16)

    y_test_pred = seq_classifier.predict_argmax_logit(test_tok_dataset)
    seq_classifier.plot_confusion_matrix(y_preds=y_test_pred, y_true=test_dataset['label'], label_names=label_names)

    y_pred = seq_classifier.predict_pipeline(model_checkpoint=config.MODEL_OUTPUT_DIR, test_list=test_df['inputs'].tolist())
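    # predict_pipeline reloads the fine-tuned checkpoint saved under MODEL_OUTPUT_DIR,
    # so, barring pipeline post-processing, y_pred should agree with y_test_pred.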
    #hidden = train_tok_dataset.map(seq_classifier.extract_hidden_states,
    #                               batched=True,
    #                               fn_kwargs={'tokenizer': AutoTokenizer.from_pretrained(config.MODEL_OUTPUT_DIR),
    #                                          'model': AutoModelForSequenceClassification.from_pretrained(config.MODEL_OUTPUT_DIR)})