File size: 4,036 Bytes
cb09873
 
 
 
 
 
 
 
 
 
 
 
 
67d83f0
cb09873
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67d83f0
440014c
 
cb09873
67d83f0
 
 
 
 
f7abe49
67d83f0
 
f7abe49
67d83f0
cb09873
 
 
67d83f0
cb09873
67d83f0
 
 
f7abe49
67d83f0
 
 
 
 
f7abe49
 
 
67d83f0
cb09873
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f7abe49
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import AutoTokenizer
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
from source.services.predicting_effective_arguments.train.model import TransformersSequenceClassifier

class CFG:
    """Static configuration values for the argument-effectiveness training script."""

    # --- data columns -----------------------------------------------------
    # Name of the label column (renamed to 'label' when datasets are built).
    TARGET = "discourse_effectiveness"
    # Name of the raw text column.
    TEXT = "discourse_text"

    # --- model locations --------------------------------------------------
    # Checkpoint identifier handed to the tokenizer loader.
    MODEL_CHECKPOINT = "distilbert-base-uncased"
    # Directory passed to the classifier wrapper as its output location.
    MODEL_OUTPUT_DIR = "source/services/predicting_effective_arguments/model/hf_textclassification/predicting_effective_arguments_distilbert"

    # --- hyper-parameters -------------------------------------------------
    # NOTE(review): none of the values below are read by the script code
    # visible in this file; presumably consumed elsewhere -- confirm.
    model_name = "debertav3base"
    learning_rate = 1.5e-5
    weight_decay = 0.02
    hidden_dropout_prob = 0.007
    attention_probs_dropout_prob = 0.007
    num_train_epochs = 10
    n_splits = 4
    batch_size = 12
    random_seed = 42
    save_steps = 100
    max_length = 512


def seed_everything(seed: int) -> None:
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random``, the
            PyTorch CPU/CUDA generators and exported as ``PYTHONHASHSEED``.
    """
    # Imports are kept local so the function does not rely on module-level
    # imports of these packages.
    import os
    import random

    import numpy as np
    import torch

    random.seed(seed)
    # Only affects hash randomisation of interpreters started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every CUDA device, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN autotune and pick different kernels between
    # runs, which defeats the deterministic setting above; keep it off.
    torch.backends.cudnn.benchmark = False


def prepare_input_text(df, sep_token):
    """Add an ``inputs`` column of the form ``"<type> <sep_token> <text>"``.

    Both ``discourse_type`` and ``discourse_text`` are lower-cased before
    being joined. The column is written onto ``df`` itself, and that same
    frame is returned.
    """
    type_lower = df.discourse_type.str.lower()
    prefix = type_lower + ' ' + sep_token + ' '
    text_lower = df.discourse_text.str.lower()
    df['inputs'] = prefix + text_lower
    return df


if __name__ == '__main__':
    # Script entry point: load the raw training CSV, split it into
    # train/validation/test, fine-tune the sequence classifier, then plot a
    # validation confusion matrix and predict on the held-out split.

    config = CFG()

    # Apply the configured seed before any model/tokenizer work so runs are
    # repeatable (the seed was defined in CFG but never applied).
    seed_everything(config.random_seed)

    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_CHECKPOINT)
    # Use the same checkpoint for the model as for the tokenizer instead of a
    # duplicated string literal.
    seqClassifer = TransformersSequenceClassifier(
        model_output_dir=config.MODEL_OUTPUT_DIR,
        tokenizer=tokenizer,
        model_checkpoint=config.MODEL_CHECKPOINT,
        num_labels=3,
    )

    # NOTE: only the first 100 rows are used.
    data = pd.read_csv("data/raw_data/train.csv")[:100]
    # The held-out test set is carved out of `data` below; the previous read
    # of data/raw_data/test.csv was overwritten before use and is dropped.

    train_size = 0.7
    valid_size = 0.2
    test_size = 0.1

    # First split: Separate out the training set
    train_df, temp_df = train_test_split(data, test_size=1 - train_size, random_state=5600)

    # Second split: Separate out the validation and test sets
    valid_df, test_df = train_test_split(temp_df, test_size=test_size / (test_size + valid_size), random_state=5600)

    # prepare_input_text writes a new column onto the frame it receives, so
    # hand it an explicit copy of each split (presumably also avoids pandas
    # chained-assignment warnings on the split results).
    train_df = prepare_input_text(train_df.copy(), sep_token=tokenizer.sep_token)
    valid_df = prepare_input_text(valid_df.copy(), sep_token=tokenizer.sep_token)
    test_df = prepare_input_text(test_df.copy(), sep_token=tokenizer.sep_token)

    columns = ['inputs', config.TARGET]
    train_dataset = Dataset.from_pandas(train_df[columns]).rename_column(config.TARGET, 'label').class_encode_column("label")
    # Reuse the training split's label encoding for the other splits.
    # Encoding each split independently assigns ids from that split's own set
    # of classes, so a class missing from a small split would shift the ids.
    label_feature = train_dataset.features["label"]
    val_dataset = Dataset.from_pandas(valid_df[columns]).rename_column(config.TARGET, 'label').cast_column("label", label_feature)
    test_dataset = Dataset.from_pandas(test_df[columns]).rename_column(config.TARGET, 'label').cast_column("label", label_feature)
    labels = label_feature.names

    train_tok_dataset = seqClassifer.tokenize_dataset(dataset=train_dataset)
    val_tok_dataset = seqClassifer.tokenize_dataset(dataset=val_dataset)
    test_tok_dataset = seqClassifer.tokenize_dataset(dataset=test_dataset)

    # NOTE(review): epochs/batch_size are hard-coded here and differ from
    # CFG.num_train_epochs / CFG.batch_size -- confirm which is intended.
    seqClassifer.train(train_dataset=train_tok_dataset, eval_dataset=val_tok_dataset, epochs=1, batch_size=16)
    y_valid_pred = seqClassifer.predict_valid_data(val_tok_dataset)
    seqClassifer.plot_confusion_matrix(y_preds=y_valid_pred, y_true=val_dataset['label'], labels=labels)
    y_test_pred = seqClassifer.predict_test_data(model_checkpoint=config.MODEL_OUTPUT_DIR, test_list=test_df['inputs'].tolist())

    # Disabled exploratory plots, kept as an inert string literal. If
    # re-enabled, TARGET/TEXT need the `config.` prefix: in the visible source
    # they exist only as CFG attributes.
    """
    train_df[TARGET].value_counts(ascending=True).plot.barh()
    plt.title("Frequency of Classes")
    plt.show()

    train_df['discourse_type'].value_counts(ascending=True).plot.barh()
    plt.title("Frequency of discourse_type")
    plt.show()

    train_df["Words Per text"] = train_df[TEXT].str.split().apply(len)
    train_df.boxplot("Words Per text", by=TARGET, grid=False, showfliers=False,
            color="black")
    plt.suptitle("")
    plt.xlabel("")
    plt.show()
    """