"""Fine-tune a transformer classifier on the argument-effectiveness data.

Run as a script, this module reads ``data/raw_data/train.csv``, splits it into
train/validation/test partitions, trains a ``TransformersSequenceClassifier``
on the first two and evaluates it on the held-out partition.
"""
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import ClassLabel, Dataset, load_dataset
# NOTE(review): `load_metric` is unused in this script and, as far as I know,
# was removed from recent `datasets` releases -- confirm against the pinned
# version before deciding whether to drop it.
from datasets import load_metric
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from source.services.predicting_effective_arguments.train.model import TransformersSequenceClassifier


class CFG:
    """Static configuration for the training script.

    Only ``TARGET``, ``MODEL_CHECKPOINT``, ``MODEL_OUTPUT_DIR`` and
    ``random_seed`` are read by the script below; the remaining attributes are
    kept for other consumers of this class.
    """

    # Column names in the raw CSV.
    TARGET = 'discourse_effectiveness'
    TEXT = "discourse_text"

    # Pretrained checkpoint to start from and where the fine-tuned model goes.
    MODEL_CHECKPOINT = "distilbert-base-uncased"
    MODEL_OUTPUT_DIR = 'source/services/predicting_effective_arguments/model/hf_textclassification/predicting_effective_arguments_distilbert'

    # NOTE(review): `model_name` mentions DeBERTa while MODEL_CHECKPOINT is
    # DistilBERT -- confirm which one is intended.
    model_name = "debertav3base"
    learning_rate = 1.5e-5
    weight_decay = 0.02
    hidden_dropout_prob = 0.007
    attention_probs_dropout_prob = 0.007
    num_train_epochs = 10
    n_splits = 4
    batch_size = 12
    random_seed = 42
    save_steps = 100
    max_length = 512


def seed_everything(seed: int) -> None:
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Value used for every random number generator.
    """
    # Imported lazily so that importing this module does not require torch.
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible CUDA device, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick kernels by timing them, which can vary
    # between runs; it must be off for the deterministic flag to be meaningful.
    torch.backends.cudnn.benchmark = False


def prepare_input_text(df, sep_token):
    """Add an ``inputs`` column combining discourse type and discourse text.

    The column is ``"<type> <sep_token> <text>"`` with both parts lower-cased.

    Args:
        df: DataFrame with ``discourse_type`` and ``discourse_text`` columns.
            It is modified in place.
        sep_token: Separator string placed between the two parts.

    Returns:
        The same DataFrame object, now carrying the ``inputs`` column.
    """
    df['inputs'] = df.discourse_type.str.lower() + ' ' + sep_token + ' ' + df.discourse_text.str.lower()
    return df


def _to_labelled_dataset(df, target, label_feature):
    """Build a ``Dataset`` with ``inputs`` and an integer-encoded ``label``.

    Labels are encoded through the shared ``label_feature`` so that every
    split uses the same id for the same class, even when a split happens not
    to contain all classes.
    """
    frame = df[['inputs', target]].rename(columns={target: 'label'})
    frame = frame.assign(label=frame['label'].map(label_feature.str2int))
    return Dataset.from_pandas(frame).cast_column('label', label_feature)


def main():
    """Run the train / evaluate pipeline end to end."""
    config = CFG()
    seed_everything(config.random_seed)

    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_CHECKPOINT)
    # NOTE(review): only the first 100 rows are used -- looks like a smoke-test
    # limit; confirm before a real training run.
    data = pd.read_csv("data/raw_data/train.csv")[:100]

    # Sorted so that position i in `label_names` is class id i; this is the
    # same ordering `Dataset.class_encode_column` would have produced.
    label_names = sorted(str(name) for name in data[config.TARGET].dropna().unique())
    label_feature = ClassLabel(names=label_names)
    # score_df = pd.read_csv("data/raw_data/test.csv")

    # Optional exploratory plots, kept disabled:
    # data[config.TARGET].value_counts(ascending=True).plot.barh()
    # plt.title("Frequency of Classes")
    # plt.show()
    #
    # data['discourse_type'].value_counts(ascending=True).plot.barh()
    # plt.title("Frequency of discourse_type")
    # plt.show()
    #
    # data["Words Per text"] = data[config.TEXT].str.split().apply(len)
    # data.boxplot("Words Per text", by=config.TARGET, grid=False, showfliers=False, color="black")
    # plt.suptitle("")
    # plt.xlabel("")
    # plt.show()

    train_size = 0.7
    valid_size = 0.2
    test_size = 0.1

    # First split: separate out the training set.
    train_df, temp_df = train_test_split(data, test_size=1 - train_size, random_state=5600)
    # Second split: divide the remainder into validation and test sets.
    valid_df, test_df = train_test_split(
        temp_df,
        test_size=test_size / (test_size + valid_size),
        random_state=5600,
    )

    # Copy first: `prepare_input_text` writes a new column in place, and the
    # frames returned by the split are selections of `data`.
    train_df = prepare_input_text(train_df.copy(), sep_token=tokenizer.sep_token)
    valid_df = prepare_input_text(valid_df.copy(), sep_token=tokenizer.sep_token)
    test_df = prepare_input_text(test_df.copy(), sep_token=tokenizer.sep_token)

    train_dataset = _to_labelled_dataset(train_df, config.TARGET, label_feature)
    val_dataset = _to_labelled_dataset(valid_df, config.TARGET, label_feature)
    test_dataset = _to_labelled_dataset(test_df, config.TARGET, label_feature)

    id2label = dict(enumerate(label_names))
    label2id = {label: idx for idx, label in id2label.items()}

    seqClassifer = TransformersSequenceClassifier(
        model_output_dir=config.MODEL_OUTPUT_DIR,
        tokenizer=tokenizer,
        model_checkpoint=config.MODEL_CHECKPOINT,
        num_labels=len(label_names),
        id2label=id2label,
        label2id=label2id,
    )

    train_tok_dataset = seqClassifer.tokenize_dataset(dataset=train_dataset)
    val_tok_dataset = seqClassifer.tokenize_dataset(dataset=val_dataset)
    test_tok_dataset = seqClassifer.tokenize_dataset(dataset=test_dataset)

    # NOTE(review): epochs/batch_size are hard-coded here and differ from
    # CFG.num_train_epochs / CFG.batch_size -- confirm which should win.
    seqClassifer.train(train_dataset=train_tok_dataset, eval_dataset=val_tok_dataset, epochs=1, batch_size=16)

    y_test_pred = seqClassifer.predict_argmax_logit(test_tok_dataset)
    seqClassifer.plot_confusion_matrix(y_preds=y_test_pred, y_true=test_dataset['label'], label_names=label_names)

    y_pred = seqClassifer.predict_pipeline(model_checkpoint=config.MODEL_OUTPUT_DIR, test_list=test_df['inputs'].tolist())

    # hidden = train_tok_dataset.map(seqClassifer.extract_hidden_states,
    #                                batched=True,
    #                                fn_kwargs={'tokenizer': AutoTokenizer.from_pretrained(config.MODEL_OUTPUT_DIR),
    #                                           'model': AutoModelForSequenceClassification.from_pretrained(config.MODEL_OUTPUT_DIR)})


if __name__ == '__main__':
    main()