# Spaces: Runtime error  (page banner captured together with the source; not part of the program)
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_dataset
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import MinMaxScaler
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    pipeline,
)
from umap import UMAP
# Device used by TransformersSequenceClassifier for the model and its inputs:
# the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class TransformersSequenceClassifier:
    """Fine-tune, evaluate and inspect a sequence-classification model.

    The instance owns the tokenizer, the classification model (moved to the
    module-level ``device``) and, after :meth:`train`, the ``Trainer`` used
    for later predictions. Helpers that do not touch instance state are
    static methods, so they can be called on the class or on an instance.
    """

    def __init__(self,
                 model_output_dir,
                 num_labels,
                 tokenizer: "AutoTokenizer",
                 id2label,
                 label2id,
                 model_checkpoint="distilbert-base-uncased"):
        """Load the classification model and keep the tokenizer.

        Args:
            model_output_dir: Passed to ``TrainingArguments(output_dir=...)``
                in :meth:`train`.
            num_labels: Forwarded to ``from_pretrained``.
            tokenizer: Callable tokenizer; applied to the ``"inputs"`` column
                and handed to the ``Trainer``.
            id2label: Forwarded to ``from_pretrained``.
            label2id: Forwarded to ``from_pretrained``.
            model_checkpoint: Checkpoint name or path to load.
        """
        self.model_output_dir = model_output_dir
        self.tokenizer = tokenizer
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint,
            num_labels=num_labels,
            id2label=id2label,
            label2id=label2id,
        ).to(device)

    def tokenizer_batch(self, batch):
        """Tokenize ``batch["inputs"]`` with truncation and padding enabled.

        Returns whatever the tokenizer returns for
        ``truncation=True, padding=True, return_tensors="pt"``.
        """
        # NOTE: an explicit maximum length (386) was tried here and left disabled.
        return self.tokenizer(batch["inputs"], truncation=True, padding=True, return_tensors="pt")

    def tokenize_dataset(self, dataset):
        """Map :meth:`tokenizer_batch` over ``dataset`` in batches.

        The raw ``"inputs"`` column and the ``"__index_level_0__"`` column are
        removed from the result. When ``dataset.column_names`` is a flat
        list, only the columns that are actually present are requested for
        removal, so a dataset without ``"__index_level_0__"`` no longer fails.

        Returns:
            The result of ``dataset.map(...)``.
        """
        removable = ["inputs", "__index_level_0__"]
        column_names = getattr(dataset, "column_names", None)
        if isinstance(column_names, (list, tuple)):
            removable = [name for name in removable if name in column_names]
        return dataset.map(self.tokenizer_batch, batched=True, remove_columns=removable)

    @staticmethod
    def extract_hidden_states(batch, tokenizer, model):
        """Return the hidden state at sequence position 0 for every row of ``batch``.

        Args:
            batch: Mapping of column name to values; only keys listed in
                ``tokenizer.model_input_names`` are passed to the model.
            tokenizer: Object exposing ``model_input_names``.
            model: Callable whose output has a ``last_hidden_state`` attribute.

        Returns:
            ``{"hidden_state": <numpy array>}`` holding ``last_hidden_state[:, 0]``
            (described as the [CLS] token vector by the original author).
        """
        # Move tensors to the model's own device when the model reports one;
        # values without a ``.to`` method are passed through untouched.
        model_device = getattr(model, "device", None)
        inputs = {}
        for key, value in batch.items():
            if key not in tokenizer.model_input_names:
                continue
            if model_device is not None and hasattr(value, "to"):
                value = value.to(model_device)
            inputs[key] = value
        # Inference only: no gradients are needed.
        with torch.no_grad():
            last_hidden_state = model(**inputs).last_hidden_state
        return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

    @staticmethod
    def fit_umap(df_x):
        """Project ``df_x`` to two dimensions with UMAP (cosine metric).

        Features are first scaled to the [0, 1] range with ``MinMaxScaler``.

        Returns:
            The fitted mapper's ``embedding_``.
        """
        X_scaled = MinMaxScaler().fit_transform(df_x)
        mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)
        # NOTE: the raw embedding is returned; wrapping it in a DataFrame is
        # left to the caller.
        return mapper.embedding_

    def train(self, train_dataset, eval_dataset, batch_size, epochs):
        """Fine-tune ``self.model`` and push the result to the Hub.

        Creates ``self.trainer``, runs training with per-epoch evaluation and
        checkpointing, then calls ``push_to_hub``.

        Args:
            train_dataset: Dataset used for training; must support ``len()``.
            eval_dataset: Dataset evaluated at the end of every epoch.
            batch_size: Per-device batch size for training and evaluation.
            epochs: Number of training epochs.
        """
        # Log about once per epoch. Clamp to 1: the integer division is 0 when
        # the dataset is smaller than one batch, and a zero step interval is
        # not a valid logging interval.
        logging_steps = max(1, len(train_dataset) // batch_size)
        # NOTE(review): newer transformers releases rename ``evaluation_strategy``
        # to ``eval_strategy`` - confirm against the pinned version.
        training_args = TrainingArguments(
            output_dir=self.model_output_dir,
            num_train_epochs=epochs,
            learning_rate=2e-5,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            weight_decay=0.01,
            evaluation_strategy="epoch",
            save_strategy='epoch',
            disable_tqdm=False,
            logging_steps=logging_steps,
            push_to_hub=True,
            load_best_model_at_end=True,
            log_level="error",
        )
        # No explicit data collator is passed (a DataCollatorWithPadding with
        # padding='longest' was tried and left disabled).
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            compute_metrics=self._compute_metrics,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=self.tokenizer,
        )
        self.trainer.train()
        self.trainer.push_to_hub(commit_message="Training completed!")

    @staticmethod
    def _compute_metrics(pred):
        """Compute accuracy and weighted F1 from a prediction object.

        Static so that ``self._compute_metrics`` can be handed to the Trainer
        and called with the prediction object as its only argument.

        Args:
            pred: Object with ``label_ids`` and ``predictions`` attributes.

        Returns:
            ``{"accuracy": ..., "f1": ...}``.
        """
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        f1 = f1_score(labels, preds, average="weighted")
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "f1": f1}

    def forward_pass_with_label(self, batch):
        """Run the model on ``batch`` and return per-example loss and prediction.

        Args:
            batch: Mapping whose model-input entries and ``"label"`` entry
                support ``.to(device)``.

        Returns:
            ``{"loss": ..., "predicted_label": ...}`` as NumPy arrays on the CPU.
        """
        # Place all input tensors on the same device as the model.
        inputs = {
            key: value.to(device)
            for key, value in batch.items()
            if key in self.tokenizer.model_input_names
        }
        with torch.no_grad():
            output = self.model(**inputs)
            pred_label = torch.argmax(output.logits, axis=-1)
            # reduction="none" keeps one loss value per example.
            loss = F.cross_entropy(output.logits, batch["label"].to(device),
                                   reduction="none")
        # Place outputs on CPU for compatibility with other dataset columns.
        return {"loss": loss.cpu().numpy(),
                "predicted_label": pred_label.cpu().numpy()}

    def compute_loss_per_pred(self, valid_dataset):
        """Map :meth:`forward_pass_with_label` over ``valid_dataset`` in batches of 16."""
        return valid_dataset.map(self.forward_pass_with_label, batched=True, batch_size=16)

    @staticmethod
    def plot_confusion_matrix(y_preds, y_true, label_names):
        """Plot a confusion matrix normalized over the true labels and show it.

        Args:
            y_preds: Predicted labels.
            y_true: Ground-truth labels.
            label_names: Display labels for the matrix axes.
        """
        cm = confusion_matrix(y_true, y_preds, normalize="true")
        fig, ax = plt.subplots(figsize=(6, 6))
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
        disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
        plt.title("Normalized confusion matrix")
        plt.show()

    def predict_argmax_logit(self, valid_dataset):
        """Predict with ``self.trainer`` and return the arg-max class per row.

        Uses the trainer created by :meth:`train`, so that method must have
        run first. The prediction metrics are printed.
        """
        preds_output = self.trainer.predict(valid_dataset)
        print(preds_output.metrics)
        y_preds = np.argmax(preds_output.predictions, axis=1)
        return y_preds

    @staticmethod
    def predict_pipeline(model_checkpoint, test_list: List[str]) -> List:
        """Classify ``test_list`` with a ``text-classification`` pipeline.

        Args:
            model_checkpoint: Model identifier passed to ``pipeline``.
            test_list: Texts to classify.

        Returns:
            The pipeline output for ``return_all_scores=True``.
        """
        pipe_classifier = pipeline("text-classification", model=model_checkpoint)
        # NOTE(review): ``return_all_scores`` may be deprecated in newer
        # transformers releases - confirm against the pinned version.
        preds = pipe_classifier(test_list, return_all_scores=True)
        return preds