In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [3]:
import torch
from datasets import load_dataset, concatenate_datasets, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split

In [4]:
dataset1 = load_dataset("ucirvine/sms_spam", split="train")
dataset2 = load_dataset("AbdulHadi806/mail_spam_ham_dataset", split="train")
dataset3 = load_dataset("Goodmotion/spam-mail", split="train")

README.md:   0%|          | 0.00/4.98k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/359k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5574 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/226 [00:00<?, ?B/s]

mail_data.csv:   0%|          | 0.00/483k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5613 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/121 [00:00<?, ?B/s]

dataset.csv:   0%|          | 0.00/241k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6018 [00:00<?, ? examples/s]

In [5]:
def get_text_column(example):
    for col in ["text", "sms", "Message"]:  # Check possible names
        if col in example:
            return col
    return None 

In [6]:
def get_label_column(example):
    for col in ["label", "Category"]:  # Check possible names
        if col in example:
            return col
    return None 

In [7]:
def clean_label(label):
    if isinstance(label, str):
        label = re.sub(r"[^a-zA-Z]", "", label)  # Remove numbers & special chars
        return 1 if label.lower() == "spam" else 0  # Convert to numeric labels
    return int(label)

In [8]:
def preprocess(example):
    # Standardize text column
    text_col = "text" if "text" in example else "sms" if "sms" in example else "Message"
    label_col = "label" if "label" in example else "Category"
    
    # Standardize label format
    label_mapping = {"ham": 0, "spam": 1}  # Convert text labels
    label = example[label_col]
    
    if isinstance(label, str):  
        label = label_mapping.get(label.lower(), 0)  # Convert to int64
    elif isinstance(label, int):  
        label = int(label)  # Ensure it's an integer
    
    return {"text": example[text_col], "label": label}

In [9]:
dataset1 = dataset1.map(preprocess)
dataset2 = dataset2.map(preprocess)
dataset3 = dataset3.map(preprocess)

Map:   0%|          | 0/5574 [00:00<?, ? examples/s]

Map:   0%|          | 0/5613 [00:00<?, ? examples/s]

Map:   0%|          | 0/6018 [00:00<?, ? examples/s]

In [10]:
from datasets import Value

In [11]:
dataset1 = dataset1.cast_column("label", Value("int64"))
dataset2 = dataset2.cast_column("label", Value("int64"))
dataset3 = dataset3.cast_column("label", Value("int64"))

Casting the dataset:   0%|          | 0/5574 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/5613 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6018 [00:00<?, ? examples/s]

In [12]:
dataset1 = dataset1.remove_columns([col for col in dataset1.column_names if col not in ["text", "label"]])
dataset2 = dataset2.remove_columns([col for col in dataset2.column_names if col not in ["text", "label"]])
dataset3 = dataset3.remove_columns([col for col in dataset3.column_names if col not in ["text", "label"]])

In [13]:
merged_dataset = concatenate_datasets([dataset1, dataset2, dataset3])

In [14]:
df = merged_dataset.to_pandas()

In [15]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)

In [16]:
train_data = Dataset.from_dict({"text": train_texts.tolist(), "label": train_labels.tolist()})
test_data = Dataset.from_dict({"text": test_texts.tolist(), "label": test_labels.tolist()})

In [17]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [18]:
def tokenize_function(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=512)

In [19]:
train_data = train_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)

Map:   0%|          | 0/13764 [00:00<?, ? examples/s]

Map:   0%|          | 0/3441 [00:00<?, ? examples/s]

In [20]:
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_data.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [21]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [22]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2).to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="binary")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

In [24]:
training_args = TrainingArguments(
    output_dir="./roberta_spam",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [26]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0883,0.046818,0.991282,0.971609,0.996764,0.984026
2,0.025,0.011146,0.997675,0.996757,0.994606,0.99568
3,0.0028,0.016367,0.998256,1.0,0.993528,0.996753




TrainOutput(global_step=2583, training_loss=0.02921043153227737, metrics={'train_runtime': 2493.5423, 'train_samples_per_second': 16.56, 'train_steps_per_second': 1.036, 'total_flos': 1.086438169792512e+16, 'train_loss': 0.02921043153227737, 'epoch': 3.0})

In [27]:
model.save_pretrained('fine-tuned-model')
tokenizer.save_pretrained('fine-tuned-model')

('fine-tuned-model/tokenizer_config.json',
 'fine-tuned-model/special_tokens_map.json',
 'fine-tuned-model/vocab.json',
 'fine-tuned-model/merges.txt',
 'fine-tuned-model/added_tokens.json')

In [39]:
model_name = '/kaggle/working/quantized-model'
model = RobertaForSequenceClassification.from_pretrained(model_name).to(device)
tokenizer = RobertaTokenizer.from_pretrained(model_name)

In [40]:
def predict(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    
    # Move input tensors to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits).item()
    
    return "Spam" if predicted_class == 1 else "Ham"

In [41]:
input_text = "Congratulations! You have won a free iPhone. Click here to claim your prize."
print(f"Prediction: {predict(input_text)}")  # Expected output: Spam

Prediction: Spam


In [37]:
quantized_model = model.to(dtype=torch.float16, device=device)

In [38]:
quantized_model.save_pretrained('quantized-model')
tokenizer.save_pretrained('quantized-model')

('quantized-model/tokenizer_config.json',
 'quantized-model/special_tokens_map.json',
 'quantized-model/vocab.json',
 'quantized-model/merges.txt',
 'quantized-model/added_tokens.json')