{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "12349750", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Label': ['ham', 'ham', 'ham'],\n", " 'Sentence': ['Are you up for the challenge? I know i am :)',\n", " 'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit & Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning & Have A Happy Day:)',\n", " 'Kallis is ready for bat in 2nd innings']}" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datasets import load_dataset\n", "\n", "data_files =\"E:/Hugging_Face/SMS_Spam.csv\"\n", "spam_data = load_dataset(\"csv\", data_files = data_files, split = \"train\")\n", "# Seed the shuffle so the 80/20 train/test partition is the same on every fresh run.\n", "spam_data = spam_data.train_test_split(test_size = 0.2, seed = 42)\n", "spam_data[\"train\"][:3]" ] }, { "cell_type": "code", "execution_count": 2, "id": "35f0392d", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e6740059d6df4ea7aceaf262ef339c94", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/4459 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "c2d8fd5629eb4e6aa0c91866c3ee2562", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/1115 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['Label', 'Sentence'],\n", " num_rows: 4459\n", " })\n", " test: Dataset({\n", " features: ['Label', 'Sentence'],\n", " num_rows: 1115\n", " })\n", "})" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def lower_case(example):\n", "    \"\"\"Return a column update with the example's sentence lower-cased.\"\"\"\n", "    lowered = example[\"Sentence\"].lower()\n", "    return {\"Sentence\": lowered}\n", "\n", "# The mapped copy is only displayed here; `spam_data` itself is not reassigned.\n", "spam_data.map(lower_case)" ] }, { "cell_type": "code", "execution_count": 3, "id": "9df36294", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1d4f5f516b024a459dba03cb2b5e764b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/4459 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a64af11c2cde4ef1b56a49b4ffb6b200", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/1115 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def sen_len(example):\n", "    \"\"\"Return a `length` field: the number of whitespace-separated tokens in the sentence.\"\"\"\n", "    words = example[\"Sentence\"].split()\n", "    return {\"length\": len(words)}\n", "\n", "spam_data = spam_data.map(sen_len)" ] }, { "cell_type": "code", "execution_count": 4, "id": "db1d8406", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'Label': ['ham', 'ham', 'ham'],\n", " 'Sentence': ['Are you up for the challenge? I know i am :)',\n", " 'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit & Finally It Becomes Part Of Your Life.. Follow It.. 
Happy Morning & Have A Happy Day:)',\n", " 'Kallis is ready for bat in 2nd innings'],\n", " 'length': [11, 29, 8]}" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spam_data[\"train\"][:3]" ] }, { "cell_type": "code", "execution_count": 8, "id": "3e742939", "metadata": {}, "outputs": [], "source": [ "spam_data = spam_data.rename_column(\"Label\", \"labels\")" ] }, { "cell_type": "code", "execution_count": 9, "id": "a1d7c214", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ae1b7a15bd7e46e5aa763483d877ac5b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/4459 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e33b493b97754c588a4847d069773fc3", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/1115 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import html\n", "\n", "# With batched = True, x[\"Sentence\"] is a list of strings, so each sentence is unescaped individually;\n", "# html.unescape applied to the list itself would hand the list back unchanged.\n", "spam_data = spam_data.map(\n", "    lambda x: {\"Sentence\": [html.unescape(sentence) for sentence in x[\"Sentence\"]]},\n", "    batched = True,\n", ")" ] }, { "cell_type": "code", "execution_count": 10, "id": "8fa3f455", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'labels': ['ham',\n", " 'ham',\n", " 'ham',\n", " 'ham',\n", " 'ham',\n", " 'ham',\n", " 'ham',\n", " 'ham',\n", " 'ham',\n", " 'ham',\n", " 'ham',\n", " 'ham',\n", " 'spam',\n", " 'ham',\n", " 'ham',\n", " 'ham',\n", " 'ham',\n", " 'spam',\n", " 'ham',\n", " 'ham'],\n", " 'Sentence': ['Are you up for the challenge? I know i am :)',\n", " 'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit & Finally It Becomes Part Of Your Life.. Follow It.. 
Happy Morning & Have A Happy Day:)',\n", " 'Kallis is ready for bat in 2nd innings',\n", " 'Gud mrng dear hav a nice day',\n", " 'I not free today i haf 2 pick my parents up tonite...',\n", " 'Good afternoon on this glorious anniversary day, my sweet J !! I hope this finds you happy and content, my Prey. I think of you and send a teasing kiss from across the sea coaxing images of fond souveniers ... You Cougar-Pen',\n", " 'SERIOUSLY. TELL HER THOSE EXACT WORDS RIGHT NOW.',\n", " 'Haha awesome, I might need to take you up on that, what you doin tonight?',\n", " 'Ok...',\n", " 'I am sorry it hurt you.',\n", " 'Watching cartoon, listening music & at eve had to go temple & church.. What about u?',\n", " 'Sent me de webadres for geting salary slip',\n", " 'Double mins and txts 4 6months FREE Bluetooth on Orange. Available on Sony, Nokia Motorola phones. Call MobileUpd8 on 08000839402 or call2optout/N9DX',\n", " \"I want snow. It's just freezing and windy.\",\n", " ', im .. On the snowboarding trip. I was wondering if your planning to get everyone together befor we go..a meet and greet kind of affair? Cheers, ',\n", " 'Siva is in hostel aha:-.',\n", " 'CHEERS LOU! YEAH WAS A GOODNITE SHAME U NEVA CAME! C YA GAILxx',\n", " 'URGENT! Your Mobile number has been awarded with a £2000 prize GUARANTEED. Call 09061790126 from land line. Claim 3030. Valid 12hrs only 150ppm',\n", " 'Did u got that persons story',\n", " 'Amazing : If you rearrange these letters it gives the same meaning... Dormitory = Dirty room Astronomer = Moon starer The eyes = They see Election results = Lies lets recount Mother-in-law = Woman Hitler Eleven plus two =Twelve plus one Its Amazing... 
!:-)'],\n", " 'length': [11,\n", " 29,\n", " 8,\n", " 7,\n", " 12,\n", " 42,\n", " 8,\n", " 15,\n", " 1,\n", " 6,\n", " 16,\n", " 8,\n", " 22,\n", " 8,\n", " 27,\n", " 5,\n", " 13,\n", " 23,\n", " 6,\n", " 44]}" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spam_data[\"train\"][:20]" ] }, { "cell_type": "code", "execution_count": 13, "id": "b59be7ac", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "54748e89b52c45f5af2c1d96e6e6f91e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Casting the dataset: 0%| | 0/4459 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "bae9e8f5c4c84a19aa01e4bb2d65080e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Casting the dataset: 0%| | 0/1115 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "{'labels': ClassLabel(names=['ham', 'spam']), 'Sentence': Value('string'), 'length': Value('int64')}\n" ] } ], "source": [ "from datasets import load_dataset, ClassLabel\n", "\n", "# Encode the string labels as class ids following the order of `names`: ham -> 0, spam -> 1.\n", "label_feature = ClassLabel(names=[\"ham\", \"spam\"])\n", "spam_data = spam_data.cast_column(\"labels\", label_feature)\n", "\n", "print(spam_data[\"train\"].features)\n" ] }, { "cell_type": "code", "execution_count": 14, "id": "b8a087d1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'labels': [0, 0, 0],\n", " 'Sentence': ['Are you up for the challenge? I know i am :)',\n", " 'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit & Finally It Becomes Part Of Your Life.. Follow It.. 
Happy Morning & Have A Happy Day:)',\n", " 'Kallis is ready for bat in 2nd innings'],\n", " 'length': [11, 29, 8]}" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spam_data[\"train\"][:3]" ] }, { "cell_type": "code", "execution_count": 15, "id": "eae6b9a7", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "58ddfaa8aa3545879d58d0a955b886e4", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/4459 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1ba22c48e19c4d53b37e54615139925e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/1115 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "{'labels': 0,\n", " 'Sentence': 'Are you up for the challenge? I know i am :)',\n", " 'length': 11,\n", " 'input_ids': [101,\n", " 2372,\n", " 1128,\n", " 1146,\n", " 1111,\n", " 1103,\n", " 4506,\n", " 136,\n", " 146,\n", " 1221,\n", " 178,\n", " 1821,\n", " 131,\n", " 114,\n", " 102],\n", " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import AutoTokenizer, AutoModel\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n", "\n", "def tokenize_function(example):\n", "    \"\"\"Run the tokenizer over the `Sentence` column of a batch, with truncation enabled.\"\"\"\n", "    sentences = example[\"Sentence\"]\n", "    return tokenizer(sentences, truncation = True)\n", "\n", "# batched = True hands `tokenize_function` many rows per call instead of one.\n", "tokenized_dataset = spam_data.map(tokenize_function, batched = True)\n", "\n", "tokenized_dataset[\"train\"][0]" ] }, { "cell_type": "code", "execution_count": 16, "id": "f04dabd4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['labels', 'Sentence', 'length', 'input_ids', 
'token_type_ids', 'attention_mask'],\n", " num_rows: 4459\n", " })\n", " test: Dataset({\n", " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 1115\n", " })\n", "})" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenized_dataset" ] }, { "cell_type": "code", "execution_count": 17, "id": "73f820b8", "metadata": {}, "outputs": [], "source": [ "# Hold out 20% of the training rows (fixed seed) to serve as a validation set.\n", "spam_data_clean = tokenized_dataset[\"train\"].train_test_split(\n", "    train_size = 0.8,\n", "    seed = 42,\n", ")\n", "\n", "# The held-out part comes back under the \"test\" key; re-key it as \"validation\".\n", "validation_split = spam_data_clean.pop(\"test\")\n", "spam_data_clean[\"validation\"] = validation_split\n", "\n", "# Carry the original test split over unchanged.\n", "spam_data_clean[\"test\"] = tokenized_dataset[\"test\"]" ] }, { "cell_type": "code", "execution_count": 18, "id": "70c743a6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 3567\n", " })\n", " validation: Dataset({\n", " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 892\n", " })\n", " test: Dataset({\n", " features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 1115\n", " })\n", "})" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spam_data_clean" ] }, { "cell_type": "code", "execution_count": 19, "id": "58ce2ac8", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1e8d1e81615c4bda9e8c9d38e102618e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Saving the dataset (0/1 shards): 0%| | 0/3567 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "22d34c3e8185484eb5f690b926cc561e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Saving the dataset (0/1 shards): 0%| | 0/892 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "08305f0b9791416fb2053582f7da8e44", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Saving the dataset (0/1 shards): 0%| | 0/1115 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Persist the train / validation / test splits together in one local directory.\n", "dataset_dir = \"Spam-Ham-Classification\"\n", "spam_data_clean.save_to_disk(dataset_dir)" ] }, { "cell_type": "code", "execution_count": 20, "id": "14052e09", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'labels': [0, 0, 0],\n", " 'Sentence': ['What your plan for pongal?',\n", " \"alright, I'll make sure the car is back tonight\",\n", " 'Multiply the numbers independently and count decimal points then, for the division, push the decimal places like i showed you.'],\n", " 'length': [5, 9, 20],\n", " 'input_ids': [[101, 1327, 1240, 2197, 1111, 185, 4553, 1348, 136, 102],\n", " [101,\n", " 15354,\n", " 117,\n", " 146,\n", " 112,\n", " 1325,\n", " 1294,\n", " 1612,\n", " 1103,\n", " 1610,\n", " 1110,\n", " 1171,\n", " 3568,\n", " 102],\n", " [101,\n", " 18447,\n", " 1643,\n", " 1193,\n", " 1103,\n", " 2849,\n", " 8942,\n", " 1105,\n", " 5099,\n", " 1260,\n", " 27924,\n", " 1827,\n", " 1173,\n", " 117,\n", " 1111,\n", " 1103,\n", " 2417,\n", " 117,\n", " 4684,\n", " 1103,\n", " 1260,\n", " 27924,\n", " 2844,\n", " 1176,\n", " 178,\n", " 2799,\n", " 1128,\n", " 119,\n", " 102]],\n", " 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 
0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0,\n", " 0]],\n", " 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", " [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n", " [1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1,\n", " 1]]}" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "spam_data_clean[\"validation\"][:3]" ] }, { "cell_type": "code", "execution_count": 21, "id": "0f97ef10", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 3567\n", " })\n", " validation: Dataset({\n", " features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 892\n", " })\n", " test: Dataset({\n", " features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n", " num_rows: 1115\n", " })\n", "})" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# remove_columns returns a new DatasetDict rather than editing in place, so the result is reassigned.\n", "# Only columns that are still present are dropped, which keeps a re-run of this cell harmless.\n", "columns_to_drop = [name for name in (\"Sentence\", \"length\") if name in spam_data_clean[\"train\"].column_names]\n", "if columns_to_drop:\n", "    spam_data_clean = spam_data_clean.remove_columns(columns_to_drop)\n", "spam_data_clean" ] }, { "cell_type": "code", "execution_count": 22, "id": "06c933a6", "metadata": {}, "outputs": [], "source": [ "data_files = {\"train\": spam_data_clean[\"train\"], \"validation\": spam_data_clean[\"validation\"], \"test\": spam_data_clean[\"test\"]}" ] }, { "cell_type": "code", "execution_count": 35, "id": "3959be63", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and 
inference.\n" ] } ], "source": [ "import torch\n", "from transformers import AutoModelForSequenceClassification, TrainingArguments\n", "\n", "training_args = TrainingArguments(\n", "    \"test-trainer\",\n", "    eval_strategy = \"epoch\",\n", "    # fp16 mixed precision needs a CUDA device; fall back to full precision when none is available.\n", "    fp16 = torch.cuda.is_available(),\n", "    #gradient_accumulation_steps = 4,\n", "    #per_device_train_batch_size = 4,\n", "    learning_rate= 1e-5,\n", "    lr_scheduler_type = \"cosine\",\n", ")\n", "\n", "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-cased\", num_labels = 2)" ] }, { "cell_type": "code", "execution_count": 36, "id": "bd40266e", "metadata": {}, "outputs": [], "source": [ "from transformers import DataCollatorWithPadding\n", "data_collator = DataCollatorWithPadding(tokenizer = tokenizer)" ] }, { "cell_type": "code", "execution_count": 37, "id": "3bbc3fd2", "metadata": {}, "outputs": [], "source": [ "import evaluate, numpy as np\n", "metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\"])\n", "\n", "def compute_metrics(eval_preds):\n", "    \"\"\"Turn a (logits, labels) pair into accuracy / F1 / precision / recall.\n", "\n", "    The predicted class of each row is the argmax over the last axis of `logits`.\n", "    \"\"\"\n", "    logits, labels = eval_preds\n", "    preds = np.argmax(logits, axis=-1)\n", "    return metric.compute(predictions=preds, references=labels)" ] }, { "cell_type": "code", "execution_count": 38, "id": "e46ffe8e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "
Epoch | \n", "Training Loss | \n", "Validation Loss | \n", "Accuracy | \n", "F1 | \n", "Precision | \n", "Recall | \n", "
---|---|---|---|---|---|---|
1 | \n", "No log | \n", "0.045297 | \n", "0.989910 | \n", "0.962963 | \n", "0.983193 | \n", "0.943548 | \n", "
2 | \n", "0.095300 | \n", "0.042776 | \n", "0.993274 | \n", "0.975207 | \n", "1.000000 | \n", "0.951613 | \n", "
3 | \n", "0.021200 | \n", "0.040522 | \n", "0.993274 | \n", "0.975207 | \n", "1.000000 | \n", "0.951613 | \n", "
"
],
"text/plain": [
"