{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "12349750",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Label': ['ham', 'ham', 'ham'],\n",
       " 'Sentence': ['Are you up for the challenge? I know i am :)',\n",
       "  'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',\n",
       "  'Kallis is ready for bat in 2nd innings']}"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "from datasets import load_dataset\n",
    "\n",
    "# Location of the SMS spam CSV. Set the SMS_SPAM_CSV environment variable to\n",
    "# point somewhere else; the default is the original local path.\n",
    "data_files = os.environ.get(\"SMS_SPAM_CSV\", \"E:/Hugging_Face/SMS_Spam.csv\")\n",
    "\n",
    "# Load the whole CSV as one Dataset (requested via split = \"train\").\n",
    "spam_data = load_dataset(\"csv\", data_files = data_files, split = \"train\")\n",
    "\n",
    "# Hold out 20% of the rows as the test split. The fixed seed (same value as the\n",
    "# later train/validation split) makes the shuffle reproducible across re-runs.\n",
    "spam_data = spam_data.train_test_split(test_size = 0.2, seed = 42)\n",
    "\n",
    "# Preview the first three training rows.\n",
    "spam_data[\"train\"][:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "35f0392d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e6740059d6df4ea7aceaf262ef339c94",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/4459 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c2d8fd5629eb4e6aa0c91866c3ee2562",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/1115 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['Label', 'Sentence'],\n",
       "        num_rows: 4459\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['Label', 'Sentence'],\n",
       "        num_rows: 1115\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def lower_case(example):\n",
    "    \"\"\"Return a column update with the example's `Sentence` text lower-cased.\"\"\"\n",
    "    lowered = example[\"Sentence\"].lower()\n",
    "    return {\"Sentence\": lowered}\n",
    "\n",
    "\n",
    "# NOTE(review): the mapped DatasetDict is only displayed here, not assigned,\n",
    "# so `spam_data` itself is left unchanged by this cell.\n",
    "spam_data.map(lower_case)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "9df36294",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1d4f5f516b024a459dba03cb2b5e764b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/4459 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a64af11c2cde4ef1b56a49b4ffb6b200",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/1115 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "def sen_len(example):\n",
    "    \"\"\"Return a `length` entry: the number of whitespace-separated tokens in `Sentence`.\"\"\"\n",
    "    tokens = example[\"Sentence\"].split()\n",
    "    return {\"length\": len(tokens)}\n",
    "\n",
    "\n",
    "# Add the word count as a new \"length\" column on every split.\n",
    "spam_data = spam_data.map(sen_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "db1d8406",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Label': ['ham', 'ham', 'ham'],\n",
       " 'Sentence': ['Are you up for the challenge? I know i am :)',\n",
       "  'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',\n",
       "  'Kallis is ready for bat in 2nd innings'],\n",
       " 'length': [11, 29, 8]}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first three training rows, which now include the \"length\" column.\n",
    "spam_data[\"train\"][:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "3e742939",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename the target column from \"Label\" to \"labels\" on every split.\n",
    "# NOTE(review): presumably this cell cannot be re-run on the already-renamed\n",
    "# data, because the \"Label\" column no longer exists afterwards - confirm.\n",
    "spam_data = spam_data.rename_column(\"Label\", \"labels\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "a1d7c214",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ae1b7a15bd7e46e5aa763483d877ac5b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/4459 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e33b493b97754c588a4847d069773fc3",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/1115 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import html\n",
    "\n",
    "\n",
    "def unescape_html(batch):\n",
    "    \"\"\"Decode HTML entities (such as `&amp;`) in every sentence of a batch.\n",
    "\n",
    "    With `batched = True`, `batch[\"Sentence\"]` is a list of strings, so\n",
    "    `html.unescape` has to be applied to each element. Passing the list\n",
    "    itself hands it back untouched and leaves the entities in the text.\n",
    "\n",
    "    Returns a dict holding the replacement `Sentence` column for the batch.\n",
    "    \"\"\"\n",
    "    return {\"Sentence\": [html.unescape(sentence) for sentence in batch[\"Sentence\"]]}\n",
    "\n",
    "\n",
    "spam_data = spam_data.map(unescape_html, batched = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8fa3f455",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'labels': ['ham',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'spam',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'ham',\n",
       "  'spam',\n",
       "  'ham',\n",
       "  'ham'],\n",
       " 'Sentence': ['Are you up for the challenge? I know i am :)',\n",
       "  'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',\n",
       "  'Kallis is ready for bat in 2nd innings',\n",
       "  'Gud mrng dear hav a nice day',\n",
       "  'I not free today i haf 2 pick my parents up tonite...',\n",
       "  'Good afternoon on this glorious anniversary day, my sweet J !! I hope this finds you happy and content, my Prey. I think of you and send a teasing kiss from across the sea coaxing images of fond souveniers ... You Cougar-Pen',\n",
       "  'SERIOUSLY. TELL HER THOSE EXACT WORDS RIGHT NOW.',\n",
       "  'Haha awesome, I might need to take you up on that, what you doin tonight?',\n",
       "  'Ok...',\n",
       "  'I am sorry it hurt you.',\n",
       "  'Watching cartoon, listening music &amp; at eve had to go temple &amp; church.. What about u?',\n",
       "  'Sent me de webadres for geting salary slip',\n",
       "  'Double mins and txts 4 6months FREE Bluetooth on Orange. Available on Sony, Nokia Motorola phones. Call MobileUpd8 on 08000839402 or call2optout/N9DX',\n",
       "  \"I want snow. It's just freezing and windy.\",\n",
       "  ', im .. On the snowboarding trip. I was wondering if your planning to get everyone together befor we go..a meet and greet kind of affair? Cheers, ',\n",
       "  'Siva is in hostel aha:-.',\n",
       "  'CHEERS LOU! YEAH WAS A GOODNITE SHAME U NEVA CAME! C YA GAILxx',\n",
       "  'URGENT! Your Mobile number has been awarded with a Â£2000 prize GUARANTEED. Call 09061790126 from land line. Claim 3030. Valid 12hrs only 150ppm',\n",
       "  'Did u got that persons story',\n",
       "  'Amazing : If you rearrange these letters it gives the same meaning... Dormitory = Dirty room Astronomer = Moon starer The eyes = They see Election results = Lies lets recount Mother-in-law = Woman Hitler Eleven plus two =Twelve plus one Its Amazing... !:-)'],\n",
       " 'length': [11,\n",
       "  29,\n",
       "  8,\n",
       "  7,\n",
       "  12,\n",
       "  42,\n",
       "  8,\n",
       "  15,\n",
       "  1,\n",
       "  6,\n",
       "  16,\n",
       "  8,\n",
       "  22,\n",
       "  8,\n",
       "  27,\n",
       "  5,\n",
       "  13,\n",
       "  23,\n",
       "  6,\n",
       "  44]}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the first 20 training rows after the HTML-unescape step.\n",
    "spam_data[\"train\"][:20]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "b59be7ac",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "54748e89b52c45f5af2c1d96e6e6f91e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Casting the dataset:   0%|          | 0/4459 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "bae9e8f5c4c84a19aa01e4bb2d65080e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Casting the dataset:   0%|          | 0/1115 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'labels': ClassLabel(names=['ham', 'spam']), 'Sentence': Value('string'), 'length': Value('int64')}\n"
     ]
    }
   ],
   "source": [
    "from datasets import load_dataset, ClassLabel\n",
    "\n",
    "# Encode the string labels as integer class ids; the order of `names`\n",
    "# fixes the mapping: \"ham\" -> 0, \"spam\" -> 1.\n",
    "label_feature = ClassLabel(names=[\"ham\", \"spam\"])\n",
    "spam_data = spam_data.cast_column(\"labels\", label_feature)\n",
    "\n",
    "# Print the schema to confirm the new type of the \"labels\" column.\n",
    "print(spam_data[\"train\"].features)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "b8a087d1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'labels': [0, 0, 0],\n",
       " 'Sentence': ['Are you up for the challenge? I know i am :)',\n",
       "  'Feel Yourself That You Are Always Happy.. Slowly It Becomes Your Habit &amp; Finally It Becomes Part Of Your Life.. Follow It.. Happy Morning &amp; Have A Happy Day:)',\n",
       "  'Kallis is ready for bat in 2nd innings'],\n",
       " 'length': [11, 29, 8]}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview three training rows after the cast; \"labels\" now holds integer class ids.\n",
    "spam_data[\"train\"][:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "eae6b9a7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "58ddfaa8aa3545879d58d0a955b886e4",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/4459 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1ba22c48e19c4d53b37e54615139925e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Map:   0%|          | 0/1115 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'labels': 0,\n",
       " 'Sentence': 'Are you up for the challenge? I know i am :)',\n",
       " 'length': 11,\n",
       " 'input_ids': [101,\n",
       "  2372,\n",
       "  1128,\n",
       "  1146,\n",
       "  1111,\n",
       "  1103,\n",
       "  4506,\n",
       "  136,\n",
       "  146,\n",
       "  1221,\n",
       "  178,\n",
       "  1821,\n",
       "  131,\n",
       "  114,\n",
       "  102],\n",
       " 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
       " 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import AutoTokenizer, AutoModel\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"bert-base-cased\")\n",
    "\n",
    "\n",
    "def tokenize_function(example):\n",
    "    \"\"\"Tokenize the `Sentence` field with truncation enabled.\n",
    "\n",
    "    No padding is requested here, so the encoded sequences keep their own lengths.\n",
    "    \"\"\"\n",
    "    sentences = example[\"Sentence\"]\n",
    "    return tokenizer(sentences, truncation = True)\n",
    "\n",
    "\n",
    "# Batched mapping: the tokenizer receives the sentences of a whole batch per call.\n",
    "tokenized_dataset = spam_data.map(tokenize_function, batched = True)\n",
    "\n",
    "# Show one encoded training example.\n",
    "tokenized_dataset[\"train\"][0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "f04dabd4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 4459\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 1115\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the DatasetDict summary (splits, column names, row counts) after tokenization.\n",
    "tokenized_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "73f820b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the tokenized training data again: 80% stays as train, the rest is held out (fixed seed).\n",
    "spam_data_clean = tokenized_dataset[\"train\"].train_test_split(train_size = 0.8, seed = 42)\n",
    "\n",
    "# The held-out part comes back under the key \"test\"; move it to \"validation\".\n",
    "spam_data_clean[\"validation\"] = spam_data_clean.pop(\"test\")\n",
    "\n",
    "# Re-attach the original test split so all three splits live in one DatasetDict.\n",
    "spam_data_clean[\"test\"] = tokenized_dataset[\"test\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "70c743a6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 3567\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 892\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['labels', 'Sentence', 'length', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 1115\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the train / validation / test split summary.\n",
    "spam_data_clean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "58ce2ac8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1e8d1e81615c4bda9e8c9d38e102618e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/3567 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "22d34c3e8185484eb5f690b926cc561e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/892 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "08305f0b9791416fb2053582f7da8e44",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/1115 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Persist the tokenized train / validation / test splits to a local directory.\n",
    "# At this point the splits still contain the \"Sentence\" and \"length\" columns.\n",
    "spam_data_clean.save_to_disk(\"Spam-Ham-Classification\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "14052e09",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'labels': [0, 0, 0],\n",
       " 'Sentence': ['What your plan for pongal?',\n",
       "  \"alright, I'll make sure the car is back tonight\",\n",
       "  'Multiply the numbers independently and count decimal points then, for the division, push the decimal places like i showed you.'],\n",
       " 'length': [5, 9, 20],\n",
       " 'input_ids': [[101, 1327, 1240, 2197, 1111, 185, 4553, 1348, 136, 102],\n",
       "  [101,\n",
       "   15354,\n",
       "   117,\n",
       "   146,\n",
       "   112,\n",
       "   1325,\n",
       "   1294,\n",
       "   1612,\n",
       "   1103,\n",
       "   1610,\n",
       "   1110,\n",
       "   1171,\n",
       "   3568,\n",
       "   102],\n",
       "  [101,\n",
       "   18447,\n",
       "   1643,\n",
       "   1193,\n",
       "   1103,\n",
       "   2849,\n",
       "   8942,\n",
       "   1105,\n",
       "   5099,\n",
       "   1260,\n",
       "   27924,\n",
       "   1827,\n",
       "   1173,\n",
       "   117,\n",
       "   1111,\n",
       "   1103,\n",
       "   2417,\n",
       "   117,\n",
       "   4684,\n",
       "   1103,\n",
       "   1260,\n",
       "   27924,\n",
       "   2844,\n",
       "   1176,\n",
       "   178,\n",
       "   2799,\n",
       "   1128,\n",
       "   119,\n",
       "   102]],\n",
       " 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
       "  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
       "  [0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0,\n",
       "   0]],\n",
       " 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
       "  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
       "  [1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1,\n",
       "   1]]}"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first three validation rows, including the tokenizer outputs.\n",
    "spam_data_clean[\"validation\"][:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "0f97ef10",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "DatasetDict({\n",
       "    train: Dataset({\n",
       "        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 3567\n",
       "    })\n",
       "    validation: Dataset({\n",
       "        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 892\n",
       "    })\n",
       "    test: Dataset({\n",
       "        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],\n",
       "        num_rows: 1115\n",
       "    })\n",
       "})"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Drop the raw-text helper columns that the model does not consume.\n",
    "# remove_columns returns a new DatasetDict, so the result has to be assigned back;\n",
    "# filtering on the existing column names keeps this cell safe to re-run.\n",
    "columns_to_drop = [\n",
    "    name for name in (\"Sentence\", \"length\")\n",
    "    if name in spam_data_clean[\"train\"].column_names\n",
    "]\n",
    "spam_data_clean = spam_data_clean.remove_columns(columns_to_drop)\n",
    "\n",
    "# Display the resulting splits and their remaining columns.\n",
    "spam_data_clean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "06c933a6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the three splits in a plain dict keyed by split name.\n",
    "# NOTE(review): this rebinds `data_files`, which earlier held the CSV path string,\n",
    "# and no later cell visible in this notebook reads it - confirm whether it is needed.\n",
    "data_files = {\"train\": spam_data_clean[\"train\"], \"validation\": spam_data_clean[\"validation\"], \"test\": spam_data_clean[\"test\"]}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "3959be63",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoModelForSequenceClassification, TrainingArguments\n",
    "\n",
    "training_args = TrainingArguments(\"test-trainer\",\n",
    "                                 eval_strategy = \"epoch\",\n",
    "                                 # fp16 mixed precision needs a CUDA device; use full precision otherwise.\n",
    "                                 fp16 = torch.cuda.is_available(),\n",
    "                                 #gradient_accumulation_steps = 4,\n",
    "                                 #per_device_train_batch_size = 4,\n",
    "                                 learning_rate= 1e-5,\n",
    "                                 lr_scheduler_type = \"cosine\",)\n",
    "\n",
    "# Human-readable class names stored in the model config. The index order must match\n",
    "# ClassLabel(names=[\"ham\", \"spam\"]) used for the \"labels\" column (ham -> 0, spam -> 1).\n",
    "id2label = {0: \"ham\", 1: \"spam\"}\n",
    "label2id = {name: idx for idx, name in id2label.items()}\n",
    "\n",
    "# The classification head is newly initialised on top of the pretrained encoder.\n",
    "model = AutoModelForSequenceClassification.from_pretrained(\"bert-base-cased\",\n",
    "                                                           num_labels = 2,\n",
    "                                                           id2label = id2label,\n",
    "                                                           label2id = label2id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "bd40266e",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import DataCollatorWithPadding\n",
    "# Collator built from the same tokenizer; it is handed to the Trainer as `data_collator`.\n",
    "# The tokenize step only truncates (no padding argument), so padding is left to this collator.\n",
    "data_collator = DataCollatorWithPadding(tokenizer = tokenizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "3bbc3fd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import evaluate\n",
    "import numpy as np\n",
    "\n",
    "# One combined metric object, so a single compute() call reports all four scores.\n",
    "metric = evaluate.combine([\"accuracy\", \"f1\", \"precision\", \"recall\"])\n",
    "\n",
    "\n",
    "def compute_metrics(eval_preds):\n",
    "    \"\"\"Score one evaluation pass.\n",
    "\n",
    "    `eval_preds` unpacks into (logits, labels). The predicted class of each row\n",
    "    is the index of its largest logit along the last axis.\n",
    "\n",
    "    Returns whatever `metric.compute` produces for those predictions and references.\n",
    "    \"\"\"\n",
    "    raw_scores, gold = eval_preds\n",
    "    predicted = np.argmax(raw_scores, axis=-1)\n",
    "    return metric.compute(predictions=predicted, references=gold)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "e46ffe8e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='1338' max='1338' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [1338/1338 02:15, Epoch 3/3]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Accuracy</th>\n",
       "      <th>F1</th>\n",
       "      <th>Precision</th>\n",
       "      <th>Recall</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>No log</td>\n",
       "      <td>0.045297</td>\n",
       "      <td>0.989910</td>\n",
       "      <td>0.962963</td>\n",
       "      <td>0.983193</td>\n",
       "      <td>0.943548</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.095300</td>\n",
       "      <td>0.042776</td>\n",
       "      <td>0.993274</td>\n",
       "      <td>0.975207</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.951613</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.021200</td>\n",
       "      <td>0.040522</td>\n",
       "      <td>0.993274</td>\n",
       "      <td>0.975207</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.951613</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=1338, training_loss=0.04511010432991746, metrics={'train_runtime': 136.1512, 'train_samples_per_second': 78.596, 'train_steps_per_second': 9.827, 'total_flos': 338812011541800.0, 'train_loss': 0.04511010432991746, 'epoch': 3.0})"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from transformers import Trainer\n",
    "\n",
    "# Fine-tune on the train split and evaluate on the validation split.\n",
    "# The test split is not passed to the Trainer here.\n",
    "trainer = Trainer(model,\n",
    "                  training_args,\n",
    "                  train_dataset = spam_data_clean[\"train\"],\n",
    "                  eval_dataset = spam_data_clean[\"validation\"],\n",
    "                  data_collator = data_collator,\n",
    "                  processing_class = tokenizer,\n",
    "                 compute_metrics=compute_metrics,)\n",
    "\n",
    "# Run training; the returned TrainOutput is displayed as the cell result.\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "c236f093",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='112' max='112' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [112/112 00:04]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 0.04052222892642021,\n",
       " 'eval_accuracy': 0.9932735426008968,\n",
       " 'eval_f1': 0.9752066115702479,\n",
       " 'eval_precision': 1.0,\n",
       " 'eval_recall': 0.9516129032258065,\n",
       " 'eval_runtime': 5.1761,\n",
       " 'eval_samples_per_second': 172.33,\n",
       " 'eval_steps_per_second': 21.638,\n",
       " 'epoch': 3.0}"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Final check on the held-out test split. Calling evaluate() with no arguments\n",
    "# re-scores the validation split, which training already reported after each epoch.\n",
    "# The \"test\" prefix names the returned keys test_loss, test_accuracy, and so on.\n",
    "trainer.evaluate(eval_dataset = spam_data_clean[\"test\"], metric_key_prefix = \"test\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "1e6538eb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('spam-classifier\\\\tokenizer_config.json',\n",
       " 'spam-classifier\\\\special_tokens_map.json',\n",
       " 'spam-classifier\\\\vocab.txt',\n",
       " 'spam-classifier\\\\added_tokens.json',\n",
       " 'spam-classifier\\\\tokenizer.json')"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep the model and its tokenizer in ONE directory so that a single\n",
    "# from_pretrained(MODEL_DIR) path can restore both.\n",
    "# The previous model directory name, \"spam-ham-classification\", differs only in case from\n",
    "# the dataset directory \"Spam-Ham-Classification\" written by save_to_disk, so on a\n",
    "# case-insensitive filesystem the two would end up in the same folder.\n",
    "MODEL_DIR = \"spam-classifier\"\n",
    "\n",
    "trainer.save_model(MODEL_DIR)\n",
    "tokenizer.save_pretrained(MODEL_DIR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "99dbfb57",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}