EssAI

File size: 9,734 Bytes

ae7fe37

{
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python",
      "version": "3.10.13",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "kaggle": {
      "accelerator": "none",
      "dataSources": [],
      "dockerImageVersionId": 30746,
      "isInternetEnabled": false,
      "language": "python",
      "sourceType": "notebook",
      "isGpuEnabled": false
    }
  },
  "nbformat_minor": 0,
  "nbformat": 4,
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "# --- INSTALLATION ---\n",
        "\n",
        "!pip install pandas numpy matplotlib nltk scikit-learn transformers datasets torch\n",
        "!kaggle datasets download -d shanegerami/ai-vs-human-text\n",
        "!unzip -n ai-vs-human-text.zip\n",
        "!rm ai-vs-human-text.zip\n",
        "\n",
        "# -------------------------"
      ],
      "metadata": {
        "id": "XKWBDF8lir6o",
        "execution": {
          "iopub.status.busy": "2024-08-14T18:13:18.903225Z",
          "iopub.execute_input": "2024-08-14T18:13:18.903635Z",
          "iopub.status.idle": "2024-08-14T18:14:34.119173Z",
          "shell.execute_reply.started": "2024-08-14T18:13:18.903599Z",
          "shell.execute_reply": "2024-08-14T18:14:34.117649Z"
        },
        "trusted": true
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- IMPORTS ---\n",
        "\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import matplotlib.pyplot as plt\n",
        "import re\n",
        "import nltk\n",
        "from nltk.corpus import stopwords\n",
        "nltk.download('stopwords')\n",
        "stopwords = set(stopwords.words('english'))\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n",
        "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
        "from transformers import Trainer, TrainingArguments, DataCollatorWithPadding\n",
        "from datasets import Dataset\n",
        "import torch\n",
        "\n",
        "# -------------------------"
      ],
      "metadata": {
        "id": "q9TGKRUIiPMy"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- USEFUL FUNCTIONS ----\n",
        "\n",
        "def clean_text(text):\n",
        "    \"\"\"\n",
        "    This funtion get's rid of nonalphabetical characters, stopwords and lower cases the text.\n",
        "\n",
        "    Args:\n",
        "    text (str): The text to be cleaned\n",
        "\n",
        "    Returns:\n",
        "    text (str): The cleaned text\n",
        "\n",
        "    Example:\n",
        "    df['text'] = df['text'].apply(clean_text)\n",
        "    \"\"\"\n",
        "    text = re.sub(r'[^a-zA-Z]', ' ', text)\n",
        "    text = text.lower()\n",
        "    words = text.split()\n",
        "    text = [word for word in words if not word in stopwords]\n",
        "    text = ' '.join(words)\n",
        "    return text\n",
        "\n",
        "def tokenize_function(dataframe):\n",
        "    \"\"\"\n",
        "    This funtion tokenizes the 'text' field of the dataframe.\n",
        "\n",
        "    Args:\n",
        "    dataframe (pandas.DataFrame): The dataframe to be tokenized\n",
        "\n",
        "    Returns:\n",
        "    dataframe (pandas.DataFrame): The tokenized dataframe\n",
        "\n",
        "    Example and output:\n",
        "    train_dataset_token = train_dataset.map(tokenize_function, batched=True)\n",
        "    \"\"\"\n",
        "    return tokenizer(dataframe[\"text\"], truncation=True)\n",
        "\n",
        "def compute_metrics(eval_pred):\n",
        "    \"\"\"\n",
        "    This funtion computes the accuracy, precision, recall and f1 score of the model.\n",
        "\n",
        "    It'is passed to the trainer and it outputs when evaluating the model.\n",
        "\n",
        "    Args:\n",
        "    eval_pred (tuple): The predictions and labels of the model\n",
        "\n",
        "    Returns:\n",
        "    dict: The accuracy, precision, recall and f1 score of the model\n",
        "\n",
        "    Example:\n",
        "    >>> trainer.evaluate()\n",
        "    {\n",
        "        'accuracy': accuracy,\n",
        "        'precision': precision,\n",
        "        'recall': recall,\n",
        "        'f1': f1\n",
        "    }\n",
        "    \"\"\"\n",
        "    predictions, labels = eval_pred\n",
        "    predictions = predictions.argmax(axis=-1)\n",
        "    accuracy = accuracy_score(labels, predictions)\n",
        "    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')\n",
        "    return {\n",
        "        'accuracy': accuracy,\n",
        "        'precision': precision,\n",
        "        'recall': recall,\n",
        "        'f1': f1\n",
        "    }\n",
        "\n",
        "# -------------------------"
      ],
      "metadata": {
        "id": "JtYsc4hJAnk3"
      },
      "execution_count": 41,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- LOADING THE MODEL ---\n",
        "\n",
        "# Load the initial tokenizer and model to set the number of labels its going to classify as 2\n",
        "checkpoint = \"diegovelilla/EssAI\"\n",
        "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
        "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n",
        "\n",
        "# -------------------------"
      ],
      "metadata": {
        "id": "P87A1UTgJIia"
      },
      "execution_count": 42,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- DATA PREPROCESSING ---\n",
        "\n",
        "df = pd.read_csv('AI_Human.csv')\n",
        "\n",
        "# Separate human from ai\n",
        "df_human = df[df[\"generated\"] == 0]\n",
        "df_ai = df[df[\"generated\"] == 1]\n",
        "\n",
        "# We take as many human written esssays as AI generate since the dataset is a bit unbalanced\n",
        "df_ai_len = df_ai[\"text\"].count()\n",
        "df_human = df_human.sample(n=df_ai_len)\n",
        "\n",
        "# We concatenate both dataframes, shuffle them and then we take 1% of them since those will be enough to fine tune the model\n",
        "# and with my current resources I won't be able to process more. For better results increase the fraction of the data used.\n",
        "df_unshuffled = pd.concat([df_human, df_ai])\n",
        "df = df_unshuffled.sample(frac=0.01).reset_index(drop=True)\n",
        "\n",
        "# Get rid of nonalphatetical characters, stopwords and we lower case it.\n",
        "df['text'] = df['text'].apply(clean_text)\n",
        "\n",
        "# According to the transformers library of hugging face the targets column name should be labels and ints\n",
        "df = df.rename(columns={'generated': 'labels'})\n",
        "df['labels'] = df['labels'].astype(int)\n",
        "\n",
        "# We convert the pandas dataframe into a hugging face dataset and tokenize both of them\n",
        "ds = Dataset.from_pandas(df)\n",
        "ds_token = ds.map(tokenize_function, batched=True)\n",
        "\n",
        "# Drop columns that are not necessary and set the dataset format to pytorch tensors\n",
        "ds_token = ds_token.remove_columns([\"text\", \"token_type_ids\"])\n",
        "ds_token.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])\n",
        "\n",
        "# -------------------------\n"
      ],
      "metadata": {
        "id": "rYaHbUCDG7tf"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- INSTANTIATING TRAINER ----\n",
        "\n",
        "# We instantiate a DataCollatorWithPadding in order to pad the inputs in batches while training\n",
        "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
        "\n",
        "# Create the training arguments\n",
        "training_args = TrainingArguments(\"./results\")\n",
        "\n",
        "# Create the trainer\n",
        "trainer = Trainer(\n",
        "    model,\n",
        "    training_args,\n",
        "    eval_dataset=ds_token,\n",
        "    data_collator=data_collator,\n",
        "    tokenizer=tokenizer,\n",
        "    compute_metrics = compute_metrics\n",
        ")\n",
        "\n",
        "# -------------------------"
      ],
      "metadata": {
        "id": "Golh92ee33aA"
      },
      "execution_count": 50,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# --- EVALUATION ---\n",
        "\n",
        "evaluation_results = trainer.evaluate()\n",
        "\n",
        "print(\"Accuracy:\", evaluation_results['eval_accuracy'])\n",
        "print(\"Precision:\", evaluation_results['eval_precision'])\n",
        "print(\"Recall:\", evaluation_results['eval_recall'])\n",
        "print(\"F1:\", evaluation_results['eval_f1'])\n",
        "\n",
        "# -------------------------"
      ],
      "metadata": {
        "id": "WkQgrxgFPkpJ"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}