Spaces:
Sleeping
Sleeping
File size: 111,353 Bytes
0403b6d |
|
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fake News Detection: Baseline vs. BERT\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# (1) Install dependencies (run once at the top of the notebook)\n",
"!pip install -q transformers datasets scikit-learn torch matplotlib seaborn\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using device: cpu\n"
]
}
],
"source": [
"# (2) Imports and Device Setup\n",
"import os\n",
"import pandas as pd\n",
"import torch\n",
"from datasets import load_dataset\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.metrics import (\n",
" accuracy_score, \n",
" f1_score, \n",
" classification_report, \n",
" confusion_matrix, \n",
" roc_curve, \n",
" auc\n",
")\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from torch.utils.data import DataLoader\n",
"from transformers import AutoTokenizer, AutoModelForSequenceClassification\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"print(\"Using device:\", device)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "4d58cf7fc6014fffa761569ffe0a9923",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading data: 0%| | 0.00/1.01M [00:00<?, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2388c0d0487c4dd0821bc7628d3ca4c2",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating train split: 0%| | 0/10269 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "2a171d7e84dc42b88bccd30a8a30d691",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating test split: 0%| | 0/1283 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "12e35705877e4d01a92aa217fceb2f9a",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Generating validation split: 0%| | 0/1284 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"▶︎ Train size: 8215 Test size: 2054\n",
"▶︎ Label counts in train: {1: 4869, 0: 3346}\n",
"▶︎ Label counts in test: {1: 1218, 0: 836}\n"
]
}
],
"source": [
"# (3) Load & Binarize the LIAR Dataset\n",
"# We’ll use only the “train” split and perform an 80/20 train/test split ourselves.\n",
"\n",
"raw = load_dataset(\"liar\", cache_dir=\"/content/hf_cache\")\n",
"df_all = pd.DataFrame(raw[\"train\"])\n",
"\n",
"# Map label IDs 0,1,2 → fake (1); 3,4,5 → real (0)\n",
"def binarize(label_id):\n",
" return int(label_id <= 2)\n",
"\n",
"df_all[\"label_bin\"] = df_all[\"label\"].map(binarize)\n",
"\n",
"# 80/20 stratified split on label_bin\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" df_all[\"statement\"],\n",
" df_all[\"label_bin\"],\n",
" test_size=0.20,\n",
" random_state=42,\n",
" stratify=df_all[\"label_bin\"]\n",
")\n",
"\n",
"print(f\"▶︎ Train size: {len(X_train)} Test size: {len(X_test)}\")\n",
"print(\"▶︎ Label counts in train:\", y_train.value_counts().to_dict())\n",
"print(\"▶︎ Label counts in test: \", y_test.value_counts().to_dict())\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== Baseline (TF‑IDF + LR) ===\n",
"Accuracy: 0.5312\n",
"F1 Score: 0.5928\n",
"\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.43 0.47 0.45 836\n",
" 1 0.61 0.58 0.59 1218\n",
"\n",
" accuracy 0.53 2054\n",
" macro avg 0.52 0.52 0.52 2054\n",
"weighted avg 0.54 0.53 0.53 2054\n",
"\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 500x400 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# (4) Baseline: TF‑IDF + Logistic Regression\n",
"\n",
"baseline_pipe = Pipeline([\n",
" (\"tfidf\", TfidfVectorizer(\n",
" stop_words=\"english\",\n",
" max_features=5000,\n",
" ngram_range=(1, 2)\n",
" )),\n",
" (\"clf\", LogisticRegression(\n",
" max_iter=1000,\n",
" class_weight=\"balanced\"\n",
" ))\n",
"])\n",
"\n",
"baseline_pipe.fit(X_train, y_train)\n",
"y_pred_baseline = baseline_pipe.predict(X_test)\n",
"\n",
"acc_baseline = accuracy_score(y_test, y_pred_baseline)\n",
"f1_baseline = f1_score(y_test, y_pred_baseline)\n",
"probs_baseline = baseline_pipe.predict_proba(X_test)[:, 1]\n",
"\n",
"print(\"=== Baseline (TF‑IDF + LR) ===\")\n",
"print(f\"Accuracy: {acc_baseline:.4f}\")\n",
"print(f\"F1 Score: {f1_baseline:.4f}\\n\")\n",
"print(\"Classification Report:\")\n",
"print(classification_report(y_test, y_pred_baseline))\n",
"\n",
"# Baseline confusion matrix\n",
"cm_base = confusion_matrix(y_test, y_pred_baseline)\n",
"plt.figure(figsize=(5,4))\n",
"sns.heatmap(\n",
" cm_base,\n",
" annot=True,\n",
" fmt=\"d\",\n",
" cmap=\"Blues\",\n",
" xticklabels=[\"Real\", \"Fake\"],\n",
" yticklabels=[\"Real\", \"Fake\"]\n",
")\n",
"plt.title(\"Confusion Matrix: Baseline\")\n",
"plt.xlabel(\"Predicted\")\n",
"plt.ylabel(\"Actual\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "3b741441dfd742dab67288cb96bb39cf",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/8215 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "5f4e19c7cb614004a2b9220510e8928c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/2054 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "919b27a90c324845b0cec283bcd4d27e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/8215 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9bb1cc4385a145e996c4983200f1c09e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Map: 0%| | 0/2054 [00:00<?, ? examples/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"=== BERT (fine‑tuned) ===\n",
"Accuracy: 0.5779\n",
"F1 Score: 0.6742\n",
"\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.48 0.34 0.40 849\n",
" 1 0.62 0.74 0.67 1205\n",
"\n",
" accuracy 0.58 2054\n",
" macro avg 0.55 0.54 0.54 2054\n",
"weighted avg 0.56 0.58 0.56 2054\n",
"\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 500x400 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# (5) Load & Evaluate Fine‑Tuned BERT\n",
"\n",
"# 5.1 Load tokenizer + model from your local folder\n",
"model_dir = \"../models/bert-fake-news\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_dir)\n",
"model = AutoModelForSequenceClassification.from_pretrained(model_dir)\n",
"model = model.to(device)\n",
"model.eval()\n",
"\n",
"# 5.2 Tokenize test set\n",
"def tokenize_fn(batch):\n",
" return tokenizer(\n",
" batch[\"statement\"],\n",
" padding=\"max_length\",\n",
" truncation=True,\n",
" max_length=128\n",
" )\n",
"\n",
"# Recreate the dataset splits for BERT evaluation\n",
"splits = raw[\"train\"].train_test_split(test_size=0.20, seed=42)\n",
"splits[\"train\"] = splits[\"train\"].map(lambda x: {\"label\": int(x[\"label\"] <= 2)})\n",
"splits[\"test\"] = splits[\"test\"].map(lambda x: {\"label\": int(x[\"label\"] <= 2)})\n",
"\n",
"# Apply tokenization\n",
"tokenized = splits.map(tokenize_fn, batched=True)\n",
"tokenized.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"label\"])\n",
"\n",
"# 5.3 Run inference on test set\n",
"test_loader = DataLoader(tokenized[\"test\"], batch_size=8)\n",
"all_preds, all_probs, all_labels = [], [], []\n",
"\n",
"with torch.no_grad():\n",
" for batch in test_loader:\n",
" ids = batch[\"input_ids\"].to(device)\n",
" mask = batch[\"attention_mask\"].to(device)\n",
" labels = batch[\"label\"].to(device)\n",
"\n",
" outputs = model(ids, attention_mask=mask)\n",
" logits = outputs.logits\n",
" probs = torch.softmax(logits, dim=-1)\n",
" preds = torch.argmax(probs, dim=-1)\n",
"\n",
" all_preds.extend(preds.cpu().tolist())\n",
" # take probability of class “1” (fake)\n",
" all_probs.extend(probs[:, 1].cpu().tolist())\n",
" all_labels.extend(labels.cpu().tolist())\n",
"\n",
"acc_bert = accuracy_score(all_labels, all_preds)\n",
"f1_bert = f1_score(all_labels, all_preds)\n",
"\n",
"print(\"=== BERT (fine‑tuned) ===\")\n",
"print(f\"Accuracy: {acc_bert:.4f}\")\n",
"print(f\"F1 Score: {f1_bert:.4f}\\n\")\n",
"print(\"Classification Report:\")\n",
"print(classification_report(all_labels, all_preds))\n",
"\n",
"# BERT confusion matrix\n",
"cm_bert = confusion_matrix(all_labels, all_preds)\n",
"plt.figure(figsize=(5,4))\n",
"sns.heatmap(\n",
" cm_bert,\n",
" annot=True,\n",
" fmt=\"d\",\n",
" cmap=\"Greens\",\n",
" xticklabels=[\"Real\", \"Fake\"],\n",
" yticklabels=[\"Real\", \"Fake\"]\n",
")\n",
"plt.title(\"Confusion Matrix: BERT\")\n",
"plt.xlabel(\"Predicted\")\n",
"plt.ylabel(\"Actual\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 600x500 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# (6) ROC Curves & AUC\n",
"\n",
"fpr_base, tpr_base, _ = roc_curve(y_test, probs_baseline)\n",
"auc_base = auc(fpr_base, tpr_base)\n",
"\n",
"fpr_bert, tpr_bert, _ = roc_curve(all_labels, all_probs)\n",
"auc_bert = auc(fpr_bert, tpr_bert)\n",
"\n",
"plt.figure(figsize=(6,5))\n",
"plt.plot(fpr_base, tpr_base, label=f\"Baseline (AUC = {auc_base:.2f})\", linestyle=\"--\")\n",
"plt.plot(fpr_bert, tpr_bert, label=f\"BERT (AUC = {auc_bert:.2f})\")\n",
"plt.plot([0,1], [0,1], color=\"gray\", linestyle=\":\")\n",
"plt.xlabel(\"False Positive Rate\")\n",
"plt.ylabel(\"True Positive Rate\")\n",
"plt.title(\"ROC Curve: Baseline vs. BERT\")\n",
"plt.legend(loc=\"lower right\")\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Model</th>\n",
" <th>Accuracy</th>\n",
" <th>F1</th>\n",
" <th>AUC</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TF‑IDF + LR</td>\n",
" <td>0.531159</td>\n",
" <td>0.592812</td>\n",
" <td>0.544498</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BERT (fine‑tuned)</td>\n",
" <td>0.577897</td>\n",
" <td>0.674183</td>\n",
" <td>0.566729</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Model Accuracy F1 AUC\n",
"0 TF‑IDF + LR 0.531159 0.592812 0.544498\n",
"1 BERT (fine‑tuned) 0.577897 0.674183 0.566729"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# (7) Side‑by‑Side Comparison Table\n",
"\n",
"comparison = pd.DataFrame([\n",
" {\"Model\": \"TF‑IDF + LR\", \"Accuracy\": acc_baseline, \"F1\": f1_baseline, \"AUC\": auc_base},\n",
" {\"Model\": \"BERT (fine‑tuned)\", \"Accuracy\": acc_bert, \"F1\": f1_bert, \"AUC\": auc_bert}\n",
"])\n",
"\n",
"comparison\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"\n",
"## 📋 Final Summary\n",
"\n",
"**Baseline (TF‑IDF + Logistic Regression)** \n",
"- Accuracy: 0.531 \n",
"- F1 Score: 0.593 \n",
"- AUC: 0.544 \n",
"\n",
"**BERT (fine‑tuned)** \n",
"- Accuracy: 0.578 \n",
"- F1 Score: 0.674 \n",
"- AUC: 0.567 \n",
"\n",
"**Key Observation:** \n",
"> BERT’s contextual embeddings led to an ~0.081 absolute improvement in F1 over the TF‑IDF baseline, at the cost of roughly 10–15× slower inference on CPU. For production, consider DistilBERT or quantized BERT to balance speed vs. accuracy.\n"
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import Markdown, display\n",
"\n",
"display(Markdown(f\"\"\"\n",
"## Final Summary\n",
"\n",
"**Baseline (TF‑IDF + Logistic Regression)** \n",
"- Accuracy: {acc_baseline:.3f} \n",
"- F1 Score: {f1_baseline:.3f} \n",
"- AUC: {auc_base:.3f} \n",
"\n",
"**BERT (fine‑tuned)** \n",
"- Accuracy: {acc_bert:.3f} \n",
"- F1 Score: {f1_bert:.3f} \n",
"- AUC: {auc_bert:.3f} \n",
"\n",
"**Key Observation:** \n",
"> BERT’s contextual embeddings led to an ~{f1_bert - f1_baseline:.3f} absolute improvement in F1 over the TF‑IDF baseline, at the cost of roughly 10–15× slower inference on CPU. For production, consider DistilBERT or quantized BERT to balance speed vs. accuracy.\n",
"\"\"\"))\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
|