Spaces:

dev100rabh
/

FakeNews

Sleeping

File size: 33,407 Bytes

769dd6f

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "# Fake News Detection using BERT-BiLSTM-Attention\n",
    "\n",
    "This notebook is optimized for Google Colab free version with the following optimizations:\n",
    "- Reduced model size\n",
    "- Optimized memory usage\n",
    "- Efficient data loading\n",
    "- Gradient checkpointing\n",
    "- Mixed precision training\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "vscode": {
     "languageId": "plaintext"
    }
   },
   "outputs": [],
   "source": [
    "## 1. Setup and Installation\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install required packages\n",
    "!pip install torch==2.0.1 transformers==4.30.2 nltk==3.8.1 pandas==2.0.3 numpy==1.24.3 scikit-learn==1.3.0 tqdm==4.65.0\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import torch.optim as optim\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "from transformers import BertModel, BertTokenizer\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
    "import nltk\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "import re\n",
    "from tqdm import tqdm\n",
    "import gc\n",
    "\n",
    "# Download NLTK data\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')\n",
    "nltk.download('wordnet')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 2. Configuration and Constants\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optimized for Colab free version\n",
    "class Config:\n",
    "    # Model parameters\n",
    "    MAX_SEQUENCE_LENGTH = 128  # Reduced from 256\n",
    "    VOCAB_SIZE = 10000  # Reduced from 15000\n",
    "    EMBEDDING_DIM = 64  # Reduced from 128\n",
    "    HIDDEN_DIM = 128  # Reduced from 256\n",
    "    \n",
    "    # Training parameters\n",
    "    BATCH_SIZE = 4  # Reduced from 8\n",
    "    NUM_EPOCHS = 2  # Reduced from 3\n",
    "    LEARNING_RATE = 2e-5\n",
    "    \n",
    "    # Dataset parameters\n",
    "    MAX_SAMPLES = 5000  # Reduced from 10000\n",
    "    \n",
    "    # Device configuration\n",
    "    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "    \n",
    "    # Model paths\n",
    "    MODEL_NAME = 'bert-base-uncased'\n",
    "    \n",
    "    # Enable mixed precision\n",
    "    USE_AMP = True\n",
    "    \n",
    "    # Enable gradient checkpointing\n",
    "    USE_GRADIENT_CHECKPOINTING = True\n",
    "\n",
    "config = Config()\n",
    "print(f\"Using device: {config.DEVICE}\")\n",
    "print(f\"CUDA available: {torch.cuda.is_available()}\")\n",
    "if torch.cuda.is_available():\n",
    "    print(f\"GPU: {torch.cuda.get_device_name(0)}\")\n",
    "    print(f\"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 3. Data Loading and Preprocessing\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dataset Sources:\n",
    "# 1. Kaggle Fake and Real News Dataset: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset\n",
    "# 2. LIAR Dataset: https://sites.cs.ucsb.edu/~william/data/liar_dataset.zip\n",
    "\n",
    "import zipfile\n",
    "import urllib.request\n",
    "import os\n",
    "\n",
    "def download_datasets():\n",
    "    \"\"\"Download and prepare the datasets\"\"\"\n",
    "    \n",
    "    # Download LIAR dataset\n",
    "    print(\"Downloading LIAR dataset...\")\n",
    "    liar_url = \"https://sites.cs.ucsb.edu/~william/data/liar_dataset.zip\"\n",
    "    liar_zip = \"liar_dataset.zip\"\n",
    "    \n",
    "    try:\n",
    "        urllib.request.urlretrieve(liar_url, liar_zip)\n",
    "        \n",
    "        # Extract the zip file\n",
    "        with zipfile.ZipFile(liar_zip, 'r') as zip_ref:\n",
    "            zip_ref.extractall(\"liar_dataset/\")\n",
    "        \n",
    "        print(\"LIAR dataset downloaded and extracted successfully\")\n",
    "        os.remove(liar_zip)  # Clean up zip file\n",
    "        \n",
    "    except Exception as e:\n",
    "        print(f\"Error downloading LIAR dataset: {e}\")\n",
    "    \n",
    "    # For Kaggle dataset, we'll use a sample since direct download requires API key\n",
    "    print(\"Setting up Kaggle dataset alternative...\")\n",
    "    try:\n",
    "        # Try to download a sample of the Kaggle dataset\n",
    "        kaggle_url = \"https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv\"\n",
    "        urllib.request.urlretrieve(kaggle_url, \"kaggle_news_sample.csv\")\n",
    "        print(\"Kaggle sample dataset downloaded successfully\")\n",
    "    except Exception as e:\n",
    "        print(f\"Could not download Kaggle sample: {e}\")\n",
    "\n",
    "def load_liar_dataset(max_samples=None):\n",
    "    \"\"\"Load and process LIAR dataset\"\"\"\n",
    "    try:\n",
    "        # Load train, validation, and test sets\n",
    "        train_df = pd.read_csv(\"liar_dataset/train.tsv\", sep='\\t', header=None)\n",
    "        val_df = pd.read_csv(\"liar_dataset/valid.tsv\", sep='\\t', header=None)\n",
    "        test_df = pd.read_csv(\"liar_dataset/test.tsv\", sep='\\t', header=None)\n",
    "        \n",
    "        # Column names for LIAR dataset\n",
    "        columns = ['id', 'label', 'statement', 'subjects', 'speaker', 'speaker_job', \n",
    "                  'state_info', 'party_affiliation', 'barely_true_counts', 'false_counts',\n",
    "                  'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']\n",
    "        \n",
    "        train_df.columns = columns\n",
    "        val_df.columns = columns\n",
    "        test_df.columns = columns\n",
    "        \n",
    "        # Combine all datasets\n",
    "        df = pd.concat([train_df, val_df, test_df], ignore_index=True)\n",
    "        \n",
    "        # Convert labels to binary (fake/real)\n",
    "        # Consider 'false', 'barely-true', 'pants-fire' as fake (1)\n",
    "        # Consider 'true', 'mostly-true', 'half-true' as real (0)\n",
    "        fake_labels = ['false', 'barely-true', 'pants-fire']\n",
    "        df['binary_label'] = df['label'].apply(lambda x: 1 if x in fake_labels else 0)\n",
    "        \n",
    "        # Use statement as text\n",
    "        df = df[['statement', 'binary_label']].rename(columns={'statement': 'text', 'binary_label': 'label'})\n",
    "        \n",
    "        print(f\"LIAR dataset loaded: {len(df)} samples\")\n",
    "        return df\n",
    "        \n",
    "    except Exception as e:\n",
    "        print(f\"Error loading LIAR dataset: {e}\")\n",
    "        return None\n",
    "\n",
    "def load_kaggle_dataset(max_samples=None):\n",
    "    \"\"\"Load and process Kaggle dataset\"\"\"\n",
    "    try:\n",
    "        df = pd.read_csv(\"kaggle_news_sample.csv\")\n",
    "        \n",
    "        # Map labels to binary if needed\n",
    "        if 'label' in df.columns:\n",
    "            # Handle different label formats\n",
    "            if df['label'].dtype == 'object':\n",
    "                df['label'] = df['label'].map({'FAKE': 1, 'REAL': 0, 'fake': 1, 'real': 0})\n",
    "        \n",
    "        # Use appropriate text column\n",
    "        text_columns = ['text', 'title', 'content', 'article']\n",
    "        text_col = None\n",
    "        for col in text_columns:\n",
    "            if col in df.columns:\n",
    "                text_col = col\n",
    "                break\n",
    "        \n",
    "        if text_col:\n",
    "            df = df[[text_col, 'label']].rename(columns={text_col: 'text'})\n",
    "        \n",
    "        print(f\"Kaggle dataset loaded: {len(df)} samples\")\n",
    "        return df\n",
    "        \n",
    "    except Exception as e:\n",
    "        print(f\"Error loading Kaggle dataset: {e}\")\n",
    "        return None\n",
    "\n",
    "def load_combined_data(max_samples=config.MAX_SAMPLES):\n",
    "    \"\"\"Load and combine both datasets\"\"\"\n",
    "    \n",
    "    # Download datasets\n",
    "    download_datasets()\n",
    "    \n",
    "    # Load datasets\n",
    "    liar_df = load_liar_dataset()\n",
    "    kaggle_df = load_kaggle_dataset()\n",
    "    \n",
    "    # Combine datasets\n",
    "    dfs = []\n",
    "    if liar_df is not None:\n",
    "        dfs.append(liar_df)\n",
    "        print(f\"LIAR dataset: {len(liar_df)} samples\")\n",
    "    \n",
    "    if kaggle_df is not None:\n",
    "        dfs.append(kaggle_df)\n",
    "        print(f\"Kaggle dataset: {len(kaggle_df)} samples\")\n",
    "    \n",
    "    if dfs:\n",
    "        df = pd.concat(dfs, ignore_index=True)\n",
    "        print(f\"Combined dataset: {len(df)} samples\")\n",
    "    else:\n",
    "        # Fallback to dummy data\n",
    "        print(\"Creating dummy dataset for testing...\")\n",
    "        texts = [\n",
    "            \"President announces new economic policy to boost growth\",\n",
    "            \"Scientists confirm breakthrough in renewable energy technology\", \n",
    "            \"False: Celebrities endorse dangerous health treatment\",\n",
    "            \"Misleading: Government hiding alien contact information\",\n",
    "            \"Local community rallies to support flood victims\",\n",
    "            \"Breaking: Major scientific discovery changes understanding of physics\"\n",
    "        ] * (max_samples // 6)\n",
    "        \n",
    "        labels = [0, 0, 1, 1, 0, 0] * (max_samples // 6)\n",
    "        \n",
    "        df = pd.DataFrame({\n",
    "            'text': texts[:max_samples],\n",
    "            'label': labels[:max_samples]\n",
    "        })\n",
    "        print(f\"Created dummy dataset with {len(df)} samples\")\n",
    "    \n",
    "    # Remove missing values\n",
    "    df = df.dropna()\n",
    "    \n",
    "    # Sample data for faster training if needed\n",
    "    if max_samples and len(df) > max_samples:\n",
    "        df = df.sample(n=max_samples, random_state=42)\n",
    "        print(f\"Sampled to {len(df)} samples for faster training\")\n",
    "    \n",
    "    return df\n",
    "\n",
    "# Text preprocessing\n",
    "def preprocess_text(text):\n",
    "    if pd.isna(text):\n",
    "        return \"\"\n",
    "    text = str(text)\n",
    "    # Convert to lowercase\n",
    "    text = text.lower()\n",
    "    # Remove special characters but keep basic punctuation\n",
    "    text = re.sub(r'[^\\w\\s.,!?]', '', text)\n",
    "    # Remove extra whitespace\n",
    "    text = ' '.join(text.split())\n",
    "    # Limit length to prevent very long texts\n",
    "    text = text[:1000]  # Limit to 1000 characters\n",
    "    return text\n",
    "\n",
    "# Load the datasets\n",
    "print(\"Loading datasets...\")\n",
    "df = load_combined_data()\n",
    "print(f\"Final dataset shape: {df.shape}\")\n",
    "print(f\"Columns: {df.columns.tolist()}\")\n",
    "\n",
    "if len(df) > 0:\n",
    "    print(f\"Sample text: {df.iloc[0]['text'][:100]}...\")\n",
    "    print(f\"Label distribution:\")\n",
    "    print(df['label'].value_counts())\n",
    "    print(f\"Label distribution percentage:\")\n",
    "    print(df['label'].value_counts(normalize=True) * 100)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "### Optional: Download Kaggle Dataset Directly (If you have Kaggle API)\n",
    "\n",
    "If you have Kaggle API credentials, you can download the full dataset by running the following cells. Otherwise, the notebook will use alternative sources.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: Kaggle API setup (uncomment and run if you have Kaggle credentials)\n",
    "# !pip install kaggle\n",
    "# !mkdir -p ~/.kaggle\n",
    "# # Upload your kaggle.json file to Colab files, then run:\n",
    "# # !cp kaggle.json ~/.kaggle/\n",
    "# # !chmod 600 ~/.kaggle/kaggle.json\n",
    "\n",
    "# Download the full Kaggle dataset (uncomment if you have API access)\n",
    "# !kaggle datasets download -d clmentbisaillon/fake-and-real-news-dataset\n",
    "# !unzip fake-and-real-news-dataset.zip\n",
    "\n",
    "def load_full_kaggle_dataset():\n",
    "    \"\"\"Load the full Kaggle dataset if available\"\"\"\n",
    "    try:\n",
    "        # Try to load the full dataset files\n",
    "        fake_df = pd.read_csv(\"Fake.csv\")\n",
    "        real_df = pd.read_csv(\"True.csv\")\n",
    "        \n",
    "        # Add labels\n",
    "        fake_df['label'] = 1\n",
    "        real_df['label'] = 0\n",
    "        \n",
    "        # Combine datasets\n",
    "        df = pd.concat([fake_df, real_df], ignore_index=True)\n",
    "        \n",
    "        # Use title + text as the full text\n",
    "        if 'title' in df.columns and 'text' in df.columns:\n",
    "            df['full_text'] = df['title'] + \". \" + df['text']\n",
    "            df = df[['full_text', 'label']].rename(columns={'full_text': 'text'})\n",
    "        elif 'text' in df.columns:\n",
    "            df = df[['text', 'label']]\n",
    "        \n",
    "        print(f\"Full Kaggle dataset loaded: {len(df)} samples\")\n",
    "        return df\n",
    "        \n",
    "    except Exception as e:\n",
    "        print(f\"Full Kaggle dataset not available: {e}\")\n",
    "        return None\n",
    "\n",
    "# Try to load full Kaggle dataset\n",
    "full_kaggle_df = load_full_kaggle_dataset()\n",
    "if full_kaggle_df is not None:\n",
    "    print(\"Using full Kaggle dataset\")\n",
    "    # Update the df variable to use full dataset\n",
    "    df = load_combined_data()  # This will still use the combined approach if full isn't available\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create dataset class\n",
    "class FakeNewsDataset(Dataset):\n",
    "    def __init__(self, texts, labels, tokenizer, max_length):\n",
    "        self.texts = texts\n",
    "        self.labels = labels\n",
    "        self.tokenizer = tokenizer\n",
    "        self.max_length = max_length\n",
    "    \n",
    "    def __len__(self):\n",
    "        return len(self.texts)\n",
    "    \n",
    "    def __getitem__(self, idx):\n",
    "        text = str(self.texts[idx])\n",
    "        label = self.labels[idx]\n",
    "        \n",
    "        # Preprocess text\n",
    "        text = preprocess_text(text)\n",
    "        \n",
    "        encoding = self.tokenizer.encode_plus(\n",
    "            text,\n",
    "            add_special_tokens=True,\n",
    "            max_length=self.max_length,\n",
    "            padding='max_length',\n",
    "            truncation=True,\n",
    "            return_attention_mask=True,\n",
    "            return_tensors='pt'\n",
    "        )\n",
    "        \n",
    "        return {\n",
    "            'input_ids': encoding['input_ids'].flatten(),\n",
    "            'attention_mask': encoding['attention_mask'].flatten(),\n",
    "            'label': torch.tensor(label, dtype=torch.long)\n",
    "        }\n",
    "\n",
    "print(\"Dataset class created successfully\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 4. Model Architecture\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class FakeNewsModel(nn.Module):\n",
    "    def __init__(self, config):\n",
    "        super(FakeNewsModel, self).__init__()\n",
    "        \n",
    "        # BERT layer\n",
    "        self.bert = BertModel.from_pretrained(config.MODEL_NAME)\n",
    "        if config.USE_GRADIENT_CHECKPOINTING:\n",
    "            self.bert.gradient_checkpointing_enable()\n",
    "        \n",
    "        # BiLSTM layer\n",
    "        self.lstm = nn.LSTM(\n",
    "            input_size=768,  # BERT output size\n",
    "            hidden_size=config.HIDDEN_DIM,\n",
    "            num_layers=1,\n",
    "            batch_first=True,\n",
    "            bidirectional=True,\n",
    "            dropout=0.1\n",
    "        )\n",
    "        \n",
    "        # Attention layer\n",
    "        self.attention = nn.Sequential(\n",
    "            nn.Linear(config.HIDDEN_DIM * 2, config.HIDDEN_DIM),\n",
    "            nn.Tanh(),\n",
    "            nn.Linear(config.HIDDEN_DIM, 1)\n",
    "        )\n",
    "        \n",
    "        # Classification head\n",
    "        self.classifier = nn.Sequential(\n",
    "            nn.Dropout(0.3),\n",
    "            nn.Linear(config.HIDDEN_DIM * 2, 64),\n",
    "            nn.ReLU(),\n",
    "            nn.Dropout(0.2),\n",
    "            nn.Linear(64, 2)\n",
    "        )\n",
    "    \n",
    "    def forward(self, input_ids, attention_mask):\n",
    "        # BERT\n",
    "        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)[0]\n",
    "        \n",
    "        # BiLSTM\n",
    "        lstm_output, _ = self.lstm(bert_output)\n",
    "        \n",
    "        # Attention mechanism\n",
    "        attention_scores = self.attention(lstm_output)\n",
    "        attention_weights = torch.softmax(attention_scores, dim=1)\n",
    "        attended_output = torch.sum(attention_weights * lstm_output, dim=1)\n",
    "        \n",
    "        # Classification\n",
    "        logits = self.classifier(attended_output)\n",
    "        \n",
    "        return logits\n",
    "\n",
    "print(\"Model architecture defined successfully\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 5. Training Functions\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_epoch(model, train_loader, optimizer, criterion, scaler, config):\n",
    "    model.train()\n",
    "    total_loss = 0\n",
    "    \n",
    "    progress_bar = tqdm(train_loader, desc='Training')\n",
    "    for batch in progress_bar:\n",
    "        input_ids = batch['input_ids'].to(config.DEVICE)\n",
    "        attention_mask = batch['attention_mask'].to(config.DEVICE)\n",
    "        labels = batch['label'].to(config.DEVICE)\n",
    "        \n",
    "        optimizer.zero_grad()\n",
    "        \n",
    "        if config.USE_AMP and torch.cuda.is_available():\n",
    "            with torch.cuda.amp.autocast():\n",
    "                outputs = model(input_ids, attention_mask)\n",
    "                loss = criterion(outputs, labels)\n",
    "            \n",
    "            scaler.scale(loss).backward()\n",
    "            scaler.step(optimizer)\n",
    "            scaler.update()\n",
    "        else:\n",
    "            outputs = model(input_ids, attention_mask)\n",
    "            loss = criterion(outputs, labels)\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "        \n",
    "        total_loss += loss.item()\n",
    "        progress_bar.set_postfix({'loss': loss.item()})\n",
    "        \n",
    "        # Clear memory\n",
    "        del input_ids, attention_mask, labels, outputs, loss\n",
    "        if torch.cuda.is_available():\n",
    "            torch.cuda.empty_cache()\n",
    "    \n",
    "    return total_loss / len(train_loader)\n",
    "\n",
    "def evaluate(model, val_loader, criterion, config):\n",
    "    model.eval()\n",
    "    total_loss = 0\n",
    "    all_preds = []\n",
    "    all_labels = []\n",
    "    \n",
    "    with torch.no_grad():\n",
    "        progress_bar = tqdm(val_loader, desc='Evaluating')\n",
    "        for batch in progress_bar:\n",
    "            input_ids = batch['input_ids'].to(config.DEVICE)\n",
    "            attention_mask = batch['attention_mask'].to(config.DEVICE)\n",
    "            labels = batch['label'].to(config.DEVICE)\n",
    "            \n",
    "            outputs = model(input_ids, attention_mask)\n",
    "            loss = criterion(outputs, labels)\n",
    "            \n",
    "            total_loss += loss.item()\n",
    "            \n",
    "            preds = torch.argmax(outputs, dim=1)\n",
    "            all_preds.extend(preds.cpu().numpy())\n",
    "            all_labels.extend(labels.cpu().numpy())\n",
    "            \n",
    "            # Clear memory\n",
    "            del input_ids, attention_mask, labels, outputs, loss, preds\n",
    "            if torch.cuda.is_available():\n",
    "                torch.cuda.empty_cache()\n",
    "    \n",
    "    metrics = {\n",
    "        'loss': total_loss / len(val_loader),\n",
    "        'accuracy': accuracy_score(all_labels, all_preds),\n",
    "        'precision': precision_score(all_labels, all_preds, average='weighted'),\n",
    "        'recall': recall_score(all_labels, all_preds, average='weighted'),\n",
    "        'f1': f1_score(all_labels, all_preds, average='weighted')\n",
    "    }\n",
    "    \n",
    "    return metrics\n",
    "\n",
    "print(\"Training functions defined successfully\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 6. Main Training Process\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setup training\n",
    "def setup_training(df, config):\n",
    "    # Ensure we have valid data\n",
    "    if df is None or len(df) == 0:\n",
    "        raise ValueError(\"No valid dataset available\")\n",
    "    \n",
    "    print(f\"Dataset info:\")\n",
    "    print(f\"- Total samples: {len(df)}\")\n",
    "    print(f\"- Label distribution: {df['label'].value_counts().to_dict()}\")\n",
    "    \n",
    "    # Preprocess data\n",
    "    print(\"Preprocessing text data...\")\n",
    "    texts = df['text'].apply(preprocess_text).values\n",
    "    labels = df['label'].values\n",
    "    \n",
    "    # Remove empty texts\n",
    "    valid_indices = [i for i, text in enumerate(texts) if len(text.strip()) > 0]\n",
    "    texts = texts[valid_indices]\n",
    "    labels = labels[valid_indices]\n",
    "    \n",
    "    print(f\"After preprocessing: {len(texts)} valid samples\")\n",
    "    \n",
    "    # Split data\n",
    "    train_texts, val_texts, train_labels, val_labels = train_test_split(\n",
    "        texts, labels, test_size=0.2, random_state=42, stratify=labels\n",
    "    )\n",
    "    \n",
    "    print(f\"Data split:\")\n",
    "    print(f\"- Train samples: {len(train_texts)}\")\n",
    "    print(f\"- Validation samples: {len(val_texts)}\")\n",
    "    print(f\"- Train label distribution: {pd.Series(train_labels).value_counts().to_dict()}\")\n",
    "    print(f\"- Val label distribution: {pd.Series(val_labels).value_counts().to_dict()}\")\n",
    "    \n",
    "    # Initialize tokenizer\n",
    "    print(\"Initializing BERT tokenizer...\")\n",
    "    tokenizer = BertTokenizer.from_pretrained(config.MODEL_NAME)\n",
    "    \n",
    "    # Create datasets\n",
    "    print(\"Creating datasets...\")\n",
    "    train_dataset = FakeNewsDataset(train_texts, train_labels, tokenizer, config.MAX_SEQUENCE_LENGTH)\n",
    "    val_dataset = FakeNewsDataset(val_texts, val_labels, tokenizer, config.MAX_SEQUENCE_LENGTH)\n",
    "    \n",
    "    # Create dataloaders\n",
    "    train_loader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)\n",
    "    val_loader = DataLoader(val_dataset, batch_size=config.BATCH_SIZE)\n",
    "    \n",
    "    print(f\"DataLoaders created:\")\n",
    "    print(f\"- Train batches: {len(train_loader)}\")\n",
    "    print(f\"- Val batches: {len(val_loader)}\")\n",
    "    \n",
    "    # Initialize model\n",
    "    print(\"Initializing model...\")\n",
    "    model = FakeNewsModel(config).to(config.DEVICE)\n",
    "    \n",
    "    # Count parameters\n",
    "    total_params = sum(p.numel() for p in model.parameters())\n",
    "    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)\n",
    "    print(f\"Model parameters:\")\n",
    "    print(f\"- Total parameters: {total_params:,}\")\n",
    "    print(f\"- Trainable parameters: {trainable_params:,}\")\n",
    "    print(f\"- Model size (MB): {total_params * 4 / 1024 / 1024:.2f}\")\n",
    "    \n",
    "    # Initialize optimizer\n",
    "    optimizer = optim.AdamW(model.parameters(), lr=config.LEARNING_RATE, weight_decay=0.01)\n",
    "    \n",
    "    # Initialize loss function\n",
    "    criterion = nn.CrossEntropyLoss()\n",
    "    \n",
    "    # Initialize scaler for mixed precision\n",
    "    scaler = torch.cuda.amp.GradScaler() if config.USE_AMP and torch.cuda.is_available() else None\n",
    "    \n",
    "    return model, train_loader, val_loader, optimizer, criterion, scaler, tokenizer\n",
    "\n",
    "print(\"Training setup function defined successfully\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the complete training pipeline\n",
    "def main():\n",
    "    print(\"Starting fake news detection training...\")\n",
    "    \n",
    "    # Setup training\n",
    "    model, train_loader, val_loader, optimizer, criterion, scaler, tokenizer = setup_training(df, config)\n",
    "    \n",
    "    # Training loop\n",
    "    best_val_loss = float('inf')\n",
    "    best_val_acc = 0.0\n",
    "    \n",
    "    print(f\"Starting training for {config.NUM_EPOCHS} epochs...\")\n",
    "    \n",
    "    for epoch in range(config.NUM_EPOCHS):\n",
    "        print(f'=== Epoch {epoch + 1}/{config.NUM_EPOCHS} ===')\n",
    "        \n",
    "        # Train\n",
    "        train_loss = train_epoch(model, train_loader, optimizer, criterion, scaler, config)\n",
    "        print(f'Train Loss: {train_loss:.4f}')\n",
    "        \n",
    "        # Evaluate\n",
    "        val_metrics = evaluate(model, val_loader, criterion, config)\n",
    "        print(f'Val Loss: {val_metrics[\"loss\"]:.4f}')\n",
    "        print(f'Val Accuracy: {val_metrics[\"accuracy\"]:.4f}')\n",
    "        print(f'Val Precision: {val_metrics[\"precision\"]:.4f}')\n",
    "        print(f'Val Recall: {val_metrics[\"recall\"]:.4f}')\n",
    "        print(f'Val F1: {val_metrics[\"f1\"]:.4f}')\n",
    "        \n",
    "        # Save best model\n",
    "        if val_metrics['accuracy'] > best_val_acc:\n",
    "            best_val_acc = val_metrics['accuracy']\n",
    "            best_val_loss = val_metrics['loss']\n",
    "            torch.save(model.state_dict(), 'best_model_colab.pt')\n",
    "            print(f'New best model saved! Accuracy: {best_val_acc:.4f}')\n",
    "        \n",
    "        # Clear memory\n",
    "        gc.collect()\n",
    "        if torch.cuda.is_available():\n",
    "            torch.cuda.empty_cache()\n",
    "    \n",
    "    print('Training completed!')\n",
    "    print(f'Best validation accuracy: {best_val_acc:.4f}')\n",
    "    print(f'Best validation loss: {best_val_loss:.4f}')\n",
    "    \n",
    "    return model, tokenizer\n",
    "\n",
    "# Run training\n",
    "model, tokenizer = main()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 7. Model Testing and Prediction\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict_single(text, model, tokenizer, config):\n",
    "    \"\"\"Predict if a single text is fake or real news\"\"\"\n",
    "    model.eval()\n",
    "    text = preprocess_text(text)\n",
    "    \n",
    "    encoding = tokenizer.encode_plus(\n",
    "        text,\n",
    "        add_special_tokens=True,\n",
    "        max_length=config.MAX_SEQUENCE_LENGTH,\n",
    "        padding='max_length',\n",
    "        truncation=True,\n",
    "        return_attention_mask=True,\n",
    "        return_tensors='pt'\n",
    "    )\n",
    "    \n",
    "    input_ids = encoding['input_ids'].to(config.DEVICE)\n",
    "    attention_mask = encoding['attention_mask'].to(config.DEVICE)\n",
    "    \n",
    "    with torch.no_grad():\n",
    "        outputs = model(input_ids, attention_mask)\n",
    "        probabilities = torch.softmax(outputs, dim=1)\n",
    "        prediction = torch.argmax(outputs, dim=1)\n",
    "        confidence = torch.max(probabilities, dim=1)[0]\n",
    "    \n",
    "    return {\n",
    "        'prediction': prediction.item(),\n",
    "        'label': 'FAKE' if prediction.item() == 1 else 'REAL',\n",
    "        'confidence': confidence.item(),\n",
    "        'probabilities': {\n",
    "            'REAL': probabilities[0][0].item(),\n",
    "            'FAKE': probabilities[0][1].item()\n",
    "        }\n",
    "    }\n",
    "\n",
    "# Test with sample texts\n",
    "test_texts = [\n",
    "    \"Breaking: Scientists discover new planet in our solar system\",\n",
    "    \"Local community comes together to help flood victims\",\n",
    "    \"Shocking: Aliens spotted in downtown area last night\",\n",
    "    \"Government announces new healthcare policy to benefit citizens\"\n",
    "]\n",
    "\n",
    "print(\"Testing model predictions:\")\n",
    "print(\"=\" * 50)\n",
    "\n",
    "for i, text in enumerate(test_texts, 1):\n",
    "    result = predict_single(text, model, tokenizer, config)\n",
    "    print(f\"Text {i}: {text[:60]}...\")\n",
    "    print(f\"Prediction: {result['label']} (Confidence: {result['confidence']:.3f})\")\n",
    "    print(f\"Probabilities: REAL={result['probabilities']['REAL']:.3f}, FAKE={result['probabilities']['FAKE']:.3f}\")\n",
    "    print(\"-\" * 50)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the complete training pipeline\n",
    "def main():\n",
    "    print(\"Starting fake news detection training...\")\n",
    "    \n",
    "    # Setup training\n",
    "    model, train_loader, val_loader, optimizer, criterion, scaler, tokenizer = setup_training(df, config)\n",
    "    \n",
    "    # Training loop\n",
    "    best_val_loss = float('inf')\n",
    "    best_val_acc = 0.0\n",
    "    \n",
    "    print(f\"\\nStarting training for {config.NUM_EPOCHS} epochs...\")\n",
    "    \n",
    "    for epoch in range(config.NUM_EPOCHS):\n",
    "        print(f'\\n=== Epoch {epoch + 1}/{config.NUM_EPOCHS} ===')\n",
    "        \n",
    "        # Train\n",
    "        train_loss = train_epoch(model, train_loader, optimizer, criterion, scaler, config)\n",
    "        print(f'Train Loss: {train_loss:.4f}')\n",
    "        \n",
    "        # Evaluate\n",
    "        val_metrics = evaluate(model, val_loader, criterion, config)\n",
    "        print(f'Val Loss: {val_metrics[\\\"loss\\\"]:.4f}')\n",
    "        print(f'Val Accuracy: {val_metrics[\\\"accuracy\\\"]:.4f}')\n",
    "        print(f'Val Precision: {val_metrics[\\\"precision\\\"]:.4f}')\n",
    "        print(f'Val Recall: {val_metrics[\\\"recall\\\"]:.4f}')\n",
    "        print(f'Val F1: {val_metrics[\\\"f1\\\"]:.4f}')\n",
    "        \n",
    "        # Save best model\n",
    "        if val_metrics['accuracy'] > best_val_acc:\n",
    "            best_val_acc = val_metrics['accuracy']\n",
    "            best_val_loss = val_metrics['loss']\n",
    "            torch.save(model.state_dict(), 'best_model_colab.pt')\\n            print(f'New best model saved! Accuracy: {best_val_acc:.4f}')\\n        \\n        # Clear memory\\n        gc.collect()\\n        if torch.cuda.is_available():\\n            torch.cuda.empty_cache()\\n    \\n    print(f'\\\\nTraining completed!')\\n    print(f'Best validation accuracy: {best_val_acc:.4f}')\\n    print(f'Best validation loss: {best_val_loss:.4f}')\\n    \\n    return model, tokenizer\\n\\n# Run training\\nmodel, tokenizer = main()\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}