{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import torch.nn.functional as F\n",
    "import matplotlib.pyplot as plt # for making figures\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "'wget' is not recognized as an internal or external command,\n",
      "operable program or batch file.\n"
     ]
    }
   ],
   "source": [
    "# download the names.txt file from github\n",
    "!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32033\n",
      "15\n",
      "['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']\n"
     ]
    }
   ],
   "source": [
    "# read in all the words\n",
    "words = open('names.txt', 'r').read().splitlines()\n",
    "print(len(words))\n",
    "print(max(len(w) for w in words))\n",
    "print(words[:8])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}\n",
      "27\n"
     ]
    }
   ],
   "source": [
    "# build the vocabulary of characters and mappings to/from integers\n",
    "chars = sorted(list(set(''.join(words))))\n",
    "stoi = {s:i+1 for i,s in enumerate(chars)}\n",
    "stoi['.'] = 0\n",
    "itos = {i:s for s,i in stoi.items()}\n",
    "vocab_size = len(itos)\n",
    "print(itos)\n",
    "print(vocab_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([182625, 3]) torch.Size([182625])\n",
      "torch.Size([22655, 3]) torch.Size([22655])\n",
      "torch.Size([22866, 3]) torch.Size([22866])\n"
     ]
    }
   ],
   "source": [
    "# build the dataset\n",
    "block_size = 3 # context length: how many characters do we take to predict the next one?\n",
    "\n",
    "def build_dataset(words):\n",
    "  X, Y = [], []\n",
    "\n",
    "  for w in words:\n",
    "    context = [0] * block_size\n",
    "    for ch in w + '.':\n",
    "      ix = stoi[ch]\n",
    "      X.append(context)\n",
    "      Y.append(ix)\n",
    "      context = context[1:] + [ix] # crop and append\n",
    "\n",
    "  X = torch.tensor(X)\n",
    "  Y = torch.tensor(Y)\n",
    "  print(X.shape, Y.shape)\n",
    "  return X, Y\n",
    "\n",
    "import random\n",
    "random.seed(42)\n",
    "random.shuffle(words)\n",
    "n1 = int(0.8*len(words))\n",
    "n2 = int(0.9*len(words))\n",
    "\n",
    "Xtr,  Ytr  = build_dataset(words[:n1])     # 80%\n",
    "Xdev, Ydev = build_dataset(words[n1:n2])   # 10%\n",
    "Xte,  Yte  = build_dataset(words[n2:])     # 10%"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# utility function we will use later when comparing manual gradients to PyTorch gradients\n",
    "def cmp(s, dt, t):\n",
    "  ex = torch.all(dt == t.grad).item()\n",
    "  app = torch.allclose(dt, t.grad)\n",
    "  maxdiff = (dt - t.grad).abs().max().item()\n",
    "  print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---------"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "[1:50:03](https://youtu.be/q8SA3rM6ckI?si=HV1qfgMFKATlDxk1&t=6603) to 1:54:25 - Putting all of the codes together to form a Neural Net, but by commenting out the `loss.backward()` :)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12297\n",
      "      0/ 200000: 3.8069\n",
      "  10000/ 200000: 2.1598\n",
      "  20000/ 200000: 2.4110\n",
      "  30000/ 200000: 2.4295\n",
      "  40000/ 200000: 2.0158\n",
      "  50000/ 200000: 2.4050\n",
      "  60000/ 200000: 2.3825\n",
      "  70000/ 200000: 2.0596\n",
      "  80000/ 200000: 2.3024\n",
      "  90000/ 200000: 2.2073\n",
      " 100000/ 200000: 2.0443\n",
      " 110000/ 200000: 2.2937\n",
      " 120000/ 200000: 2.0340\n",
      " 130000/ 200000: 2.4557\n",
      " 140000/ 200000: 2.2876\n",
      " 150000/ 200000: 2.2016\n",
      " 160000/ 200000: 1.9720\n",
      " 170000/ 200000: 1.8015\n",
      " 180000/ 200000: 2.0065\n",
      " 190000/ 200000: 1.9932\n"
     ]
    }
   ],
   "source": [
    "# Exercise 4: putting it all together!\n",
    "# Train the MLP neural net with your own backward pass\n",
    "\n",
    "# init\n",
    "n_embd = 10 # the dimensionality of the character embedding vectors\n",
    "n_hidden = 200 # the number of neurons in the hidden layer of the MLP\n",
    "\n",
    "g = torch.Generator().manual_seed(2147483647) # for reproducibility\n",
    "C  = torch.randn((vocab_size, n_embd),            generator=g)\n",
    "# Layer 1\n",
    "W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)\n",
    "b1 = torch.randn(n_hidden,                        generator=g) * 0.1\n",
    "# Layer 2\n",
    "W2 = torch.randn((n_hidden, vocab_size),          generator=g) * 0.1\n",
    "b2 = torch.randn(vocab_size,                      generator=g) * 0.1\n",
    "# BatchNorm parameters\n",
    "bngain = torch.randn((1, n_hidden))*0.1 + 1.0\n",
    "bnbias = torch.randn((1, n_hidden))*0.1\n",
    "\n",
    "parameters = [C, W1, b1, W2, b2, bngain, bnbias]\n",
    "print(sum(p.nelement() for p in parameters)) # number of parameters in total\n",
    "for p in parameters:\n",
    "  p.requires_grad = True\n",
    "\n",
    "# same optimization as last time\n",
    "max_steps = 200000\n",
    "batch_size = 32\n",
    "n = batch_size # convenience\n",
    "lossi = []\n",
    "\n",
    "# use this context manager for efficiency once your backward pass is written (TODO)\n",
    "with torch.no_grad():\n",
    "\n",
    "  # kick off optimization\n",
    "  for i in range(max_steps):\n",
    "\n",
    "    # minibatch construct\n",
    "    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)\n",
    "    Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y\n",
    "\n",
    "    # forward pass\n",
    "    emb = C[Xb] # embed the characters into vectors\n",
    "    embcat = emb.view(emb.shape[0], -1) # concatenate the vectors\n",
    "    # Linear layer\n",
    "    hprebn = embcat @ W1 + b1 # hidden layer pre-activation\n",
    "    # BatchNorm layer\n",
    "    # -------------------------------------------------------------\n",
    "    bnmean = hprebn.mean(0, keepdim=True)\n",
    "    bnvar = hprebn.var(0, keepdim=True, unbiased=True)\n",
    "    bnvar_inv = (bnvar + 1e-5)**-0.5\n",
    "    bnraw = (hprebn - bnmean) * bnvar_inv\n",
    "    hpreact = bngain * bnraw + bnbias\n",
    "    # -------------------------------------------------------------\n",
    "    # Non-linearity\n",
    "    h = torch.tanh(hpreact) # hidden layer\n",
    "    logits = h @ W2 + b2 # output layer\n",
    "    loss = F.cross_entropy(logits, Yb) # loss function\n",
    "\n",
    "    # backward pass\n",
    "    for p in parameters:\n",
    "      p.grad = None\n",
    "    #loss.backward() # use this for correctness comparisons, delete it later!\n",
    "\n",
    "    # manual backprop! #swole_doge_meme\n",
    "    # -----------------\n",
    "    dlogits = F.softmax(logits, 1)\n",
    "    dlogits[range(n), Yb] -= 1\n",
    "    dlogits /= n\n",
    "    # 2nd layer backprop\n",
    "    dh = dlogits @ W2.T\n",
    "    dW2 = h.T @ dlogits\n",
    "    db2 = dlogits.sum(0)\n",
    "    # tanh\n",
    "    dhpreact = (1.0 - h**2) * dh\n",
    "    # batchnorm backprop\n",
    "    dbngain = (bnraw * dhpreact).sum(0, keepdim=True)\n",
    "    dbnbias = dhpreact.sum(0, keepdim=True)\n",
    "    dhprebn = bngain*bnvar_inv/n * (n*dhpreact - dhpreact.sum(0) - n/(n-1)*bnraw*(dhpreact*bnraw).sum(0))\n",
    "    # 1st layer\n",
    "    dembcat = dhprebn @ W1.T\n",
    "    dW1 = embcat.T @ dhprebn\n",
    "    db1 = dhprebn.sum(0)\n",
    "    # embedding\n",
    "    demb = dembcat.view(emb.shape)\n",
    "    dC = torch.zeros_like(C)\n",
    "    for k in range(Xb.shape[0]):\n",
    "      for j in range(Xb.shape[1]):\n",
    "        ix = Xb[k,j]\n",
    "        dC[ix] += demb[k,j]\n",
    "    grads = [dC, dW1, db1, dW2, db2, dbngain, dbnbias]\n",
    "    # -----------------\n",
    "\n",
    "    # update\n",
    "    lr = 0.1 if i < 100000 else 0.01 # step learning rate decay\n",
    "    for p, grad in zip(parameters, grads):\n",
    "      #p.data += -lr * p.grad # old way of cheems doge (using PyTorch grad from .backward())\n",
    "      p.data += -lr * grad # new way of swole doge TODO: enable\n",
    "\n",
    "    # track stats\n",
    "    if i % 10000 == 0: # print every once in a while\n",
    "      print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')\n",
    "    lossi.append(loss.log10().item())\n",
    "\n",
    "  #   if i >= 100: # TODO: delete early breaking when you're ready to train the full net\n",
    "  #     break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Looking at this, probably the batch norm layer backward pass was the most complicated one\n",
    "# Otherwise, the rest of them were pretty straight forward :)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# useful for checking your gradients\n",
    "# for p,g in zip(parameters, grads):\n",
    "#   cmp(str(tuple(p.shape)), g, p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# calibrate the batch norm at the end of training\n",
    "\n",
    "with torch.no_grad():\n",
    "  # pass the training set through\n",
    "  emb = C[Xtr]\n",
    "  embcat = emb.view(emb.shape[0], -1)\n",
    "  hpreact = embcat @ W1 + b1\n",
    "  # measure the mean/std over the entire training set\n",
    "  bnmean = hpreact.mean(0, keepdim=True)\n",
    "  bnvar = hpreact.var(0, keepdim=True, unbiased=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train 2.0708959102630615\n",
      "val 2.1080715656280518\n"
     ]
    }
   ],
   "source": [
    "# evaluate train and val loss\n",
    "\n",
    "@torch.no_grad() # this decorator disables gradient tracking\n",
    "def split_loss(split):\n",
    "  x,y = {\n",
    "    'train': (Xtr, Ytr),\n",
    "    'val': (Xdev, Ydev),\n",
    "    'test': (Xte, Yte),\n",
    "  }[split]\n",
    "  emb = C[x] # (N, block_size, n_embd)\n",
    "  embcat = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)\n",
    "  hpreact = embcat @ W1 + b1\n",
    "  hpreact = bngain * (hpreact - bnmean) * (bnvar + 1e-5)**-0.5 + bnbias\n",
    "  h = torch.tanh(hpreact) # (N, n_hidden)\n",
    "  logits = h @ W2 + b2 # (N, vocab_size)\n",
    "  loss = F.cross_entropy(logits, y)\n",
    "  print(split, loss.item())\n",
    "\n",
    "split_loss('train')\n",
    "split_loss('val')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Okay probably relatively slightly lower but thats cool"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "mora.\n",
      "mayah.\n",
      "see.\n",
      "mad.\n",
      "ryla.\n",
      "reisha.\n",
      "endraegan.\n",
      "chedielin.\n",
      "shi.\n",
      "jen.\n",
      "eden.\n",
      "sana.\n",
      "arleigh.\n",
      "malaia.\n",
      "noshubergshira.\n",
      "sten.\n",
      "joselle.\n",
      "jose.\n",
      "casubenteda.\n",
      "jamell.\n"
     ]
    }
   ],
   "source": [
    "# sample from the model\n",
    "g = torch.Generator().manual_seed(2147483647 + 10)\n",
    "\n",
    "for _ in range(20):\n",
    "    \n",
    "    out = []\n",
    "    context = [0] * block_size # initialize with all ...\n",
    "    while True:\n",
    "      # ------------\n",
    "      # forward pass:\n",
    "      # Embedding\n",
    "      emb = C[torch.tensor([context])] # (1,block_size,d)      \n",
    "      embcat = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)\n",
    "      hpreact = embcat @ W1 + b1\n",
    "      hpreact = bngain * (hpreact - bnmean) * (bnvar + 1e-5)**-0.5 + bnbias\n",
    "      h = torch.tanh(hpreact) # (N, n_hidden)\n",
    "      logits = h @ W2 + b2 # (N, vocab_size)\n",
    "      # ------------\n",
    "      # Sample\n",
    "      probs = F.softmax(logits, dim=1)\n",
    "      ix = torch.multinomial(probs, num_samples=1, generator=g).item()\n",
    "      context = context[1:] + [ix]\n",
    "      out.append(ix)\n",
    "      if ix == 0:\n",
    "        break\n",
    "    \n",
    "    print(''.join(itos[i] for i in out))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# I've definetly got some wayyy better names here through most are gibberish xD"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And that marks the end of exploring the basic understanding of the 'intuition' of training NN using (traditional) methods. We will be moving on to more complex ones from here on - like RNN etc. So looking forward to that :)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}