{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# This exercise wasn't exactly smooth sailing for me, but I did try to understand most of it. Will try to come back to this whenever I can" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# there is no change in the first several cells from the last lecture\n", "\n", "import random\n", "\n", "import torch\n", "import torch.nn.functional as F\n", "import matplotlib.pyplot as plt # for making figures\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# download the names.txt file from github\n", "# (-nc / --no-clobber: skip the download when names.txt already exists, so re-running\n", "# the notebook does not pile up names.txt.1, names.txt.2, ...)\n", "!wget -nc https://raw.githubusercontent.com/karpathy/makemore/master/names.txt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# read in all the words (the with-block closes the file handle once it has been read)\n", "with open('names.txt', 'r') as f:\n", "    words = f.read().splitlines()\n", "# print(len(words))\n", "# print(max(len(w) for w in words))\n", "# print(words[:8])\n", "\n", "# build the vocabulary of characters and mappings to/from integers\n", "chars = sorted(set(''.join(words)))\n", "stoi = {s:i+1 for i,s in enumerate(chars)} # letters start at 1; index 0 is kept for '.'\n", "stoi['.'] = 0\n", "itos = {i:s for s,i in stoi.items()}\n", "vocab_size = len(itos)\n", "# print(itos)\n", "# print(vocab_size)\n", "\n", "# build the dataset\n", "block_size = 3 # context length: how many characters do we take to predict the next one?\n", "\n", "def build_dataset(words):\n", "    \"\"\"Convert a list of words into (X, Y) tensors of contexts and next-character targets.\n", "\n", "    Every word is terminated with '.'; each of its characters yields one example whose\n", "    input is the block_size preceding character indices (left-padded with 0, the index\n", "    of '.') and whose target is the index of that character.\n", "    \"\"\"\n", "    X, Y = [], []\n", "\n", "    for w in words:\n", "        context = [0] * block_size\n", "        for ch in w + '.':\n", "            ix = stoi[ch]\n", "            X.append(context)\n", "            Y.append(ix)\n", "            context = context[1:] + [ix] # crop and append\n", "\n", "    X = torch.tensor(X)\n", "    Y = torch.tensor(Y)\n", "    # print(X.shape, Y.shape)\n", "    return X, Y\n", "\n", "# shuffle with a fixed seed so the train/dev/test split is the same on every run\n", "random.seed(42)\n", "random.shuffle(words)\n", "n1 = int(0.8*len(words))\n", "n2 = int(0.9*len(words))\n", "\n", "Xtr, Ytr = 
build_dataset(words[:n1]) # 80%\n", "Xdev, Ydev = build_dataset(words[n1:n2]) # 10%\n", "Xte, Yte = build_dataset(words[n2:]) # 10%" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# utility function we will use later when comparing manual gradients to PyTorch gradients\n", "def cmp(s, dt, t):\n", "    \"\"\"Print how closely a hand-derived gradient matches the one autograd stored on a tensor.\n", "\n", "    s  -- label printed at the start of the report line\n", "    dt -- manually computed gradient tensor\n", "    t  -- tensor whose .grad is used as the reference\n", "\n", "    Raises ValueError when t.grad is missing or when dt has a different shape than t.grad.\n", "    \"\"\"\n", "    if t.grad is None:\n", "        raise ValueError(f'{s}: t.grad is None - call t.retain_grad() before loss.backward()')\n", "    if dt.shape != t.grad.shape:\n", "        # without this check broadcasting could let a wrongly shaped gradient pass the comparisons\n", "        raise ValueError(f'{s}: shape mismatch, got {tuple(dt.shape)} but t.grad is {tuple(t.grad.shape)}')\n", "    ex = torch.all(dt == t.grad).item() # bit-for-bit equality\n", "    app = torch.allclose(dt, t.grad) # equality up to floating point tolerance\n", "    maxdiff = (dt - t.grad).abs().max().item() # largest element-wise deviation\n", "    print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4137\n" ] } ], "source": [ "n_embd = 10 # the dimensionality of the character embedding vectors\n", "n_hidden = 64 # the number of neurons in the hidden layer of the MLP\n", "\n", "g = torch.Generator().manual_seed(2147483647) # for reproducibility\n", "C = torch.randn((vocab_size, n_embd), generator=g)\n", "# Layer 1\n", "W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)\n", "b1 = torch.randn(n_hidden, generator=g) * 0.1 # using b1 just for fun, it's useless because of BN\n", "# Layer 2\n", "W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.1\n", "b2 = torch.randn(vocab_size, generator=g) * 0.1\n", "# BatchNorm parameters (drawn from g like every other parameter, so a fresh run is reproducible)\n", "bngain = torch.randn((1, n_hidden), generator=g)*0.1 + 1.0\n", "bnbias = torch.randn((1, n_hidden), generator=g)*0.1\n", "\n", "# Note: I am initializing many of these parameters in non-standard ways\n", "# because sometimes initializing with e.g. 
all zeros could mask an incorrect\n", "# implementation of the backward pass.\n", "\n", "parameters = [C, W1, b1, W2, b2, bngain, bnbias]\n", "print(sum(p.nelement() for p in parameters)) # number of parameters in total\n", "# turn on autograd tracking for every parameter\n", "for p in parameters:\n", " p.requires_grad = True" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "batch_size = 32\n", "n = batch_size # a shorter variable also, for convenience\n", "# construct a minibatch\n", "ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g) # batch_size random row indices into Xtr\n", "Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor(3.3596, grad_fn=)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# forward pass, \"chunkated\" into smaller steps that are possible to backward one at a time\n", "# (every intermediate gets its own name so that its gradient can be derived and checked separately)\n", "\n", "emb = C[Xb] # embed the characters into vectors\n", "embcat = emb.view(emb.shape[0], -1) # concatenate the vectors\n", "# Linear layer 1\n", "hprebn = embcat @ W1 + b1 # hidden layer pre-activation\n", "# BatchNorm layer\n", "bnmeani = 1/n*hprebn.sum(0, keepdim=True) # mean of every hidden unit over the batch (dim 0)\n", "bndiff = hprebn - bnmeani # centered pre-activations\n", "bndiff2 = bndiff**2 # squared deviations\n", "bnvar = 1/(n-1)*(bndiff2).sum(0, keepdim=True) # note: Bessel's correction (dividing by n-1, not n)\n", "bnvar_inv = (bnvar + 1e-5)**-0.5 # 1/sqrt(variance + 1e-5)\n", "bnraw = bndiff * bnvar_inv # normalized pre-activations\n", "hpreact = bngain * bnraw + bnbias # scale and shift\n", "# Non-linearity\n", "h = torch.tanh(hpreact) # hidden layer\n", "# Linear layer 2\n", "logits = h @ W2 + b2 # output layer\n", "# cross entropy loss (same as F.cross_entropy(logits, Yb))\n", "logit_maxes = logits.max(1, keepdim=True).values # largest logit of every row\n", "norm_logits = logits - logit_maxes # subtract max for numerical stability\n", "counts = norm_logits.exp()\n", "counts_sum = counts.sum(1, keepdims=True) # row sums, i.e. the softmax denominator\n", "counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...\n", "probs = 
counts * counts_sum_inv\n", "logprobs = probs.log()\n", "loss = -logprobs[range(n), Yb].mean() # mean negative log-probability of the correct next character\n", "\n", "# PyTorch backward pass\n", "for p in parameters:\n", "    p.grad = None\n", "for t in [logprobs, probs, counts, counts_sum, counts_sum_inv, # afaik there is no cleaner way\n", "          norm_logits, logit_maxes, logits, h, hpreact, bnraw,\n", "          bnvar_inv, bnvar, bndiff2, bndiff, hprebn, bnmeani,\n", "          embcat, emb]:\n", "    t.retain_grad() # keep .grad on these intermediate tensors so cmp() can read it later\n", "loss.backward()\n", "loss" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Similar boilerplate code to the previous exercise, as provided in the starter code ^\n", "\n", "------------" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Exercise 2: backprop through cross_entropy but all in one go\n", "# to complete this challenge look at the mathematical expression of the loss,\n", "# take the derivative, simplify the expression, and just write it out\n", "\n", "# forward pass\n", "\n", "# before:\n", "# logit_maxes = logits.max(1, keepdim=True).values\n", "# norm_logits = logits - logit_maxes # subtract max for numerical stability\n", "# counts = norm_logits.exp()\n", "# counts_sum = counts.sum(1, keepdims=True)\n", "# counts_sum_inv = counts_sum**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...\n", "# probs = counts * counts_sum_inv\n", "# logprobs = probs.log()\n", "# loss = -logprobs[range(n), Yb].mean()\n", "\n", "# now:\n", "# the eight manual steps above collapse into one call; the printed diff shows how far the\n", "# fused loss is from the manually computed one\n", "loss_fast = F.cross_entropy(logits, Yb)\n", "print(loss_fast.item(), 'diff:', (loss_fast - loss).item())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The cell above shows how the forward pass breaks down when the loss is computed manually, step by step, versus directly with PyTorch's `F.cross_entropy`" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[1:28:34](https://youtu.be/q8SA3rM6ckI?si=O-RCp2YO7QbSbUIW&t=5314) to 1:32:48 - Andrej sensei gives us a hint followed by an 
explanation of solving the equation and converting it to code" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "logits | exact: False | approximate: True | maxdiff: 8.381903171539307e-09\n" ] } ], "source": [ "# backward pass\n", "# for the mean cross-entropy loss: dloss/dlogits = (softmax(logits) - one_hot(Yb)) / n\n", "\n", "dlogits = F.softmax(logits, 1) # probabilities of every row (softmax over dim 1)\n", "dlogits[range(n), Yb] -= 1 # subtract 1 at the position of the correct character in every row\n", "dlogits /= n # the loss is a mean over the n examples of the batch\n", "\n", "cmp('logits', dlogits, logits)\n", "\n", "# NOTE: this derivation wasn't entirely clear to me yet; I will come back to it\n", "# My maxdiff also came out slightly larger than sensei's, but it is still on the order of 1e-8\n", "# and cmp reports approximate: True, so the manual gradient agrees with autograd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[1:32:49](https://youtu.be/q8SA3rM6ckI?si=-204uFZWpJPaT9oU&t=5569) to 1:36:36 - Breakdown of what `dlogits` actually is by taking one row and representing it dynamically" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(torch.Size([32, 27]), torch.Size([32]))" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# one row of logits per example in the batch, one target index per example\n", "logits.shape, Yb.shape" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([0.0727, 0.0823, 0.0164, 0.0532, 0.0213, 0.0895, 0.0218, 0.0357, 0.0174,\n", " 0.0327, 0.0371, 0.0337, 0.0347, 0.0311, 0.0346, 0.0131, 0.0086, 0.0178,\n", " 0.0161, 0.0499, 0.0532, 0.0226, 0.0259, 0.0712, 0.0607, 0.0274, 0.0192],\n", " grad_fn=)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# predicted probabilities for the first example of the batch\n", "F.softmax(logits, 1)[0]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([ 0.0727, 0.0823, 0.0164, 0.0532, 0.0213, 0.0895, 0.0218, 0.0357,\n", " -0.9826, 0.0327, 0.0371, 0.0337, 0.0347, 0.0311, 0.0346, 0.0131,\n", " 0.0086, 0.0178, 0.0161, 0.0499, 0.0532, 0.0226, 0.0259, 0.0712,\n", " 0.0607, 0.0274, 0.0192], grad_fn=)" ] }, "execution_count": 10, "metadata": {}, "output_type": 
"execute_result" } ], "source": [ "# gradient row of the first example with the 1/n scaling undone:\n", "# the same probabilities as above, except that 1 was subtracted at the correct index\n", "dlogits[0] * n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor(2.0955e-09, grad_fn=)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# the entries of a row sum to (approximately) zero: the probabilities add up to 1 and 1 was subtracted\n", "dlogits[0].sum()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# picture of the whole gradient: one row per example in the batch, one column per vocabulary entry;\n", "# with the gray colormap the lowest values (the label positions, where 1 was subtracted) render darkest\n", "plt.figure(figsize=(8,8))\n", "plt.imshow(dlogits.detach(), cmap='gray')\n", "plt.title('dlogits for one minibatch')\n", "plt.xlabel('vocabulary index')\n", "plt.ylabel('example in batch'); # the trailing ; keeps the Text(...) repr out of the cell output" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 2 }