{ "cells": [
 { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
  "import random\n",
  "\n",
  "import torch\n",
  "import torch.nn.functional as F\n",
  "import matplotlib.pyplot as plt # for making figures\n",
  "%matplotlib inline"
 ] },
 { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# read in all the words; the context manager closes the file once it has been read\n",
  "with open('names.txt', 'r') as f:\n",
  "    words = f.read().splitlines()\n",
  "words[:8]"
 ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "32033" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "len(words)"
 ] },
 { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}\n", "27\n" ] } ], "source": [
  "# build the vocabulary of characters and mappings to/from integers\n",
  "chars = sorted(set(''.join(words)))\n",
  "stoi = {s: i + 1 for i, s in enumerate(chars)} # real characters start at index 1\n",
  "stoi['.'] = 0 # index 0 is reserved for '.', used as start padding and end-of-word marker\n",
  "itos = {i: s for s, i in stoi.items()}\n",
  "vocab_size = len(itos)\n",
  "print(itos)\n",
  "print(vocab_size)"
 ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([182625, 3]) torch.Size([182625])\n", "torch.Size([22655, 3]) torch.Size([22655])\n", "torch.Size([22866, 3]) torch.Size([22866])\n" ] } ], "source": [
  "# build the dataset\n",
  "block_size = 3 # context length: how many characters do we take to predict the next one?\n",
  "\n",
  "def build_dataset(words):\n",
  "    \"\"\"Turn a list of words into (context, next character) examples.\n",
  "\n",
  "    Each word is terminated with '.', and every character of the result\n",
  "    (terminator included) yields one example: the input is the list of the\n",
  "    `block_size` preceding character indices, left-padded with 0, and the\n",
  "    target is the index of the character itself.\n",
  "\n",
  "    Returns the tuple (X, Y) of tensors built from the collected contexts\n",
  "    and targets. Prints both shapes as a side effect. A character missing\n",
  "    from `stoi` raises KeyError.\n",
  "    \"\"\"\n",
  "    X, Y = [], []\n",
  "\n",
  "    for w in words:\n",
  "        context = [0] * block_size\n",
  "        for ch in w + '.':\n",
  "            ix = stoi[ch]\n",
  "            X.append(context)\n",
  "            Y.append(ix)\n",
  "            context = context[1:] + [ix] # crop and append\n",
  "\n",
  "    X = torch.tensor(X)\n",
  "    Y = torch.tensor(Y)\n",
  "    print(X.shape, Y.shape)\n",
  "    return X, Y\n",
  "\n",
  "# shuffle a copy: shuffling `words` in place would reshuffle an already shuffled\n",
  "# list whenever this cell is re-run, silently changing the train/dev/test split\n",
  "shuffled_words = list(words)\n",
  "random.seed(42)\n",
  "random.shuffle(shuffled_words)\n",
  "n1 = int(0.8*len(shuffled_words))\n",
  "n2 = int(0.9*len(shuffled_words))\n",
  "\n",
  "Xtr, Ytr = build_dataset(shuffled_words[:n1]) # 80%\n",
  "Xdev, Ydev = build_dataset(shuffled_words[n1:n2]) # 10%\n",
  "Xte, Yte = build_dataset(shuffled_words[n2:]) # 10%"
 ] },
 { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "11897\n" ] } ], "source": [
  "# MLP revisited\n",
  "n_embd = 10 # the dimensionality of the character embedding vectors\n",
  "n_hidden = 200 # the number of neurons in the hidden layer of the MLP\n",
  "\n",
  "g = torch.Generator().manual_seed(2147483647) # for reproducibility\n",
  "C = torch.randn((vocab_size, n_embd), generator=g)\n",
  "# W1 is scaled by (5/3) / sqrt(fan_in), where fan_in = n_embd * block_size\n",
  "W1 = torch.randn((n_embd * block_size, n_hidden), generator=g) * (5/3)/((n_embd * block_size)**0.5)\n",
  "# there is no hidden bias b1: the BatchNorm layer subtracts the batch mean right\n",
  "# after the W1 matmul, which would cancel any constant offset; bnbias plays that role\n",
  "W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.01\n",
  "b2 = torch.randn(vocab_size, generator=g) * 0 # all zeros, but still drawn from g\n",
  "\n",
  "# BatchNorm parameters\n",
  "bngain = torch.ones((1, n_hidden))\n",
  "bnbias = torch.zeros((1, n_hidden))\n",
  "# running statistics are updated by the training loop and are not trained by gradient descent\n",
  "bnmean_running = torch.zeros((1, n_hidden))\n",
  "bnstd_running = torch.ones((1, n_hidden))\n",
  "\n",
  "parameters = [C, W1, W2, b2, bngain, bnbias]\n",
  "print(sum(p.nelement() for p in parameters)) # number of parameters in total\n",
  "for p in parameters:\n",
  "    p.requires_grad = True"
 ] },
 { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " 0/ 200000: 3.2342\n", " 10000/ 200000: 1.8947\n", " 20000/ 200000: 1.8914\n", " 30000/ 200000: 1.9489\n", " 40000/ 
200000: 2.1701\n", " 50000/ 200000: 2.0639\n", " 60000/ 200000: 2.0728\n", " 70000/ 200000: 2.3965\n", " 80000/ 200000: 2.4142\n", " 90000/ 200000: 2.2257\n", " 100000/ 200000: 2.2824\n", " 110000/ 200000: 1.8584\n", " 120000/ 200000: 2.1613\n", " 130000/ 200000: 1.9009\n", " 140000/ 200000: 1.8430\n", " 150000/ 200000: 2.3324\n", " 160000/ 200000: 2.2026\n", " 170000/ 200000: 1.6905\n", " 180000/ 200000: 1.9502\n", " 190000/ 200000: 2.0909\n" ] } ], "source": [
  "# same optimization as last time\n",
  "max_steps = 200000\n",
  "batch_size = 32\n",
  "lossi = []\n",
  "\n",
  "for i in range(max_steps):\n",
  "\n",
  "    # draw a random minibatch of training examples\n",
  "    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)\n",
  "    Xb, Yb = Xtr[ix], Ytr[ix]\n",
  "\n",
  "    # forward pass: embed the context characters, then flatten each example into one row\n",
  "    emb = C[Xb]\n",
  "    embcat = emb.view(emb.shape[0], -1)\n",
  "    hpreact = embcat @ W1 # hidden layer pre-activation (no bias term)\n",
  "\n",
  "    # BatchNorm layer: standardize every hidden unit over the batch, then scale and shift\n",
  "    bnmeani = hpreact.mean(0, keepdim=True)\n",
  "    bnstdi = hpreact.std(0, keepdim=True)\n",
  "    hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias\n",
  "\n",
  "    # exponential moving averages of the batch statistics, kept out of the autograd graph\n",
  "    with torch.no_grad():\n",
  "        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmeani\n",
  "        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi\n",
  "\n",
  "    h = torch.tanh(hpreact) # hidden layer\n",
  "    logits = h @ W2 + b2 # output layer\n",
  "    loss = F.cross_entropy(logits, Yb) # loss function\n",
  "\n",
  "    # backward pass, starting from cleared gradients\n",
  "    for p in parameters:\n",
  "        p.grad = None\n",
  "    loss.backward()\n",
  "\n",
  "    # gradient descent step; the learning rate drops once, at step 100000\n",
  "    if i < 100000:\n",
  "        lr = 0.1\n",
  "    else:\n",
  "        lr = 0.01\n",
  "    with torch.no_grad():\n",
  "        for p in parameters:\n",
  "            p -= lr * p.grad\n",
  "\n",
  "    # track stats: print every 10000 steps, record log10(loss) on every step\n",
  "    if i % 10000 == 0:\n",
  "        print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')\n",
  "    lossi.append(loss.log10().item())\n",
  "\n",
  "    # break # uncomment while experimenting so you don't have to run all the steps"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "-----------------\n", "\n", "Used for lecture : [00:12:59](https://www.youtube.com/watch?v=P6sfmUTpUmc&t=779s) fixing the saturated tanh " ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#plt.hist(h.view(-1).tolist(), 50)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#plt.hist(hpreact.view(-1).tolist(), 50)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#plt.figure(figsize=(20,10))\n", "#plt.imshow(h.abs() > 0.99, cmap='gray', interpolation='nearest')" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "End of Used for lecture : [00:12:59](https://www.youtube.com/watch?v=P6sfmUTpUmc&t=779s) fixing the saturated tanh \n", "\n", "----------------" ] },
 { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "plt.plot(lossi)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# # calibrate the batch norm at the end of training\n", "\n", "# with torch.no_grad():\n", "# # pass the training set through\n", "# emb = C[Xtr]\n", "# embcat = emb.view(emb.shape[0], -1)\n", "# hpreact = embcat @ W1 # + b1\n", "# # measure the mean/std over the entire training set\n", "# bnmean = hpreact.mean(0, keepdim=True)\n", "# bnstd = hpreact.std(0, keepdim=True)" ] },
 { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "train 2.037672996520996\n", "val 2.107128620147705\n" ] } ], "source": [
  "@torch.no_grad() # this decorator disables gradient tracking\n",
  "def split_loss(split):\n",
  "    \"\"\"Print the cross-entropy loss of the model on one dataset split.\n",
  "\n",
  "    `split` selects the data: 'train', 'val' or 'test'; any other key raises\n",
  "    KeyError. The BatchNorm layer is evaluated with the running mean/std\n",
  "    accumulated by the training loop, not with statistics of the data being\n",
  "    evaluated. Nothing is returned; the split name and loss are printed.\n",
  "    \"\"\"\n",
  "    datasets = {\n",
  "        'train': (Xtr, Ytr),\n",
  "        'val': (Xdev, Ydev),\n",
  "        'test': (Xte, Yte),\n",
  "    }\n",
  "    x, y = datasets[split]\n",
  "    emb = C[x] # (N, block_size, n_embd)\n",
  "    embcat = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)\n",
  "    hpreact = embcat @ W1 # no bias term, exactly as in training\n",
  "    hpreact = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias\n",
  "    h = torch.tanh(hpreact) # (N, n_hidden)\n",
  "    logits = h @ W2 + b2 # (N, vocab_size)\n",
  "    loss = F.cross_entropy(logits, y)\n",
  "    print(split, loss.item())\n",
  "\n",
  "split_loss('train')\n",
  "split_loss('val')"
 ] },
 { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor(3.2958)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# the initial loss value that we expect: the negative log-probability of a\n",
  "# uniform guess over the vocabulary (vocab_size instead of a hard-coded 27)\n",
  "-torch.tensor(1.0 / vocab_size).log()"
 ] },
 { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "mora.\n",
"mayah.\n", "see.\n", "mel.\n", "rylee.\n", "emmadiejd.\n", "leg.\n", "adelyn.\n", "elin.\n", "shi.\n", "jen.\n", "eden.\n", "estanar.\n", "kayziquetta.\n", "noshir.\n", "roshiriel.\n", "kendreth.\n", "konnie.\n", "casube.\n", "ged.\n" ] } ], "source": [
  "# sample from the model\n",
  "g = torch.Generator().manual_seed(2147483647 + 10)\n",
  "\n",
  "with torch.no_grad(): # sampling never needs gradients\n",
  "    for _ in range(20):\n",
  "\n",
  "        out = []\n",
  "        context = [0] * block_size # initialize with all ...\n",
  "        while True:\n",
  "            # forward pass the neural net exactly as split_loss does: the hidden\n",
  "            # layer has no bias (b1 is never defined), and BatchNorm is applied\n",
  "            # with the running statistics accumulated during training\n",
  "            emb = C[torch.tensor([context])] # (1,block_size,d)\n",
  "            hpreact = emb.view(1, -1) @ W1\n",
  "            hpreact = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias\n",
  "            h = torch.tanh(hpreact)\n",
  "            logits = h @ W2 + b2\n",
  "            probs = F.softmax(logits, dim=1)\n",
  "            # sample from the distribution\n",
  "            ix = torch.multinomial(probs, num_samples=1, generator=g).item()\n",
  "            context = context[1:] + [ix]\n",
  "            out.append(ix)\n",
  "            # if we sample the special '.' token, break\n",
  "            if ix == 0:\n",
  "                break\n",
  "\n",
  "        print(''.join(itos[i] for i in out)) # decode and print the generated word"
 ] }
], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 2 }