{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "words = open('names.txt', 'r').read().splitlines()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import torch\n", "\n", "N = torch.zeros((27, 27), dtype = torch.int32)\n", "\n", "chars = sorted(list(set(''.join(words))))\n", "\n", "stoi = {s:i+1 for i,s in enumerate(chars)}\n", "stoi['.'] = 0\n", "\n", "itos = {i:s for s,i in stoi.items()}" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "P = N.float()\n", "P /= P.sum(1, keepdim=True)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ ". e\n", "e m\n", "m m\n", "m a\n", "a .\n" ] } ], "source": [ "#Creating the training set of bigrams (x,y)\n", "xs, ys = [], []\n", "\n", "for word in words[:1]:\n", " chs = ['.'] + list(word) + ['.']\n", " for ch1, ch2 in zip(chs, chs[1:]):\n", " ix1 = stoi[ch1]\n", " ix2 = stoi[ch2]\n", " print(ch1, ch2)\n", " xs.append(ix1)\n", " ys.append(ix2)\n", "\n", "xs = torch.tensor(xs)\n", "ys = torch.tensor(ys)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "#Feeding these examples into a neural network\n", "import torch.nn.functional as F" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#<=========OPTIMIZATION============>" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([ 0, 5, 13, 13, 1])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "xs" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([ 5, 13, 13, 1, 0])" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ys" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# randomly initialize 27 neurons' weights. 
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# randomly initialize 27 neurons' weights. each neuron receives 27 inputs\n", "g = torch.Generator().manual_seed(2147483647)\n", "W = torch.randn((27, 27), generator=g, requires_grad=True) # requires_grad=True is needed for the backward pass (just as we did in micrograd)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# FORWARD PASS\n", "xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding\n", "logits = xenc @ W # predict log-counts\n", "counts = logits.exp() # counts, equivalent to N\n", "probs = counts / counts.sum(1, keepdims=True) # probabilities for next character\n", "loss = -probs[torch.arange(5), ys].log().mean() # torch.arange(5) indexes the 5 examples (rows 0..4) and ys picks the column of the correct next character | take the log of each picked probability | average | negate (negative log-likelihood)" ] },
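{ "cell_type": "markdown", "metadata": {}, "source": [ "A cross-check (an added sketch, not in the original notebook): exponentiating the logits and normalizing each row is just a softmax, and the mean negative log-probability of the correct targets is what `F.cross_entropy` computes directly from the raw logits. If the forward pass above is right, both comparisons below should print `True`." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# added sketch: verify the manual softmax + NLL against PyTorch's built-ins\n", "probs_check = F.softmax(logits, dim=1)\n", "print(torch.allclose(probs, probs_check))                 # manual exp + normalize == softmax\n", "print(torch.allclose(loss, F.cross_entropy(logits, ys)))  # manual NLL == cross-entropy on the logits" ] },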
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor(3.7693)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loss # This should be similar to the loss we calculated in the SUMMARY part of B-Main" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# BACKWARD PASS\n", "W.grad = None # reset the accumulated gradient first (None acts like zero here)\n", "loss.backward()" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([27, 27])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "W.grad.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "W.grad" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# UPDATE\n", "W.data += -0.1 * W.grad" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "--------------" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# JUST PUTTING THEM TOGETHER TO PERFORM GRADIENT DESCENT" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# ONLY RUN THIS THE FIRST TIME (re-running it re-initializes W and discards training progress)\n", "# randomly initialize 27 neurons' weights. each neuron receives 27 inputs\n", "g = torch.Generator().manual_seed(2147483647)\n", "W = torch.randn((27, 27), generator=g, requires_grad=True) # requires_grad=True is needed for the backward pass (just as we did in micrograd)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "# FORWARD PASS\n", "xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding\n", "logits = xenc @ W # predict log-counts\n", "counts = logits.exp() # counts, equivalent to N\n", "probs = counts / counts.sum(1, keepdims=True) # probabilities for next character\n", "loss = -probs[torch.arange(5), ys].log().mean() # same as before: pick each correct next-character probability, take the log, average, negate (negative log-likelihood)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.6891887187957764\n" ] } ], "source": [ "print(loss.item()) # CHECKING THE LOSS VALUE" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "# BACKWARD PASS\n", "W.grad = None # reset the accumulated gradient first (None acts like zero here)\n", "loss.backward()" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# UPDATE\n", "W.data += -0.1 * W.grad" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Yay, that worked nicely." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "----------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### **PUTTING THEM ALL TOGETHER**" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "number of examples: 228146\n" ] } ], "source": [ "# create the dataset\n", "xs, ys = [], []\n", "for w in words:\n", "    chs = ['.'] + list(w) + ['.']\n", "    for ch1, ch2 in zip(chs, chs[1:]):\n", "        ix1 = stoi[ch1]\n", "        ix2 = stoi[ch2]\n", "        xs.append(ix1)\n", "        ys.append(ix2)\n", "xs = torch.tensor(xs)\n", "ys = torch.tensor(ys)\n", "num = xs.nelement()\n", "print('number of examples: ', num)\n", "\n", "# initialize the 'network'\n", "g = torch.Generator().manual_seed(2147483647)\n", "W = torch.randn((27, 27), generator=g, requires_grad=True)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.7686190605163574\n", "3.378804922103882\n", "3.1610896587371826\n", "3.0271859169006348\n", "2.9344847202301025\n", "2.867231607437134\n", "2.816654920578003\n", "2.777147054672241\n", "2.7452545166015625\n", "2.7188305854797363\n", "2.6965057849884033\n", "2.6773722171783447\n", "2.6608052253723145\n", "2.6463513374328613\n", "2.633665084838867\n", "2.622471332550049\n", "2.6125471591949463\n", "2.6037065982818604\n", "2.595794439315796\n", "2.5886802673339844\n" ] } ], "source": [ "# gradient descent\n", "for k in range(20):\n", "    \n", "    # forward pass\n", "    xenc = F.one_hot(xs, num_classes=27).float() # input to the network: one-hot encoding\n", "    logits = xenc @ W # predict log-counts\n", "    counts = logits.exp() # counts, equivalent to N\n", "    probs = counts / counts.sum(1, keepdims=True) # probabilities for next character\n", "    loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean() # average NLL plus a small regularization term that acts like smoothing\n", "    print(loss.item())\n", "    \n", "    # backward pass\n", "    W.grad = None # reset the gradient\n", "    loss.backward()\n", "    \n", "    # update\n", "    W.data += -50 * W.grad" ] },
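{ "cell_type": "markdown", "metadata": {}, "source": [ "A small evaluation (an added sketch): the loss printed in the loop above includes the `0.01*(W**2).mean()` regularization term, so the pure average negative log-likelihood of the trained model is slightly lower. The cell below recomputes it over the full dataset without that term." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# added sketch: plain average NLL of the trained W over the whole dataset, without the regularization term\n", "with torch.no_grad():\n", "    xenc = F.one_hot(xs, num_classes=27).float()\n", "    counts = (xenc @ W).exp()\n", "    probs = counts / counts.sum(1, keepdim=True)\n", "    nll = -probs[torch.arange(num), ys].log().mean()\n", "print(nll.item())" ] },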
{ "cell_type": "markdown", "metadata": {}, "source": [ "So we almost achieve a very low loss value, similar to the loss we calculated in A-Main when we typed our own name and saw how the model performs." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "--------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "--------------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally, *drumrolls*, we are going to see how sampling from this model produces outputs (spoiler alert: they will look the same as the ones from the model we built manually, because... it is the same model, just built with a neural net)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "juwjde.\n", "janaqah.\n", "pxzfby.\n", "a.\n", "nn.\n" ] } ], "source": [ "# finally, sample from the 'neural net' model\n", "g = torch.Generator().manual_seed(2147483647)\n", "\n", "for i in range(5):\n", "    \n", "    out = []\n", "    ix = 0\n", "    while True:\n", "        \n", "        # ----------\n", "        # BEFORE:\n", "        #p = P[ix]\n", "        # ----------\n", "        # NOW:\n", "        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()\n", "        logits = xenc @ W # predict log-counts\n", "        counts = logits.exp() # counts, equivalent to N\n", "        p = counts / counts.sum(1, keepdims=True) # probabilities for next character\n", "        # ----------\n", "        \n", "        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()\n", "        out.append(itos[ix])\n", "        if ix == 0:\n", "            break\n", "    print(''.join(out))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "--------" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "---------" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.0" } }, "nbformat": 4, "nbformat_minor": 2 }