{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "# Load the tokens into the colab\n",
        "import os\n",
        "\n",
        "import torch\n",
        "from torch import linalg as LA\n",
        "\n",
        "# Clone only when the checkout is missing. The `%cd` below moves the kernel\n",
        "# into the repository, so an unguarded re-run of this cell would clone a\n",
        "# second, nested copy at /content/sd_tokens/sd_tokens.\n",
        "if not os.path.isdir('/content/sd_tokens'):\n",
        "  !git clone https://huggingface.co/datasets/codeShare/sd_tokens /content/sd_tokens\n",
        "\n",
        "# Use the GPU when one is available, otherwise fall back to the CPU.\n",
        "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
        "%cd /content/sd_tokens\n",
        "\n",
        "# Load the saved tensors onto the selected device.\n",
        "token = torch.load('sd15_tensors.pt', map_location=device, weights_only=True)"
      ],
      "metadata": {
        "id": "Ch9puvwKH1s3"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "print(token[100].size())  # shape of a single token tensor (entry 100 used as a sample)"
      ],
      "metadata": {
        "id": "S_Yh9gH_OUA1"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "def absolute_value(x):\n",
        "    \"\"\"Return the magnitude of ``x``.\n",
        "\n",
        "    Uses the built-in ``abs`` so that negative zero maps to ``0.0``; a\n",
        "    ``max(x, -x)`` formulation hands ``-0.0`` back unchanged, which would\n",
        "    then be displayed with a stray minus sign.\n",
        "    \"\"\"\n",
        "    return abs(x)\n",
        "\n",
        "def similarity(id_A , id_B):\n",
        "  \"\"\"Return the absolute cosine similarity of two tokens as a percentage string.\n",
        "\n",
        "  ``id_A`` and ``id_B`` index the module-level ``token`` collection. The value\n",
        "  is |A . B| / (||A|| * ||B||) * 100, rounded to 3 decimals and suffixed\n",
        "  with ' %'. Assumes each ``token[id]`` is a 1-D tensor, as ``torch.dot``\n",
        "  requires -- TODO confirm against the loaded file.\n",
        "  \"\"\"\n",
        "  vec_a, vec_b = token[id_A], token[id_B]\n",
        "\n",
        "  # Product of the two Euclidean (L2) lengths, i.e. (a^2 + b^2 + ...)^(1/2) each\n",
        "  norm_product = LA.vector_norm(vec_a, ord=2) * LA.vector_norm(vec_b, ord=2)\n",
        "\n",
        "  cosine = torch.dot(vec_a, vec_b) / norm_product\n",
        "  percent = round(absolute_value(cosine.item()*100), 3)\n",
        "\n",
        "  return f'{percent} %'"
      ],
      "metadata": {
        "id": "fxquCxFaUxAZ"
      },
      "execution_count": 16,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "Valid IDs for `id_for_token_A` / `id_for_token_B` range from 0 to 49407."
      ],
      "metadata": {
        "id": "kX72bAuhOtlT"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "id_for_token_A = 4567 # @param {type:'integer'}\n",
        "id_for_token_B = 4343 # @param {type:'integer'}\n",
        "\n",
        "# The IDs are used as indices by similarity(), so the form fields are\n",
        "# integer-typed rather than accepting arbitrary numbers.\n",
        "similarity_str =  'The similarity between tokens A and B is ' + similarity(id_for_token_A , id_for_token_B)\n",
        "\n",
        "print(similarity_str)"
      ],
      "metadata": {
        "id": "MwmOdC9cNZty"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}