Upload sd_token_similarity_calculator.ipynb
sd_token_similarity_calculator.ipynb
CHANGED
@@ -17,7 +17,7 @@
 {
   "cell_type": "markdown",
   "source": [
-    "This Notebook is a Stable-diffusion tool which allows you to find similar tokens from the SD 1.5 vocab.json that you can use for text-to-image generation."
+    "This Notebook is a Stable-diffusion tool which allows you to find similar tokens from the SD 1.5 vocab.json that you can use for text-to-image generation. Try this free online SD 1.5 generator with the results: https://perchance.org/fusion-ai-image-generator"
   ],
   "metadata": {
     "id": "L7JTcbOdBPfh"
@@ -26,6 +26,7 @@
 {
   "cell_type": "code",
   "source": [
+    "# @title Load/initialize values\n",
     "# Load the tokens into the colab\n",
     "!git clone https://huggingface.co/datasets/codeShare/sd_tokens\n",
     "import torch\n",
@@ -70,6 +71,9 @@
     " return result\n",
     "#----#\n",
     "\n",
+    "#print(vocab[8922]) #the vocab item for ID 8922\n",
+    "#print(token[8922].shape) #dimension of the token\n",
+    "\n",
     "mix_with = \"\"\n",
     "mix_method = \"None\""
   ],
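For orientation while reading the hunks above: a minimal sketch of what the load/inspect lines amount to once the sd_tokens dataset is cloned. The file names under sd_tokens/ are assumptions, not confirmed by this diff; the notebook itself defines vocab, token and NUM_TOKENS in the cell being patched.

    # Minimal sketch, not the notebook's exact cell. Paths below are assumptions.
    import json
    import torch

    # after: !git clone https://huggingface.co/datasets/codeShare/sd_tokens
    with open("sd_tokens/vocab.json", "r", encoding="utf-8") as f:      # assumed path
        vocab = {int(i): name for name, i in json.load(f).items()}      # ID -> token name

    token = torch.load("sd_tokens/sd15_tensors.pt")                     # assumed file name; shape [49408, 768]
    NUM_TOKENS = token.shape[0]

    print(vocab[8922])        # the vocab item for ID 8922
    print(token[8922].shape)  # dimension of one token embedding, e.g. torch.Size([768])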
@@ -82,29 +86,7 @@
 {
   "cell_type": "code",
   "source": [
-    "#print(vocab[8922]) #the vocab item for ID 8922\n",
-    "#print(token[8922].shape) #dimension of the token"
-  ],
-  "metadata": {
-    "id": "S_Yh9gH_OUA1"
-  },
-  "execution_count": null,
-  "outputs": []
-},
-{
-  "cell_type": "markdown",
-  "source": [
-    "Get the IDs from a prompt text.\n",
-    "\n",
-    "The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens. Leave the field empty to get a random value tensor"
-  ],
-  "metadata": {
-    "id": "f1-jS7YJApiO"
-  }
-},
-{
-  "cell_type": "code",
-  "source": [
+    "# @title Tokenize prompt into IDs\n",
     "from transformers import AutoTokenizer\n",
     "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
     "\n",
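The "Tokenize prompt into IDs" cell added above uses the CLIP tokenizer shipped with transformers; the recorded output [49406, 8922, 49407] is a single prompt token wrapped by the start/end markers. A runnable sketch (the example prompt string is an assumption):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "openai/clip-vit-large-patch14", clean_up_tokenization_spaces=False)

    prompt = "photo"  # hypothetical example; any single-token prompt gives [49406, <id>, 49407]
    input_ids = tokenizer(text=prompt)["input_ids"]
    print(input_ids)  # first entry is always 49406 (<|startoftext|>), last is 49407 (<|endoftext|>)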
@@ -128,34 +110,46 @@
     "#Save a copy of the tensor A\n",
     "id_P = input_ids[1]\n",
     "P = token[id_A]\n",
-    "_P = LA.vector_norm(A, ord=2)"
+    "_P = LA.vector_norm(A, ord=2)\n",
+    "\n",
+    "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
+    "\n",
+    "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is a random value, it will not correspond to any tensor in the vocab.json list, and thus it will have no ID."
   ],
   "metadata": {
-    "id": "RPdkYzT2_X85"
+    "id": "RPdkYzT2_X85",
+    "colab": {
+      "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "e335f5da-b26d-4eea-f854-fd646444ea14"
   },
-  "execution_count":
-  "outputs": [
-
-
-
-
-
-
-
-
-  }
+  "execution_count": 15,
+  "outputs": [
+    {
+      "output_type": "stream",
+      "name": "stdout",
+      "text": [
+        "[49406, 8922, 49407]\n"
+      ]
+    }
+  ]
 },
 {
   "cell_type": "code",
   "source": [
+    "# @title Take the ID at index 1 from above result and modify it (optional)\n",
     "mix_with = \"\" # @param {type:'string'}\n",
     "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
     "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
     "\n",
-    "
-    "
-    "
-    "
+    "#------#\n",
+    "#If set to TRUE, this will use the output of this cell, tensor A, as the input of this cell the 2nd time we run it. Use this feature to mix many tokens into A\n",
+    "re_iterate_tensor_A = True # @param {\"type\":\"boolean\"}\n",
+    "if (re_iterate_tensor_A == False) :\n",
+    " #prevent re-iterating A by reading from stored copy\n",
+    " id_A = id_P\n",
+    " A = P\n",
+    " _A = _P\n",
     "#----#\n",
     "\n",
     "tokenizer_output = tokenizer(text = mix_with)\n",
@@ -187,7 +181,7 @@
     " _A = LA.vector_norm(A, ord=2)\n",
     " print(\"Tokenized prompt tensor A has been recalculated as A = (w*_A + (1-w)*_C) * norm(w*A - (1-w)*C) , where C is the tokenized prompt 'mix_with' tensor C\")\n",
     "\n",
-    "
+    "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor"
   ],
   "metadata": {
     "id": "oXbNSRSKPgRr"
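The mix step printed in the hunk above recalculates A from the 'mix_with' tensor C using the slider weight w. A standalone sketch of that arithmetic, assuming A and C are 1-D token embeddings; the Average branch and the exact branch layout are assumptions, since the diff only shows the Subtract printout:

    import torch
    from torch import linalg as LA

    def mix(A: torch.Tensor, C: torch.Tensor, w: float, mix_method: str = "None") -> torch.Tensor:
        _A = LA.vector_norm(A, ord=2)   # L2 norm of A
        _C = LA.vector_norm(C, ord=2)   # L2 norm of C
        if mix_method == "Average":
            mixed = w * A + (1 - w) * C   # assumed counterpart of the Subtract branch
        elif mix_method == "Subtract":
            mixed = w * A - (1 - w) * C   # as in the printout: A = (w*_A + (1-w)*_C) * norm(w*A - (1-w)*C)
        else:
            return A                      # "None": keep A unchanged
        # rescale the mixed direction to the weighted magnitude of the two inputs
        return (w * _A + (1 - w) * _C) * mixed / LA.vector_norm(mixed, ord=2)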
@@ -195,19 +189,11 @@
   "execution_count": null,
   "outputs": []
 },
-{
-  "cell_type": "markdown",
-  "source": [
-    "Produce a list of IDs that are most similar to the prompt ID at position 1 based on above result"
-  ],
-  "metadata": {
-    "id": "3uBSZ1vWVCew"
-  }
-},
 {
   "cell_type": "code",
   "source": [
     "\n",
+    "# @title Find Similar Tokens to ID at index 1 from above result\n",
     "dots = torch.zeros(NUM_TOKENS)\n",
     "for index in range(NUM_TOKENS):\n",
     " id_B = index\n",
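The loop added above fills dots with one cosine similarity per vocab entry and then sorts it. A vectorized equivalent, assuming token is the [NUM_TOKENS, 768] embedding matrix and A is the (possibly mixed) prompt tensor:

    import torch

    def rank_all_tokens(A: torch.Tensor, token: torch.Tensor):
        # cosine similarity of A against every row of the embedding matrix in one call
        dots = torch.nn.functional.cosine_similarity(A.unsqueeze(0), token, dim=1)
        sorted_vals, indices = torch.sort(dots, descending=True)
        return sorted_vals, indices  # indices[0] is the ID most similar to A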
@@ -234,7 +220,9 @@
     "if (mix_method == \"Subtract\"):\n",
     " print(f'Calculated all cosine-similarities between the subtract of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
     "if (mix_method == \"None\"):\n",
-    " print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')"
+    " print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
+    "\n",
+    "#Produce a list of IDs that are most similar to the prompt ID at position 1 based on above result"
   ],
   "metadata": {
     "id": "juxsvco9B0iV"
@@ -242,20 +230,11 @@
   "execution_count": null,
   "outputs": []
 },
-{
-  "cell_type": "markdown",
-  "source": [
-    "Print the sorted list from above result"
-  ],
-  "metadata": {
-    "id": "y-Ig3glrVQC3"
-  }
-},
 {
   "cell_type": "code",
   "source": [
+    "# @title Print Result from the 'Similar Tokens' list from above result\n",
     "list_size = 100 # @param {type:'number'}\n",
-    "\n",
     "print_ID = False # @param {type:\"boolean\"}\n",
     "print_Similarity = True # @param {type:\"boolean\"}\n",
     "print_Name = True # @param {type:\"boolean\"}\n",
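The print cell walks the sorted result and shows up to list_size entries, gated by the boolean form fields. A compact sketch of that loop, assuming sorted_vals/indices come from torch.sort(dots, descending=True) and vocab maps an ID to its token name; the helper name is hypothetical:

    def print_top(sorted_vals, indices, vocab, list_size=100,
                  print_ID=False, print_Similarity=True, print_Name=True, print_Divider=True):
        for rank in range(list_size):
            token_id = indices[rank].item()
            if print_Name:
                print(f'name = {vocab[token_id]}')
            if print_ID:
                print(f'ID = {token_id}')
            if print_Similarity:
                print(f'similarity = {round(sorted_vals[rank].item() * 100, 2)} %')
            if print_Divider:
                print('--------')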
@@ -270,7 +249,9 @@
     " if (print_Similarity):\n",
     " print(f'similarity = {round(sorted[index].item()*100,2)} %') # % value\n",
     " if (print_Divider):\n",
-    " print('--------')"
+    " print('--------')\n",
+    "\n",
+    "#Print the sorted list from above result"
   ],
   "metadata": {
     "id": "YIEmLAzbHeuo",
@@ -279,33 +260,19 @@
   "execution_count": null,
   "outputs": []
 },
-{
-  "cell_type": "markdown",
-  "source": [
-    "Find the most similar Tokens for given input"
-  ],
-  "metadata": {
-    "id": "qqZ5DvfLBJnw"
-  }
-},
-{
-  "cell_type": "markdown",
-  "source": [
-    "Valid ID ranges for id_for_token_A / id_for_token_B are between 0 and 49407"
-  ],
-  "metadata": {
-    "id": "kX72bAuhOtlT"
-  }
-},
 {
   "cell_type": "code",
   "source": [
+    "\n",
+    "# @title Get similarity % of two token IDs\n",
     "id_for_token_A = 4567 # @param {type:'number'}\n",
     "id_for_token_B = 4343 # @param {type:'number'}\n",
     "\n",
     "similarity_str = 'The similarity between tokens A and B is ' + similarity(id_for_token_A , id_for_token_B)\n",
     "\n",
-    "print(similarity_str)"
+    "print(similarity_str)\n",
+    "\n",
+    "#Valid ID ranges for id_for_token_A / id_for_token_B are between 0 and 49407"
   ],
   "metadata": {
     "id": "MwmOdC9cNZty"
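The last cell calls a similarity(id_A, id_B) helper that is defined earlier in the notebook but not shown in this diff. A hedged sketch of what such a helper can look like, assuming token is the embedding matrix loaded at the top and that the helper returns a percentage string (valid IDs are 0 to 49407):

    import torch

    def similarity(id_A: int, id_B: int) -> str:
        # 'token' is the notebook's global [49408, 768] embedding matrix (assumed here)
        A = token[id_A]
        B = token[id_B]
        cos = torch.nn.functional.cosine_similarity(A.unsqueeze(0), B.unsqueeze(0), dim=1)
        return f'{round(cos.item() * 100, 2)} %'

    # print('The similarity between tokens A and B is ' + similarity(4567, 4343))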