Upload sd_token_similarity_calculator.ipynb

sd_token_similarity_calculator.ipynb CHANGED (+436 -314)

@@ -122,10 +122,30 @@
| 122 |   "!git clone https://huggingface.co/datasets/codeShare/text-to-image-prompts\n"
| 123 |   ],
| 124 |   "metadata": {
| 125 | - "id": "rUXQ73IbonHY"
| 126 |   },
| 127 | - "execution_count":
| 128 | - "outputs": []
| 129 |   },
| 130 |   {
| 131 |   "cell_type": "code",

@@ -209,7 +229,7 @@
| 209 |   "base_uri": "https://localhost:8080/"
| 210 |   }
| 211 |   },
| 212 | - "execution_count":
| 213 |   "outputs": [
| 214 |   {
| 215 |   "output_type": "stream",

@@ -487,7 +507,7 @@
| 487 |   "metadata": {
| 488 |   "id": "xc-PbIYF428y"
| 489 |   },
| 490 | - "execution_count":
| 491 |   "outputs": []
| 492 |   },
| 493 |   {

@@ -541,7 +561,7 @@
| 541 |   "base_uri": "https://localhost:8080/"
| 542 |   }
| 543 |   },
| 544 | - "execution_count":
| 545 |   "outputs": [
| 546 |   {
| 547 |   "output_type": "stream",

@@ -921,6 +941,161 @@
| 921 |   }
| 922 |   ]
| 923 |   },
| 924 |   {
| 925 |   "cell_type": "code",
| 926 |   "source": [

@@ -1240,330 +1415,34 @@
| 1240 |   {
| 1241 |   "cell_type": "code",
| 1242 |   "source": [
| 1243 | - "# @title 💫 Compare Text encodings\n",
| 1244 | - "prompt_A = \"banana\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
| 1245 | - "prompt_B = \"bike \" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
| 1246 | - "use_token_padding = True # param {type:\"boolean\"} <----- Enabled by default\n",
| 1247 | - "#-----#\n",
| 1248 | - "from transformers import AutoTokenizer\n",
| 1249 | - "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\",\n",
| 1250 | - "clean_up_tokenization_spaces = False)\n",
| 1251 | - "#-----#\n",
| 1252 | - "from transformers import CLIPProcessor, CLIPModel\n",
| 1253 | - "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
| 1254 | - "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
| 1255 | - "#----#\n",
| 1256 | - "inputs = tokenizer(text = prompt_A, padding=True, return_tensors=\"pt\")\n",
| 1257 | - "text_features_A = model.get_text_features(**inputs)\n",
| 1258 | - "text_features_A = text_features_A / text_features_A.norm(p=2, dim=-1, keepdim=True)\n",
| 1259 | - "name_A = prompt_A\n",
| 1260 | - "#----#\n",
| 1261 | - "inputs = tokenizer(text = prompt_B, padding=True, return_tensors=\"pt\")\n",
| 1262 | - "text_features_B = model.get_text_features(**inputs)\n",
| 1263 | - "text_features_B = text_features_B / text_features_B.norm(p=2, dim=-1, keepdim=True)\n",
| 1264 | - "name_B = prompt_B\n",
| 1265 | - "#----#\n",
| 1266 | - "import torch\n",
| 1267 | - "sim_AB = torch.nn.functional.cosine_similarity(text_features_A, text_features_B)\n",
| 1268 | - "#----#\n",
| 1269 | - "print(f'The similarity between the text_encoding for A:\"{prompt_A}\" and B: \"{prompt_B}\" is {round(sim_AB.item()*100,2)} %')"
| 1270 |   ],
| 1271 |   "metadata": {
| 1272 | - "id": "QQOjh5BvnG8M",
| 1273 | - "collapsed": true,
| 1274 | - "cellView": "form"
| 1275 |   },
| 1276 |   "execution_count": null,
| 1277 |   "outputs": []
| 1278 |   },
| 1279 |   {
| 1280 | - "cell_type": "code",
| 1281 |   "source": [
| 1282 | - "# @title ⚡ Get similiar tokens (not updated yet)\n",
| 1283 | - "import torch\n",
| 1284 | - "from transformers import AutoTokenizer\n",
| 1285 | - "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
| 1286 |   "\n",
| 1287 | - "# @markdown Write name of token to match against\n",
| 1288 | - "token_name = \"banana \" # @param {type:'string',\"placeholder\":\"leave empty for random value token\"}\n",
| 1289 |   "\n",
| 1290 | - "prompt = token_name\n",
| 1291 | - "# @markdown (optional) Mix the token with something else\n",
| 1292 | - "mix_with = \"\" # @param {\"type\":\"string\",\"placeholder\":\"leave empty for random value token\"}\n",
| 1293 | - "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
| 1294 | - "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
| 1295 | - "# @markdown Limit char size of included token\n",
| 1296 |   "\n",
| 1297 | - "min_char_size = 0 # param {type:\"slider\", min:0, max: 50, step:1}\n",
| 1298 | - "char_range = 50 # param {type:\"slider\", min:0, max: 50, step:1}\n",
| 1299 |   "\n",
| 1300 | - "tokenizer_output = tokenizer(text = prompt)\n",
| 1301 | - "input_ids = tokenizer_output['input_ids']\n",
| 1302 | - "id_A = input_ids[1]\n",
| 1303 | - "A = torch.tensor(token[id_A])\n",
| 1304 | - "A = A/A.norm(p=2, dim=-1, keepdim=True)\n",
| 1305 | - "#-----#\n",
| 1306 | - "tokenizer_output = tokenizer(text = mix_with)\n",
| 1307 | - "input_ids = tokenizer_output['input_ids']\n",
| 1308 | - "id_C = input_ids[1]\n",
| 1309 | - "C = torch.tensor(token[id_C])\n",
| 1310 | - "C = C/C.norm(p=2, dim=-1, keepdim=True)\n",
| 1311 | - "#-----#\n",
| 1312 | - "sim_AC = torch.dot(A,C)\n",
| 1313 | - "#-----#\n",
| 1314 | - "print(input_ids)\n",
| 1315 | - "#-----#\n",
| 1316 |   "\n",
| 1317 | - "#if no imput exists we just randomize the entire thing\n",
| 1318 | - "if (prompt == \"\"):\n",
| 1319 | - " id_A = -1\n",
| 1320 | - " print(\"Tokenized prompt tensor A is a random valued tensor with no ID\")\n",
| 1321 | - " R = torch.rand(A.shape)\n",
| 1322 | - " R = R/R.norm(p=2, dim=-1, keepdim=True)\n",
| 1323 | - " A = R\n",
| 1324 | - " name_A = 'random_A'\n",
| 1325 |   "\n",
| 1326 | - "#if no imput exists we just randomize the entire thing\n",
| 1327 | - "if (mix_with == \"\"):\n",
| 1328 | - " id_C = -1\n",
| 1329 | - " print(\"Tokenized prompt 'mix_with' tensor C is a random valued tensor with no ID\")\n",
| 1330 | - " R = torch.rand(A.shape)\n",
| 1331 | - " R = R/R.norm(p=2, dim=-1, keepdim=True)\n",
| 1332 | - " C = R\n",
| 1333 | - " name_C = 'random_C'\n",
| 1334 |   "\n",
| 1335 | - "name_A = \"A of random type\"\n",
| 1336 | - "if (id_A>-1):\n",
| 1337 | - " name_A = vocab(id_A)\n",
| 1338 |   "\n",
| 1339 | - "name_C = \"token C of random type\"\n",
| 1340 | - "if (id_C>-1):\n",
| 1341 | - " name_C = vocab(id_C)\n",
| 1342 | - "\n",
| 1343 | - "print(f\"The similarity between A '{name_A}' and C '{name_C}' is {round(sim_AC.item()*100,2)} %\")\n",
| 1344 | - "\n",
| 1345 | - "if (mix_method == \"None\"):\n",
| 1346 | - " print(\"No operation\")\n",
| 1347 | - "\n",
| 1348 | - "if (mix_method == \"Average\"):\n",
| 1349 | - " A = w*A + (1-w)*C\n",
| 1350 | - " _A = LA.vector_norm(A, ord=2)\n",
| 1351 | - " print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = w*A + (1-w)*C , where C is '{name_C}' token , for w = {w} \")\n",
| 1352 | - "\n",
| 1353 | - "if (mix_method == \"Subtract\"):\n",
| 1354 | - " tmp = w*A - (1-w)*C\n",
| 1355 | - " tmp = tmp/tmp.norm(p=2, dim=-1, keepdim=True)\n",
| 1356 | - " A = tmp\n",
| 1357 | - " #//---//\n",
| 1358 | - " print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = _A*norm(w*A - (1-w)*C) , where C is '{name_C}' token , for w = {w} \")\n",
| 1359 | - "\n",
| 1360 | - "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor\n",
| 1361 | - "\n",
| 1362 | - "dots = torch.zeros(NUM_TOKENS)\n",
| 1363 | - "for index in range(NUM_TOKENS):\n",
| 1364 | - " id_B = index\n",
| 1365 | - " B = torch.tensor(token[id_B])\n",
| 1366 | - " B = B/B.norm(p=2, dim=-1, keepdim=True)\n",
| 1367 | - " sim_AB = torch.dot(A,B)\n",
| 1368 | - " dots[index] = sim_AB\n",
| 1369 | - "\n",
| 1370 | - "\n",
| 1371 | - "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
| 1372 | - "#----#\n",
| 1373 | - "if (mix_method == \"Average\"):\n",
| 1374 | - " print(f'Calculated all cosine-similarities between the average of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
| 1375 | - "if (mix_method == \"Subtract\"):\n",
| 1376 | - " print(f'Calculated all cosine-similarities between the subtract of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
| 1377 | - "if (mix_method == \"None\"):\n",
| 1378 | - " print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
| 1379 | - "\n",
| 1380 | - "#Produce a list id IDs that are most similiar to the prompt ID at positiion 1 based on above result\n",
| 1381 | - "\n",
| 1382 | - "# @markdown Set print options\n",
| 1383 | - "list_size = 100 # @param {type:'number'}\n",
| 1384 | - "print_ID = False # @param {type:\"boolean\"}\n",
| 1385 | - "print_Similarity = True # @param {type:\"boolean\"}\n",
| 1386 | - "print_Name = True # @param {type:\"boolean\"}\n",
| 1387 | - "print_Divider = True # @param {type:\"boolean\"}\n",
| 1388 | - "\n",
| 1389 | - "\n",
| 1390 | - "if (print_Divider):\n",
| 1391 | - " print('//---//')\n",
| 1392 | - "\n",
| 1393 | - "print('')\n",
| 1394 | - "print('Here is the result : ')\n",
| 1395 | - "print('')\n",
| 1396 | - "\n",
| 1397 | - "for index in range(list_size):\n",
| 1398 | - " id = indices[index].item()\n",
| 1399 | - " if (print_Name):\n",
| 1400 | - " print(f'{vocab(id)}') # vocab item\n",
| 1401 | - " if (print_ID):\n",
| 1402 | - " print(f'ID = {id}') # IDs\n",
| 1403 | - " if (print_Similarity):\n",
| 1404 | - " print(f'similiarity = {round(sorted[index].item()*100,2)} %')\n",
| 1405 | - " if (print_Divider):\n",
| 1406 | - " print('--------')\n",
| 1407 | - "\n",
| 1408 | - "#Print the sorted list from above result\n",
| 1409 | - "\n",
| 1410 | - "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
| 1411 | - "\n",
| 1412 | - "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is random value, it will not correspond to any tensor in the vocab.json list , and this it will have no ID.\n",
| 1413 | - "\n",
| 1414 | - "# Save results as .db file\n",
| 1415 | - "import shelve\n",
| 1416 | - "VOCAB_FILENAME = 'tokens_most_similiar_to_' + name_A.replace('</w>','').strip()\n",
| 1417 | - "d = shelve.open(VOCAB_FILENAME)\n",
| 1418 | - "#NUM TOKENS == 49407\n",
| 1419 | - "for index in range(NUM_TOKENS):\n",
| 1420 | - " #print(d[f'{index}']) #<-----Use this to read values from the .db file\n",
| 1421 | - " d[f'{index}']= vocab(indices[index].item()) #<---- write values to .db file\n",
| 1422 | - "#----#\n",
| 1423 | - "d.close() #close the file\n",
| 1424 | - "# See this link for additional stuff to do with shelve: https://docs.python.org/3/library/shelve.html"
| 1425 | - ],
| 1426 | - "metadata": {
| 1427 | - "id": "iWeFnT1gAx6A"
| 1428 | - },
| 1429 | - "execution_count": null,
| 1430 | - "outputs": []
| 1431 | - },
| 1432 | - {
| 1433 | - "cell_type": "markdown",
| 1434 | - "source": [
| 1435 | - "You can write a URL or upload a file locally from your device to use as reference. The image will be saved in the 'sd_tokens' folder. Note that the 'sd_tokens' folder will be deleted upon exiting this runtime."
| 1436 | - ],
| 1437 | - "metadata": {
| 1438 | - "id": "hyK423TQCRup"
| 1439 | - }
| 1440 | - },
| 1441 | - {
| 1442 | - "cell_type": "code",
| 1443 | - "source": [
| 1444 | - "%cd /content/\n",
| 1445 | - "!git clone https://huggingface.co/datasets/codeShare/text-to-image-prompts"
| 1446 | - ],
| 1447 | - "metadata": {
| 1448 | - "id": "GPAUFxZgPfrY"
| 1449 | - },
| 1450 | - "execution_count": null,
| 1451 | - "outputs": []
| 1452 | - },
| 1453 | - {
| 1454 | - "cell_type": "code",
| 1455 | - "source": [
| 1456 | - "# @title Make your own text_encodings .pt file for later use (using GPU is recommended to speed things up)\n",
| 1457 | - "\n",
| 1458 | - "import json\n",
| 1459 | - "import pandas as pd\n",
| 1460 | - "import os\n",
| 1461 | - "import shelve\n",
| 1462 | - "import torch\n",
| 1463 | - "from safetensors.torch import save_file\n",
| 1464 | - "\n",
| 1465 | - "def my_mkdirs(folder):\n",
| 1466 | - " if os.path.exists(folder)==False:\n",
| 1467 | - " os.makedirs(folder)\n",
| 1468 | - "\n",
| 1469 | - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
| 1470 | - "from transformers import AutoTokenizer\n",
| 1471 | - "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
| 1472 | - "from transformers import CLIPProcessor, CLIPModel\n",
| 1473 | - "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
| 1474 | - "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\").to(device)\n",
| 1475 | - "\n",
| 1476 | - "%cd /content/\n",
| 1477 | - "\n",
| 1478 | - "my_mkdirs('/content/text_encodings/')\n",
| 1479 | - "filename = ''\n",
| 1480 | - "\n",
| 1481 | - "NUM_FILES = 10\n",
| 1482 | - "\n",
| 1483 | - "for file_index in range(NUM_FILES + 1):\n",
| 1484 | - " if file_index <1: continue\n",
| 1485 | - " #if file_index >4: break\n",
| 1486 | - " filename = f'🧿📘 fusion-t2i-civitai-0-20-chars-mix-{file_index}'\n",
| 1487 | - " #🦜 fusion-t2i-prompt-features-1.json\n",
| 1488 | - "\n",
| 1489 | - " # Read suffix.json\n",
| 1490 | - " %cd /content/text-to-image-prompts/civitai-prompts/blue/text\n",
| 1491 | - " with open(filename + '.json', 'r') as f:\n",
| 1492 | - " data = json.load(f)\n",
| 1493 | - " _df = pd.DataFrame({'count': data})['count']\n",
| 1494 | - " prompts = {\n",
| 1495 | - " key : value for key, value in _df.items()\n",
| 1496 | - " }\n",
| 1497 | - " NUM_ITEMS = int(prompts[\"0\"])\n",
| 1498 | - " #------#\n",
| 1499 | - "\n",
| 1500 | - " # Calculate text_encoding for .json file contents and results as .db file\n",
| 1501 | - "\n",
| 1502 | - " %cd /content/text_encodings/\n",
| 1503 | - " text_encoding_dict = {}\n",
| 1504 | - " for index in range(NUM_ITEMS + 1):\n",
| 1505 | - " inputs = tokenizer(text = '' + prompts[f'{index}'], padding=True, return_tensors=\"pt\").to(device)\n",
| 1506 | - " text_features = model.get_text_features(**inputs).to(device)\n",
| 1507 | - " text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True).to(device)\n",
| 1508 | - " text_encoding_dict[f'{index}'] = text_features.to('cpu')\n",
| 1509 | - " save_file(text_encoding_dict, f'{filename}.safetensors')\n",
| 1510 | - " #----#\n",
| 1511 | - "\n",
| 1512 | - "#from safetensors.torch import load_file\n",
| 1513 | - "#%cd /content/text_encodings\n",
| 1514 | - "#loaded = load_file('🦜 fusion-t2i-prompt-features-1.safetensors')\n",
| 1515 | - "#print(loaded[\"325\"])"
| 1516 | - ],
| 1517 | - "metadata": {
| 1518 | - "id": "9ZiTsF9jV0TV"
| 1519 | - },
| 1520 | - "execution_count": null,
| 1521 | - "outputs": []
| 1522 | - },
| 1523 | - {
| 1524 | - "cell_type": "code",
| 1525 | - "source": [
| 1526 | - "# @title Download the created JSON as .zip file\n",
| 1527 | - "%cd /content/\n",
| 1528 | - "!zip -r /content/blue.zip /content/text-to-image-prompts/civitai-prompts/blue/text"
| 1529 | - ],
| 1530 | - "metadata": {
| 1531 | - "id": "gX-sHZPWj4Lt"
| 1532 | - },
| 1533 | - "execution_count": null,
| 1534 | - "outputs": []
| 1535 | - },
| 1536 | - {
| 1537 | - "cell_type": "code",
| 1538 | - "source": [
| 1539 | - "# @title Download the created text_encodings as .zip file\n",
| 1540 | - "%cd /content/\n",
| 1541 | - "!zip -r /content/text-encodings.zip /content/text_encodings"
| 1542 | - ],
| 1543 | - "metadata": {
| 1544 | - "id": "b3DUPYfskAIc"
| 1545 | - },
| 1546 | - "execution_count": null,
| 1547 | - "outputs": []
| 1548 | - },
| 1549 | - {
| 1550 | - "cell_type": "markdown",
| 1551 | - "source": [
| 1552 | - "\n",
| 1553 | - "\n",
| 1554 | - "# How does this notebook work?\n",
| 1555 | - "\n",
| 1556 | - "Similar vectors = similar output in the SD 1.5 / SDXL / FLUX models\n",
| 1557 | - "\n",
| 1558 | - "CLIP converts the prompt text to vectors (“tensors”), with float32 values usually ranging from -1 to 1.\n",
| 1559 | - "\n",
| 1560 | - "Dimensions are \[ 1x768 ] tensors for SD 1.5, and a \[ 1x768 , 1x1024 ] tensor for SDXL and FLUX.\n",
| 1561 | - "\n",
| 1562 | - "The SD models and FLUX convert these vectors to an image.\n",
| 1563 | - "\n",
| 1564 | - "This notebook takes an input string, tokenizes it and matches the first token against the 49407 token vectors in the vocab.json: [https://huggingface.co/black-forest-labs/FLUX.1-dev/tree/main/tokenizer](https://huggingface.co/black-forest-labs/FLUX.1-dev/tree/main/tokenizer)\n",
| 1565 | - "\n",
| 1566 | - "It finds the “most similar tokens” in the list. Similarity is the cosine of the angle θ between the token vectors.\n",
| 1567 |   "\n",
| 1568 |   "<div>\n",
| 1569 |   "<img src=\"https://huggingface.co/datasets/codeShare/sd_tokens/resolve/main/cosine.jpeg\" width=\"300\"/>\n",

@@ -1956,6 +1835,249 @@
| 1956 |   },
| 1957 |   "execution_count": null,
| 1958 |   "outputs": []
| 1959 |   }
| 1960 |   ]
| 1961 |   }

@@ -122,10 +122,30 @@
| 122 |   "!git clone https://huggingface.co/datasets/codeShare/text-to-image-prompts\n"
| 123 |   ],
| 124 |   "metadata": {
| 125 | + "id": "rUXQ73IbonHY",
| 126 | + "outputId": "aa0e25d1-f6b8-46ad-c1c1-0ccd70952cff",
| 127 | + "colab": {
| 128 | + "base_uri": "https://localhost:8080/"
| 129 | + }
| 130 |   },
| 131 | + "execution_count": 1,
| 132 | + "outputs": [
| 133 | + {
| 134 | + "output_type": "stream",
| 135 | + "name": "stdout",
| 136 | + "text": [
| 137 | + "/content\n",
| 138 | + "Cloning into 'text-to-image-prompts'...\n",
| 139 | + "remote: Enumerating objects: 478, done.\u001b[K\n",
| 140 | + "remote: Counting objects: 100% (475/475), done.\u001b[K\n",
| 141 | + "remote: Compressing objects: 100% (452/452), done.\u001b[K\n",
| 142 | + "remote: Total 478 (delta 82), reused 0 (delta 0), pack-reused 3 (from 1)\u001b[K\n",
| 143 | + "Receiving objects: 100% (478/478), 1.93 MiB | 6.96 MiB/s, done.\n",
| 144 | + "Resolving deltas: 100% (82/82), done.\n",
| 145 | + "Filtering content: 100% (95/95), 305.98 MiB | 59.56 MiB/s, done.\n"
| 146 | + ]
| 147 | + }
| 148 | + ]
| 149 |   },
| 150 |   {
| 151 |   "cell_type": "code",

@@ -209,7 +229,7 @@
| 229 |   "base_uri": "https://localhost:8080/"
| 230 |   }
| 231 |   },
| 232 | + "execution_count": null,
| 233 |   "outputs": [
| 234 |   {
| 235 |   "output_type": "stream",

@@ -487,7 +507,7 @@
| 507 |   "metadata": {
| 508 |   "id": "xc-PbIYF428y"
| 509 |   },
| 510 | + "execution_count": null,
| 511 |   "outputs": []
| 512 |   },
| 513 |   {

@@ -541,7 +561,7 @@
| 561 |   "base_uri": "https://localhost:8080/"
| 562 |   }
| 563 |   },
| 564 | + "execution_count": null,
| 565 |   "outputs": [
| 566 |   {
| 567 |   "output_type": "stream",

@@ -921,6 +941,161 @@
| 941 |   }
| 942 |   ]
| 943 |   },
| 944 | + {
| 945 | + "cell_type": "code",
| 946 | + "source": [
| 947 | + "# @title 💫 Compare Text encodings\n",
| 948 | + "prompt_A = \"banana\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
| 949 | + "prompt_B = \"bike \" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
| 950 | + "use_token_padding = True # param {type:\"boolean\"} <----- Enabled by default\n",
| 951 | + "#-----#\n",
| 952 | + "from transformers import AutoTokenizer\n",
| 953 | + "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\",\n",
| 954 | + "clean_up_tokenization_spaces = False)\n",
| 955 | + "#-----#\n",
| 956 | + "from transformers import CLIPProcessor, CLIPModel\n",
| 957 | + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
| 958 | + "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
| 959 | + "#----#\n",
| 960 | + "inputs = tokenizer(text = prompt_A, padding=True, return_tensors=\"pt\")\n",
| 961 | + "text_features_A = model.get_text_features(**inputs)\n",
| 962 | + "text_features_A = text_features_A / text_features_A.norm(p=2, dim=-1, keepdim=True)\n",
| 963 | + "name_A = prompt_A\n",
| 964 | + "#----#\n",
| 965 | + "inputs = tokenizer(text = prompt_B, padding=True, return_tensors=\"pt\")\n",
| 966 | + "text_features_B = model.get_text_features(**inputs)\n",
| 967 | + "text_features_B = text_features_B / text_features_B.norm(p=2, dim=-1, keepdim=True)\n",
| 968 | + "name_B = prompt_B\n",
| 969 | + "#----#\n",
| 970 | + "import torch\n",
| 971 | + "sim_AB = torch.nn.functional.cosine_similarity(text_features_A, text_features_B)\n",
| 972 | + "#----#\n",
| 973 | + "print(f'The similarity between the text_encoding for A:\"{prompt_A}\" and B: \"{prompt_B}\" is {round(sim_AB.item()*100,2)} %')"
| 974 | + ],
| 975 | + "metadata": {
| 976 | + "id": "QQOjh5BvnG8M",
| 977 | + "collapsed": true,
| 978 | + "cellView": "form"
| 979 | + },
| 980 | + "execution_count": null,
| 981 | + "outputs": []
| 982 | + },
| 983 | + {
| 984 | + "cell_type": "markdown",
| 985 | + "source": [
| 986 | + "You can write a URL or upload a file locally from your device to use as reference. The image will be saved in the 'sd_tokens' folder. Note that the 'sd_tokens' folder will be deleted upon exiting this runtime."
| 987 | + ],
| 988 | + "metadata": {
| 989 | + "id": "hyK423TQCRup"
| 990 | + }
| 991 | + },
| 992 | + {
| 993 | + "cell_type": "code",
| 994 | + "source": [
| 995 | + "%cd /content/\n",
| 996 | + "!git clone https://huggingface.co/datasets/codeShare/text-to-image-prompts"
| 997 | + ],
| 998 | + "metadata": {
| 999 | + "id": "GPAUFxZgPfrY"
| 1000 | + },
| 1001 | + "execution_count": null,
| 1002 | + "outputs": []
| 1003 | + },
| 1004 | + {
| 1005 | + "cell_type": "code",
| 1006 | + "source": [
| 1007 | + "# @title Make your own text_encodings .pt file for later use (using GPU is recommended to speed things up)\n",
| 1008 | + "\n",
| 1009 | + "import json\n",
| 1010 | + "import pandas as pd\n",
| 1011 | + "import os\n",
| 1012 | + "import shelve\n",
| 1013 | + "import torch\n",
| 1014 | + "from safetensors.torch import save_file\n",
| 1015 | + "\n",
| 1016 | + "def my_mkdirs(folder):\n",
| 1017 | + " if os.path.exists(folder)==False:\n",
| 1018 | + " os.makedirs(folder)\n",
| 1019 | + "\n",
| 1020 | + "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
| 1021 | + "from transformers import AutoTokenizer\n",
| 1022 | + "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
| 1023 | + "from transformers import CLIPProcessor, CLIPModel\n",
| 1024 | + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
| 1025 | + "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\").to(device)\n",
| 1026 | + "\n",
| 1027 | + "%cd /content/\n",
| 1028 | + "\n",
| 1029 | + "my_mkdirs('/content/text_encodings/')\n",
| 1030 | + "filename = ''\n",
| 1031 | + "\n",
| 1032 | + "NUM_FILES = 9\n",
| 1033 | + "\n",
| 1034 | + "\n",
| 1035 | + "filename = '🆔👩_🦰 fusion-t2i-girl-firstname-1'\n",
| 1036 | + "%cd /content/text-to-image-prompts/names/firstnames/text\n",
| 1037 | + "with open(filename + '.json', 'r') as f:\n",
| 1038 | + " data = json.load(f)\n",
| 1039 | + "_df = pd.DataFrame({'count': data})['count']\n",
| 1040 | + "firstname = {\n",
| 1041 | + " key : value for key, value in _df.items()\n",
| 1042 | + "}\n",
| 1043 | + "\n",
| 1044 | + "NUM_FIRSTNAME = 100901\n",
| 1045 | + "\n",
| 1046 | + "for file_index in range(NUM_FILES + 1):\n",
| 1047 | + " if file_index <1: continue\n",
| 1048 | + " #if file_index >4: break\n",
| 1049 | + " filename = f'👱_♀️ fusion-t2i-lastnames-1 plugin-{file_index}'\n",
| 1050 | + " #🦜 fusion-t2i-prompt-features-1.json\n",
| 1051 | + "\n",
| 1052 | + " # Read suffix.json\n",
| 1053 | + " %cd /content/text-to-image-prompts/names/lastnames/text\n",
| 1054 | + " with open(filename + '.json', 'r') as f:\n",
| 1055 | + " data = json.load(f)\n",
| 1056 | + " _df = pd.DataFrame({'count': data})['count']\n",
| 1057 | + " names = {\n",
| 1058 | + " key : firstname[f'{random.randint(2,NUM_FIRSTNAME)}'] + f'{value}' for key, value in _df.items()\n",
| 1059 | + " }\n",
| 1060 | + " NUM_ITEMS = int(prompts[\"0\"])\n",
| 1061 | + " #------#\n",
| 1062 | + "\n",
| 1063 | + " # Calculate text_encoding for .json file contents and results as .db file\n",
| 1064 | + "\n",
| 1065 | + " %cd /content/text_encodings/\n",
| 1066 | + " text_encoding_dict = {}\n",
| 1067 | + " for index in range(NUM_ITEMS + 1):\n",
| 1068 | + " inputs = tokenizer(text = '' + prompts[f'{index}'], padding=True, return_tensors=\"pt\").to(device)\n",
| 1069 | + " text_features = model.get_text_features(**inputs).to(device)\n",
| 1070 | + " text_features = text_features/text_features.norm(p=2, dim=-1, keepdim=True).to(device)\n",
| 1071 | + " text_encoding_dict[f'{index}'] = text_features.to('cpu')\n",
| 1072 | + " save_file(text_encoding_dict, f'{filename}.safetensors')\n",
| 1073 | + " #----#\n",
| 1074 | + "\n",
| 1075 | + "#from safetensors.torch import load_file\n",
| 1076 | + "#%cd /content/text_encodings\n",
| 1077 | + "#loaded = load_file('🦜 fusion-t2i-prompt-features-1.safetensors')\n",
| 1078 | + "#print(loaded[\"325\"])"
| 1079 | + ],
| 1080 | + "metadata": {
| 1081 | + "id": "9ZiTsF9jV0TV"
| 1082 | + },
| 1083 | + "execution_count": null,
| 1084 | + "outputs": []
| 1085 | + },
| 1086 | + {
| 1087 | + "cell_type": "code",
| 1088 | + "source": [
| 1089 | + "# @title Download the created JSON as .zip file\n",
| 1090 | + "%cd /content/\n",
| 1091 | + "!zip -r /content/blue.zip /content/text-to-image-prompts/civitai-prompts/blue/text"
| 1092 | + ],
| 1093 | + "metadata": {
| 1094 | + "id": "gX-sHZPWj4Lt"
| 1095 | + },
| 1096 | + "execution_count": null,
| 1097 | + "outputs": []
| 1098 | + },
| 1099 |   {
| 1100 |   "cell_type": "code",
| 1101 |   "source": [

@@ -1240,330 +1415,34 @@
| 1415 |   {
| 1416 |   "cell_type": "code",
| 1417 |   "source": [
| 1418 | + "# @title Download the created text_encodings as .zip file\n",
| 1419 | + "%cd /content/\n",
| 1420 | + "!zip -r /content/text-encodings.zip /content/text_encodings"
| 1421 |   ],
| 1422 |   "metadata": {
| 1423 | + "id": "b3DUPYfskAIc"
| 1424 |   },
| 1425 |   "execution_count": null,
| 1426 |   "outputs": []
| 1427 |   },
| 1428 |   {
| 1429 | + "cell_type": "markdown",
| 1430 |   "source": [
| 1431 |   "\n",
| 1432 |   "\n",
| 1433 | + "# How does this notebook work?\n",
| 1434 |   "\n",
| 1435 | + "Similar vectors = similar output in the SD 1.5 / SDXL / FLUX models\n",
| 1436 |   "\n",
| 1437 | + "CLIP converts the prompt text to vectors (“tensors”), with float32 values usually ranging from -1 to 1.\n",
| 1438 |   "\n",
| 1439 | + "Dimensions are \[ 1x768 ] tensors for SD 1.5, and a \[ 1x768 , 1x1024 ] tensor for SDXL and FLUX.\n",
| 1440 |   "\n",
| 1441 | + "The SD models and FLUX convert these vectors to an image.\n",
| 1442 |   "\n",
| 1443 | + "This notebook takes an input string, tokenizes it and matches the first token against the 49407 token vectors in the vocab.json: [https://huggingface.co/black-forest-labs/FLUX.1-dev/tree/main/tokenizer](https://huggingface.co/black-forest-labs/FLUX.1-dev/tree/main/tokenizer)\n",
| 1444 |   "\n",
| 1445 | + "It finds the “most similar tokens” in the list. Similarity is the cosine of the angle θ between the token vectors.\n",
| 1446 |   "\n",
| 1447 |   "<div>\n",
| 1448 |   "<img src=\"https://huggingface.co/datasets/codeShare/sd_tokens/resolve/main/cosine.jpeg\" width=\"300\"/>\n",
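
Aside: the measure described in the markdown above can be reproduced in a few lines of PyTorch. This is a minimal sketch, not code from the commit; it assumes the openai/clip-vit-large-patch14 checkpoint, uses the standard transformers attribute path model.text_model.embeddings.token_embedding for the vocab.json embedding table, and the helper name token_similarity is ours. Two L2-normalized token vectors are compared with a dot product, which equals cos(θ).

# Minimal sketch of the token-similarity measure described above
# (illustrative helper, not part of the notebook).
import torch
from transformers import AutoTokenizer, CLIPModel

tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-large-patch14")
model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")

# Token-embedding table: one [768] vector per entry in vocab.json.
token = model.text_model.embeddings.token_embedding.weight.detach()

def token_similarity(word_a: str, word_b: str) -> float:
    # input_ids[1] skips the <|startoftext|> token (id 49406) that CLIP prepends.
    id_a = tokenizer(text=word_a)["input_ids"][1]
    id_b = tokenizer(text=word_b)["input_ids"][1]
    a = token[id_a] / token[id_a].norm(p=2, dim=-1, keepdim=True)
    b = token[id_b] / token[id_b].norm(p=2, dim=-1, keepdim=True)
    return torch.dot(a, b).item()  # dot of unit vectors = cos(theta)

print(f"similarity = {token_similarity('banana', 'fruit') * 100:.2f} %")

A value of 1.0 means θ = 0 (identical direction); values near 0 mean nearly orthogonal, dissimilar tokens.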

@@ -1956,6 +1835,249 @@
| 1835 |   },
| 1836 |   "execution_count": null,
| 1837 |   "outputs": []
| 1838 | + },
| 1839 | + {
| 1840 | + "cell_type": "code",
| 1841 | + "source": [
| 1842 | + "# @title ⚡ Get similiar tokens (not updated yet)\n",
| 1843 | + "import torch\n",
| 1844 | + "from transformers import AutoTokenizer\n",
| 1845 | + "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
| 1846 | + "\n",
| 1847 | + "# @markdown Write name of token to match against\n",
| 1848 | + "token_name = \"banana \" # @param {type:'string',\"placeholder\":\"leave empty for random value token\"}\n",
| 1849 | + "\n",
| 1850 | + "prompt = token_name\n",
| 1851 | + "# @markdown (optional) Mix the token with something else\n",
| 1852 | + "mix_with = \"\" # @param {\"type\":\"string\",\"placeholder\":\"leave empty for random value token\"}\n",
| 1853 | + "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
| 1854 | + "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
| 1855 | + "# @markdown Limit char size of included token\n",
| 1856 | + "\n",
| 1857 | + "min_char_size = 0 # param {type:\"slider\", min:0, max: 50, step:1}\n",
| 1858 | + "char_range = 50 # param {type:\"slider\", min:0, max: 50, step:1}\n",
| 1859 | + "\n",
| 1860 | + "tokenizer_output = tokenizer(text = prompt)\n",
| 1861 | + "input_ids = tokenizer_output['input_ids']\n",
| 1862 | + "id_A = input_ids[1]\n",
| 1863 | + "A = torch.tensor(token[id_A])\n",
| 1864 | + "A = A/A.norm(p=2, dim=-1, keepdim=True)\n",
| 1865 | + "#-----#\n",
| 1866 | + "tokenizer_output = tokenizer(text = mix_with)\n",
| 1867 | + "input_ids = tokenizer_output['input_ids']\n",
| 1868 | + "id_C = input_ids[1]\n",
| 1869 | + "C = torch.tensor(token[id_C])\n",
| 1870 | + "C = C/C.norm(p=2, dim=-1, keepdim=True)\n",
| 1871 | + "#-----#\n",
| 1872 | + "sim_AC = torch.dot(A,C)\n",
| 1873 | + "#-----#\n",
| 1874 | + "print(input_ids)\n",
| 1875 | + "#-----#\n",
| 1876 | + "\n",
| 1877 | + "#if no imput exists we just randomize the entire thing\n",
| 1878 | + "if (prompt == \"\"):\n",
| 1879 | + " id_A = -1\n",
| 1880 | + " print(\"Tokenized prompt tensor A is a random valued tensor with no ID\")\n",
| 1881 | + " R = torch.rand(A.shape)\n",
| 1882 | + " R = R/R.norm(p=2, dim=-1, keepdim=True)\n",
| 1883 | + " A = R\n",
| 1884 | + " name_A = 'random_A'\n",
| 1885 | + "\n",
| 1886 | + "#if no imput exists we just randomize the entire thing\n",
| 1887 | + "if (mix_with == \"\"):\n",
| 1888 | + " id_C = -1\n",
| 1889 | + " print(\"Tokenized prompt 'mix_with' tensor C is a random valued tensor with no ID\")\n",
| 1890 | + " R = torch.rand(A.shape)\n",
| 1891 | + " R = R/R.norm(p=2, dim=-1, keepdim=True)\n",
| 1892 | + " C = R\n",
| 1893 | + " name_C = 'random_C'\n",
| 1894 | + "\n",
| 1895 | + "name_A = \"A of random type\"\n",
| 1896 | + "if (id_A>-1):\n",
| 1897 | + " name_A = vocab(id_A)\n",
| 1898 | + "\n",
| 1899 | + "name_C = \"token C of random type\"\n",
| 1900 | + "if (id_C>-1):\n",
| 1901 | + " name_C = vocab(id_C)\n",
| 1902 | + "\n",
| 1903 | + "print(f\"The similarity between A '{name_A}' and C '{name_C}' is {round(sim_AC.item()*100,2)} %\")\n",
| 1904 | + "\n",
| 1905 | + "if (mix_method == \"None\"):\n",
| 1906 | + " print(\"No operation\")\n",
| 1907 | + "\n",
| 1908 | + "if (mix_method == \"Average\"):\n",
| 1909 | + " A = w*A + (1-w)*C\n",
| 1910 | + " _A = LA.vector_norm(A, ord=2)\n",
| 1911 | + " print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = w*A + (1-w)*C , where C is '{name_C}' token , for w = {w} \")\n",
| 1912 | + "\n",
| 1913 | + "if (mix_method == \"Subtract\"):\n",
| 1914 | + " tmp = w*A - (1-w)*C\n",
| 1915 | + " tmp = tmp/tmp.norm(p=2, dim=-1, keepdim=True)\n",
| 1916 | + " A = tmp\n",
| 1917 | + " #//---//\n",
| 1918 | + " print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = _A*norm(w*A - (1-w)*C) , where C is '{name_C}' token , for w = {w} \")\n",
| 1919 | + "\n",
| 1920 | + "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor\n",
| 1921 | + "\n",
| 1922 | + "dots = torch.zeros(NUM_TOKENS)\n",
| 1923 | + "for index in range(NUM_TOKENS):\n",
| 1924 | + " id_B = index\n",
| 1925 | + " B = torch.tensor(token[id_B])\n",
| 1926 | + " B = B/B.norm(p=2, dim=-1, keepdim=True)\n",
| 1927 | + " sim_AB = torch.dot(A,B)\n",
| 1928 | + " dots[index] = sim_AB\n",
| 1929 | + "\n",
| 1930 | + "\n",
| 1931 | + "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
| 1932 | + "#----#\n",
| 1933 | + "if (mix_method == \"Average\"):\n",
| 1934 | + " print(f'Calculated all cosine-similarities between the average of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
| 1935 | + "if (mix_method == \"Subtract\"):\n",
| 1936 | + " print(f'Calculated all cosine-similarities between the subtract of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
| 1937 | + "if (mix_method == \"None\"):\n",
| 1938 | + " print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
| 1939 | + "\n",
| 1940 | + "#Produce a list id IDs that are most similiar to the prompt ID at positiion 1 based on above result\n",
| 1941 | + "\n",
| 1942 | + "# @markdown Set print options\n",
| 1943 | + "list_size = 100 # @param {type:'number'}\n",
| 1944 | + "print_ID = False # @param {type:\"boolean\"}\n",
| 1945 | + "print_Similarity = True # @param {type:\"boolean\"}\n",
| 1946 | + "print_Name = True # @param {type:\"boolean\"}\n",
| 1947 | + "print_Divider = True # @param {type:\"boolean\"}\n",
| 1948 | + "\n",
| 1949 | + "\n",
| 1950 | + "if (print_Divider):\n",
| 1951 | + " print('//---//')\n",
| 1952 | + "\n",
| 1953 | + "print('')\n",
| 1954 | + "print('Here is the result : ')\n",
| 1955 | + "print('')\n",
| 1956 | + "\n",
| 1957 | + "for index in range(list_size):\n",
| 1958 | + " id = indices[index].item()\n",
| 1959 | + " if (print_Name):\n",
| 1960 | + " print(f'{vocab(id)}') # vocab item\n",
| 1961 | + " if (print_ID):\n",
| 1962 | + " print(f'ID = {id}') # IDs\n",
| 1963 | + " if (print_Similarity):\n",
| 1964 | + " print(f'similiarity = {round(sorted[index].item()*100,2)} %')\n",
| 1965 | + " if (print_Divider):\n",
| 1966 | + " print('--------')\n",
| 1967 | + "\n",
| 1968 | + "#Print the sorted list from above result\n",
| 1969 | + "\n",
| 1970 | + "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
| 1971 | + "\n",
| 1972 | + "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is random value, it will not correspond to any tensor in the vocab.json list , and this it will have no ID.\n",
| 1973 | + "\n",
| 1974 | + "# Save results as .db file\n",
| 1975 | + "import shelve\n",
| 1976 | + "VOCAB_FILENAME = 'tokens_most_similiar_to_' + name_A.replace('</w>','').strip()\n",
| 1977 | + "d = shelve.open(VOCAB_FILENAME)\n",
| 1978 | + "#NUM TOKENS == 49407\n",
| 1979 | + "for index in range(NUM_TOKENS):\n",
| 1980 | + " #print(d[f'{index}']) #<-----Use this to read values from the .db file\n",
| 1981 | + " d[f'{index}']= vocab(indices[index].item()) #<---- write values to .db file\n",
| 1982 | + "#----#\n",
| 1983 | + "d.close() #close the file\n",
| 1984 | + "# See this link for additional stuff to do with shelve: https://docs.python.org/3/library/shelve.html"
| 1985 | + ],
| 1986 | + "metadata": {
| 1987 | + "id": "iWeFnT1gAx6A"
| 1988 | + },
| 1989 | + "execution_count": null,
| 1990 | + "outputs": []
| 1991 | + },
| 1992 | + {
| 1993 | + "cell_type": "code",
| 1994 | + "source": [
| 1995 | + "\n",
| 1996 | + "# @title Create random names from firstname and lastnames\n",
| 1997 | + "import random\n",
| 1998 | + "import json\n",
| 1999 | + "import pandas as pd\n",
| 2000 | + "import os\n",
| 2001 | + "import shelve\n",
| 2002 | + "import torch\n",
| 2003 | + "from safetensors.torch import save_file\n",
| 2004 | + "\n",
| 2005 | + "def my_mkdirs(folder):\n",
| 2006 | + " if os.path.exists(folder)==False:\n",
| 2007 | + " os.makedirs(folder)\n",
| 2008 | + "\n",
| 2009 | + "\n",
| 2010 | + "my_mkdirs('/content/female_full_names/')\n",
| 2011 | + "filename = ''\n",
| 2012 | + "\n",
| 2013 | + "filename = '🆔👩_🦰 fusion-t2i-girl-firstname-1'\n",
| 2014 | + "%cd /content/text-to-image-prompts/names/firstnames/text\n",
| 2015 | + "with open(filename + '.json', 'r') as f:\n",
| 2016 | + " data = json.load(f)\n",
| 2017 | + "_df = pd.DataFrame({'count': data})['count']\n",
| 2018 | + "firstname = {\n",
| 2019 | + " key : value for key, value in _df.items()\n",
| 2020 | + "}\n",
| 2021 | + "\n",
| 2022 | + "NUM_FIRSTNAME = 100901\n",
| 2023 | + "\n",
| 2024 | + "\n",
| 2025 | + "NUM_FILES = 9\n",
| 2026 | + "for file_index in range(NUM_FILES + 1):\n",
| 2027 | + " if file_index <1: continue\n",
| 2028 | + " #if file_index >4: break\n",
| 2029 | + " filename = f'👱_♀️ fusion-t2i-lastnames-{file_index} plugin'\n",
| 2030 | + " #🦜 fusion-t2i-prompt-features-1.json\n",
| 2031 | + "\n",
| 2032 | + " # Read suffix.json\n",
| 2033 | + " %cd /content/text-to-image-prompts/names/lastnames/text\n",
| 2034 | + " with open(filename + '.json', 'r') as f:\n",
| 2035 | + " data = json.load(f)\n",
| 2036 | + " _df = pd.DataFrame({'count': data})['count']\n",
| 2037 | + " names = {\n",
| 2038 | + " key : firstname[f'{random.randint(2,NUM_FIRSTNAME)}'] + ' ' f'{value}' + ' ' for key, value in _df.items()\n",
| 2039 | + " }\n",
| 2040 | + "\n",
| 2041 | + " index = 0\n",
| 2042 | + "\n",
| 2043 | + " for key in names:\n",
| 2044 | + " index = index + 1\n",
| 2045 | + " #-----#\n",
| 2046 | + "\n",
| 2047 | + " names[f'{1}'] = f'👱_♀️female_fullnames-{file_index}'\n",
| 2048 | + " names[f'{0}'] = f'{index}'\n",
| 2049 | + "\n",
| 2050 | + " txt_filename = f'👱_♀️female_fullnames-{file_index}'\n",
| 2051 | + " %cd /content/female_full_names/\n",
| 2052 | + " with open(txt_filename + '.txt', 'w') as f:\n",
| 2053 | + " f.write(str(names))\n",
| 2054 | + "\n",
| 2055 | + " #files.download(f'fullnames-{file_index}.txt')\n",
| 2056 | + "\n",
| 2057 | + "#firstname[f'{random.randint(2,NUM_FIRSTNAME)}'] + f'{value}'\n",
| 2058 | + "\n",
| 2059 | + " #------#\n",
| 2060 | + "\n",
| 2061 | + "\n"
| 2062 | + ],
| 2063 | + "metadata": {
| 2064 | + "id": "JR0wl2ecj6RJ"
| 2065 | + },
| 2066 | + "execution_count": null,
| 2067 | + "outputs": []
| 2068 | + },
| 2069 | + {
| 2070 | + "cell_type": "code",
| 2071 | + "source": [
| 2072 | + "# @title Download the created text_encodings as .zip file\n",
| 2073 | + "%cd /content/\n",
| 2074 | + "!zip -r /content/female_full_names.zip /content/female_full_names/"
| 2075 | + ],
| 2076 | + "metadata": {
| 2077 | + "id": "IBenvYVrofil"
| 2078 | + },
| 2079 | + "execution_count": null,
| 2080 | + "outputs": []
| 2081 |   }
| 2082 |   ]
| 2083 |   }
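
Usage note (not part of the commit): the ⚡ Get similiar tokens cell writes its ranked list to a shelve .db file. A minimal read-back sketch, assuming the cell ran with its default token_name of "banana " so the writer produced 'tokens_most_similiar_to_banana':

# Read back the ranked vocab entries written by the cell above.
import shelve

# Assumed name: 'tokens_most_similiar_to_' + name_A with '</w>' stripped.
VOCAB_FILENAME = 'tokens_most_similiar_to_banana'
with shelve.open(VOCAB_FILENAME) as d:
    # Keys are stringified ranks: d['0'] is the most similar vocab entry.
    for rank in range(10):
        print(d[f'{rank}'])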