Upload sd_token_similarity_calculator.ipynb
sd_token_similarity_calculator.ipynb
CHANGED
@@ -17,7 +17,7 @@
 {
   "cell_type": "markdown",
   "source": [
-    "This Notebook is a Stable-diffusion tool which allows you to find similar tokens from the SD 1.5 vocab.json that you can use for text-to-image generation."
+    "This Notebook is a Stable-diffusion tool which allows you to find similar tokens from the SD 1.5 vocab.json that you can use for text-to-image generation. Try this free online SD 1.5 generator with the results: https://perchance.org/fusion-ai-image-generator"
   ],
   "metadata": {
     "id": "L7JTcbOdBPfh"
@@ -26,6 +26,7 @@
 {
   "cell_type": "code",
   "source": [
+    "# @title Load/initialize values\n",
     "# Load the tokens into the colab\n",
     "!git clone https://huggingface.co/datasets/codeShare/sd_tokens\n",
     "import torch\n",
@@ -70,6 +71,9 @@
     " return result\n",
     "#----#\n",
     "\n",
+    "#print(vocab[8922]) #the vocab item for ID 8922\n",
+    "#print(token[8922].shape) #dimension of the token\n",
+    "\n",
     "mix_with = \"\"\n",
     "mix_method = \"None\""
   ],
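For orientation while reading the hunks above: a minimal sketch of what the load/inspect lines amount to once the sd_tokens dataset is cloned. The file names under sd_tokens/ are assumptions, not confirmed by this diff; the notebook itself defines vocab, token and NUM_TOKENS in the cell being patched.

    # Minimal sketch, not the notebook's exact cell. Paths below are assumptions.
    import json
    import torch

    # after: !git clone https://huggingface.co/datasets/codeShare/sd_tokens
    with open("sd_tokens/vocab.json", "r", encoding="utf-8") as f:      # assumed path
        vocab = {int(i): name for name, i in json.load(f).items()}      # ID -> token name

    token = torch.load("sd_tokens/sd15_tensors.pt")                     # assumed file name; shape [49408, 768]
    NUM_TOKENS = token.shape[0]

    print(vocab[8922])        # the vocab item for ID 8922
    print(token[8922].shape)  # dimension of one token embedding, e.g. torch.Size([768])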
@@ -82,29 +86,7 @@
 {
   "cell_type": "code",
   "source": [
-    "#print(vocab[8922]) #the vocab item for ID 8922\n",
-    "#print(token[8922].shape) #dimension of the token"
-  ],
-  "metadata": {
-    "id": "S_Yh9gH_OUA1"
-  },
-  "execution_count": null,
-  "outputs": []
-},
-{
-  "cell_type": "markdown",
-  "source": [
-    "Get the IDs from a prompt text.\n",
-    "\n",
-    "The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens. Leave the field empty to get a random value tensor"
-  ],
-  "metadata": {
-    "id": "f1-jS7YJApiO"
-  }
-},
-{
-  "cell_type": "code",
-  "source": [
+    "# @title Tokenize prompt into IDs\n",
     "from transformers import AutoTokenizer\n",
     "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
     "\n",
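The "Tokenize prompt into IDs" cell added above uses the CLIP tokenizer shipped with transformers; the recorded output [49406, 8922, 49407] is a single prompt token wrapped by the start/end markers. A runnable sketch (the example prompt string is an assumption):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(
        "openai/clip-vit-large-patch14", clean_up_tokenization_spaces=False)

    prompt = "photo"  # hypothetical example; any single-token prompt gives [49406, <id>, 49407]
    input_ids = tokenizer(text=prompt)["input_ids"]
    print(input_ids)  # first entry is always 49406 (<|startoftext|>), last is 49407 (<|endoftext|>)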
@@ -128,34 +110,46 @@
     "#Save a copy of the tensor A\n",
     "id_P = input_ids[1]\n",
     "P = token[id_A]\n",
-    "_P = LA.vector_norm(A, ord=2)"
+    "_P = LA.vector_norm(A, ord=2)\n",
+    "\n",
+    "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
+    "\n",
+    "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is a random value, it will not correspond to any tensor in the vocab.json list, and thus it will have no ID."
   ],
   "metadata": {
-    "id": "RPdkYzT2_X85"
+    "id": "RPdkYzT2_X85",
+    "colab": {
+      "base_uri": "https://localhost:8080/"
+    },
+    "outputId": "e335f5da-b26d-4eea-f854-fd646444ea14"
   },
-  "execution_count":
-  "outputs": [
-
-
-
-
-
-
-
-
-  }
+  "execution_count": 15,
+  "outputs": [
+    {
+      "output_type": "stream",
+      "name": "stdout",
+      "text": [
+        "[49406, 8922, 49407]\n"
+      ]
+    }
+  ]
 },
 {
   "cell_type": "code",
   "source": [
+    "# @title Take the ID at index 1 from above result and modify it (optional)\n",
     "mix_with = \"\" # @param {type:'string'}\n",
     "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
     "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
     "\n",
-    "
-    "
-    "
-    "
+    "#------#\n",
+    "#If set to TRUE, this will use the output of this cell, tensor A, as the input of this cell the 2nd time we run it. Use this feature to mix many tokens into A\n",
+    "re_iterate_tensor_A = True # @param {\"type\":\"boolean\"}\n",
+    "if (re_iterate_tensor_A == False) :\n",
+    " #prevent re-iterating A by reading from stored copy\n",
+    " id_A = id_P\n",
+    " A = P\n",
+    " _A = _P\n",
     "#----#\n",
     "\n",
     "tokenizer_output = tokenizer(text = mix_with)\n",
@@ -187,7 +181,7 @@
     " _A = LA.vector_norm(A, ord=2)\n",
     " print(\"Tokenized prompt tensor A has been recalculated as A = (w*_A + (1-w)*_C) * norm(w*A - (1-w)*C) , where C is the tokenized prompt 'mix_with' tensor C\")\n",
     "\n",
-    "
+    "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor"
   ],
   "metadata": {
     "id": "oXbNSRSKPgRr"
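The mix step printed in the hunk above recalculates A from the 'mix_with' tensor C using the slider weight w. A standalone sketch of that arithmetic, assuming A and C are 1-D token embeddings; the Average branch and the exact branch layout are assumptions, since the diff only shows the Subtract printout:

    import torch
    from torch import linalg as LA

    def mix(A: torch.Tensor, C: torch.Tensor, w: float, mix_method: str = "None") -> torch.Tensor:
        _A = LA.vector_norm(A, ord=2)   # L2 norm of A
        _C = LA.vector_norm(C, ord=2)   # L2 norm of C
        if mix_method == "Average":
            mixed = w * A + (1 - w) * C   # assumed counterpart of the Subtract branch
        elif mix_method == "Subtract":
            mixed = w * A - (1 - w) * C   # as in the printout: A = (w*_A + (1-w)*_C) * norm(w*A - (1-w)*C)
        else:
            return A                      # "None": keep A unchanged
        # rescale the mixed direction to the weighted magnitude of the two inputs
        return (w * _A + (1 - w) * _C) * mixed / LA.vector_norm(mixed, ord=2)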
@@ -195,19 +189,11 @@
   "execution_count": null,
   "outputs": []
 },
-{
-  "cell_type": "markdown",
-  "source": [
-    "Produce a list of IDs that are most similar to the prompt ID at position 1 based on above result"
-  ],
-  "metadata": {
-    "id": "3uBSZ1vWVCew"
-  }
-},
 {
   "cell_type": "code",
   "source": [
     "\n",
+    "# @title Find Similar Tokens to ID at index 1 from above result\n",
     "dots = torch.zeros(NUM_TOKENS)\n",
     "for index in range(NUM_TOKENS):\n",
     " id_B = index\n",
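The loop added above fills dots with one cosine similarity per vocab entry and then sorts it. A vectorized equivalent, assuming token is the [NUM_TOKENS, 768] embedding matrix and A is the (possibly mixed) prompt tensor:

    import torch

    def rank_all_tokens(A: torch.Tensor, token: torch.Tensor):
        # cosine similarity of A against every row of the embedding matrix in one call
        dots = torch.nn.functional.cosine_similarity(A.unsqueeze(0), token, dim=1)
        sorted_vals, indices = torch.sort(dots, descending=True)
        return sorted_vals, indices  # indices[0] is the ID most similar to A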
@@ -234,7 +220,9 @@
     "if (mix_method == \"Subtract\"):\n",
     " print(f'Calculated all cosine-similarities between the subtract of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
     "if (mix_method == \"None\"):\n",
-    " print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')"
+    " print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
+    "\n",
+    "#Produce a list of IDs that are most similar to the prompt ID at position 1 based on above result"
   ],
   "metadata": {
     "id": "juxsvco9B0iV"
@@ -242,20 +230,11 @@
   "execution_count": null,
   "outputs": []
 },
-{
-  "cell_type": "markdown",
-  "source": [
-    "Print the sorted list from above result"
-  ],
-  "metadata": {
-    "id": "y-Ig3glrVQC3"
-  }
-},
 {
   "cell_type": "code",
   "source": [
+    "# @title Print Result from the 'Similar Tokens' list from above result\n",
     "list_size = 100 # @param {type:'number'}\n",
-    "\n",
     "print_ID = False # @param {type:\"boolean\"}\n",
     "print_Similarity = True # @param {type:\"boolean\"}\n",
     "print_Name = True # @param {type:\"boolean\"}\n",
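The print cell walks the sorted result and shows up to list_size entries, gated by the boolean form fields. A compact sketch of that loop, assuming sorted_vals/indices come from torch.sort(dots, descending=True) and vocab maps an ID to its token name; the helper name is hypothetical:

    def print_top(sorted_vals, indices, vocab, list_size=100,
                  print_ID=False, print_Similarity=True, print_Name=True, print_Divider=True):
        for rank in range(list_size):
            token_id = indices[rank].item()
            if print_Name:
                print(f'name = {vocab[token_id]}')
            if print_ID:
                print(f'ID = {token_id}')
            if print_Similarity:
                print(f'similarity = {round(sorted_vals[rank].item() * 100, 2)} %')
            if print_Divider:
                print('--------')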
@@ -270,7 +249,9 @@
     " if (print_Similarity):\n",
     " print(f'similarity = {round(sorted[index].item()*100,2)} %') # % value\n",
     " if (print_Divider):\n",
-    " print('--------')"
+    " print('--------')\n",
+    "\n",
+    "#Print the sorted list from above result"
   ],
   "metadata": {
     "id": "YIEmLAzbHeuo",
@@ -279,33 +260,19 @@
   "execution_count": null,
   "outputs": []
 },
-{
-  "cell_type": "markdown",
-  "source": [
-    "Find the most similar Tokens for given input"
-  ],
-  "metadata": {
-    "id": "qqZ5DvfLBJnw"
-  }
-},
-{
-  "cell_type": "markdown",
-  "source": [
-    "Valid ID ranges for id_for_token_A / id_for_token_B are between 0 and 49407"
-  ],
-  "metadata": {
-    "id": "kX72bAuhOtlT"
-  }
-},
 {
   "cell_type": "code",
   "source": [
+    "\n",
+    "# @title Get similarity % of two token IDs\n",
     "id_for_token_A = 4567 # @param {type:'number'}\n",
     "id_for_token_B = 4343 # @param {type:'number'}\n",
     "\n",
     "similarity_str = 'The similarity between tokens A and B is ' + similarity(id_for_token_A , id_for_token_B)\n",
     "\n",
-    "print(similarity_str)"
+    "print(similarity_str)\n",
+    "\n",
+    "#Valid ID ranges for id_for_token_A / id_for_token_B are between 0 and 49407"
   ],
   "metadata": {
     "id": "MwmOdC9cNZty"
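The last cell calls a similarity(id_A, id_B) helper that is defined earlier in the notebook but not shown in this diff. A hedged sketch of what such a helper can look like, assuming token is the embedding matrix loaded at the top and that the helper returns a percentage string (valid IDs are 0 to 49407):

    import torch

    def similarity(id_A: int, id_B: int) -> str:
        # 'token' is the notebook's global [49408, 768] embedding matrix (assumed here)
        A = token[id_A]
        B = token[id_B]
        cos = torch.nn.functional.cosine_similarity(A.unsqueeze(0), B.unsqueeze(0), dim=1)
        return f'{round(cos.item() * 100, 2)} %'

    # print('The similarity between tokens A and B is ' + similarity(4567, 4343))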