codeShare
/

JupyterNotebooks

Model card Files Files and versions

xet

Community

codeShare committed on Sep 10, 2024

Commit

233ede2

verified ·

1 Parent(s): a48155a

Upload sd_token_similarity_calculator.ipynb

Browse files

Files changed (1) hide show

sd_token_similarity_calculator.ipynb +63 -303

sd_token_similarity_calculator.ipynb CHANGED Viewed

@@ -125,56 +125,53 @@
       "cell_type": "code",
       "source": [
         "# @title ⚡ Get similiar tokens\n",
         "from transformers import AutoTokenizer\n",
         "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
         "\n",
         "# @markdown Write name of token to match against\n",
         "prompt= \"banana\" # @param {type:'string',\"placeholder\":\"leave empty for random value token\"}\n",
-        "\n",
-        "tokenizer_output = tokenizer(text = prompt)\n",
-        "input_ids = tokenizer_output['input_ids']\n",
-        "print(input_ids)\n",
-        "\n",
-        "\n",
-        "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
-        "\n",
-        "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is random value, it will not correspond to any tensor in the vocab.json list , and this it will have no ID.\n",
-        "\n",
-        "id_A = input_ids[1]\n",
-        "A = token[id_A]\n",
-        "_A = LA.vector_norm(A, ord=2)\n",
-        "\n",
-        "#if no imput exists we just randomize the entire thing\n",
-        "if (prompt == \"\"):\n",
-        "  id_A = -1\n",
-        "  print(\"Tokenized prompt tensor A is a random valued tensor with no ID\")\n",
-        "  R = torch.rand(768)\n",
-        "  _R =  LA.vector_norm(R, ord=2)\n",
-        "  A = R*(_A/_R)\n",
-        "  name_A = 'random_A'\n",
-        "\n",
         "# @markdown (optional) Mix the token with something else\n",
         "mix_with = \"\" # @param {\"type\":\"string\",\"placeholder\":\"leave empty for random value token\"}\n",
         "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
         "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
-        "\n",
         "# @markdown Limit char size of included token\n",
         "min_char_size = 3 # @param {type:\"slider\", min:0, max: 50, step:1}\n",
         "char_range = 5 # @param {type:\"slider\", min:0, max: 50, step:1}\n",
         "\n",
         "tokenizer_output = tokenizer(text = mix_with)\n",
         "input_ids = tokenizer_output['input_ids']\n",
         "id_C = input_ids[1]\n",
-        "C = token[id_C]\n",
-        "_C = LA.vector_norm(C, ord=2)\n",
         "\n",
         "#if no imput exists we just randomize the entire thing\n",
         "if (mix_with == \"\"):\n",
         "  id_C = -1\n",
         "  print(\"Tokenized prompt  'mix_with' tensor C is a random valued tensor with no ID\")\n",
-        "  R = torch.rand(768)\n",
-        "  _R =  LA.vector_norm(R, ord=2)\n",
-        "  C = R*(_C/_R)\n",
         "  name_C = 'random_C'\n",
         "\n",
         "name_A = \"A of random type\"\n",
@@ -185,16 +182,7 @@
         "if (id_C>-1):\n",
         "  name_C = vocab[id_C]\n",
         "\n",
-        "# Peaks feature\n",
-        "#peaks_A = get_valleys(A)\n",
-        "#peaks_C = get_valleys(C)\n",
-        "#print(f\"The elementwise top 10 highest values for A is at indices {peaks_A}\")\n",
-        "#print(\"-------\")\n",
-        "#print(f\"The elementwise top 10 highest values for C is at indices {peaks_C}\")\n",
-        "#print(\"-------\")\n",
-        "#//------//\n",
-        "\n",
-        "print(f\"The similarity between A '{name_A}' and C '{name_C}' is {token_similarity(A, C)}\")\n",
         "\n",
         "if (mix_method ==  \"None\"):\n",
         "  print(\"No operation\")\n",
@@ -206,10 +194,9 @@
         "\n",
         "if (mix_method ==  \"Subtract\"):\n",
         "  tmp =  w*A - (1-w)*C\n",
-        "  _tmp =  LA.vector_norm(tmp, ord=2)\n",
-        "  A = (_A/_tmp)*tmp\n",
         "  #//---//\n",
-        "  _A = LA.vector_norm(A, ord=2)\n",
         "  print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = _A*norm(w*A  - (1-w)*C) , where C is '{name_C}' token , for w = {w} \")\n",
         "\n",
         "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor\n",
@@ -217,12 +204,10 @@
         "dots = torch.zeros(NUM_TOKENS)\n",
         "for index in range(NUM_TOKENS):\n",
         "  id_B = index\n",
-        "  B = token[id_B]\n",
-        "  _B = LA.vector_norm(B, ord=2)\n",
-        "  result = torch.dot(A,B)/(_A*_B)\n",
-        "  #result = absolute_value(result.item())\n",
-        "  result = result.item()\n",
-        "  dots[index] = result\n",
         "\n",
         "\n",
         "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
@@ -262,11 +247,14 @@
         "  if (print_Divider):\n",
         "    print('--------')\n",
         "\n",
-        "#Print the sorted list from above result"
       ],
       "metadata": {
-        "id": "iWeFnT1gAx6A",
-        "cellView": "form"
       },
       "execution_count": null,
       "outputs": []
@@ -395,8 +383,6 @@
         "\n",
         "for index in range(RANGE):\n",
         "  id_C = START + index\n",
-        "  C = token[id_C]\n",
-        "  _C = LA.vector_norm(C, ord=2)\n",
         "  name_C = vocab[id_C]\n",
         "  is_Prefix = 0\n",
         "\n",
@@ -591,10 +577,7 @@
         "for index in range(NUM_PERMUTATIONS):\n",
         "  print(names[indices[index].item()])\n",
         "  print(f'similiarity = {round(sorted[index].item(),2)} %')\n",
-        "  print('------')\n",
-        "\n",
-        "\n",
-        "\n"
       ],
       "metadata": {
         "collapsed": true,
@@ -607,36 +590,36 @@
       "cell_type": "code",
       "source": [
         "# @title 💫 Compare Text encodings\n",
-        "\n",
         "prompt_A = \"banana\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
-        "prompt_B = \"\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
-        "use_token_padding = True # @param {type:\"boolean\"}\n",
-        "\n",
         "from transformers import  CLIPProcessor, CLIPModel\n",
-        "\n",
         "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
-        "\n",
         "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
-        "\n",
-        "ids_A = processor.tokenizer(text=prompt_A, padding=use_token_padding, return_tensors=\"pt\")\n",
-        "text_encoding_A = model.get_text_features(**ids_A)\n",
-        "\n",
-        "\n",
-        "ids_B = processor.tokenizer(text=prompt_B, padding=use_token_padding, return_tensors=\"pt\")\n",
-        "text_encoding_B = model.get_text_features(**ids_B)\n",
-        "\n",
-        "similarity_str =  'The similarity between the text_encoding for A:\"' + prompt_A + '\" and B: \"' + prompt_B +'\" is ' +  token_similarity(text_encoding_A[0] , text_encoding_B[0])\n",
-        "\n",
-        "\n",
-        "print(similarity_str)\n",
-        "#outputs = model(**inputs)\n",
-        "#logits_per_image = outputs.logits_per_image # this is the image-text similarity score\n",
-        "#probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities"
       ],
       "metadata": {
         "id": "QQOjh5BvnG8M",
-        "collapsed": true,
-        "cellView": "form"
       },
       "execution_count": null,
       "outputs": []
@@ -650,229 +633,6 @@
         "id": "hyK423TQCRup"
       }
     },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "#  ↓ Sub modules (use these to build your own projects) ↓"
-      ],
-      "metadata": {
-        "id": "_d8WtPgtAymM"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# @title 📝 -> 🆔 Tokenize prompt into IDs\n",
-        "from transformers import AutoTokenizer\n",
-        "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
-        "\n",
-        "prompt= \"banana\" # @param {type:'string'}\n",
-        "\n",
-        "tokenizer_output = tokenizer(text = prompt)\n",
-        "input_ids = tokenizer_output['input_ids']\n",
-        "print(input_ids)\n",
-        "\n",
-        "\n",
-        "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
-        "\n",
-        "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is random value, it will not correspond to any tensor in the vocab.json list , and this it will have no ID."
-      ],
-      "metadata": {
-        "id": "RPdkYzT2_X85",
-        "cellView": "form"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# @title 🆔->🥢 Take the ID at index 1 from above result and get its corresponding tensor value\n",
-        "\n",
-        "id_A = input_ids[1]\n",
-        "A = token[id_A]\n",
-        "_A = LA.vector_norm(A, ord=2)\n",
-        "\n",
-        "#if no imput exists we just randomize the entire thing\n",
-        "if (prompt == \"\"):\n",
-        "  id_A = -1\n",
-        "  print(\"Tokenized prompt tensor A is a random valued tensor with no ID\")\n",
-        "  R = torch.rand(768)\n",
-        "  _R =  LA.vector_norm(R, ord=2)\n",
-        "  A = R*(_A/_R)\n",
-        "\n",
-        "#Save a copy of the tensor A\n",
-        "id_P = id_A\n",
-        "P = A\n",
-        "_P = LA.vector_norm(A, ord=2)\n"
-      ],
-      "metadata": {
-        "id": "YqdiF8DIz9Wu",
-        "cellView": "form"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# @title 🥢 -> 🥢🔀 Take the ID at index 1 from above result and modify it (optional)\n",
-        "mix_with = \"\" # @param {type:'string'}\n",
-        "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
-        "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
-        "\n",
-        "#------#\n",
-        "#If set to TRUE , this will use the output of this cell , tensor A, as the input of this cell the 2nd time we run it. Use this feature to mix many tokens into A\n",
-        "re_iterate_tensor_A = True # @param {\"type\":\"boolean\"}\n",
-        "if (re_iterate_tensor_A == False) :\n",
-        "  #prevent re-iterating A by reading from stored copy\n",
-        "  id_A = id_P\n",
-        "  A = P\n",
-        "  _A = _P\n",
-        "#----#\n",
-        "\n",
-        "tokenizer_output = tokenizer(text = mix_with)\n",
-        "input_ids = tokenizer_output['input_ids']\n",
-        "id_C = input_ids[1]\n",
-        "C = token[id_C]\n",
-        "_C = LA.vector_norm(C, ord=2)\n",
-        "\n",
-        "#if no imput exists we just randomize the entire thing\n",
-        "if (mix_with == \"\"):\n",
-        "  id_C = -1\n",
-        "  print(\"Tokenized prompt  'mix_with' tensor C is a random valued tensor with no ID\")\n",
-        "  R = torch.rand(768)\n",
-        "  _R =  LA.vector_norm(R, ord=2)\n",
-        "  C = R*(_C/_R)\n",
-        "\n",
-        "if (mix_method ==  \"None\"):\n",
-        "  print(\"No operation\")\n",
-        "\n",
-        "if (mix_method ==  \"Average\"):\n",
-        "  A = w*A + (1-w)*C\n",
-        "  _A = LA.vector_norm(A, ord=2)\n",
-        "  print(\"Tokenized prompt tensor A has been recalculated as A = w*A + (1-w)*C , where C is the tokenized prompt  'mix_with' tensor C\")\n",
-        "\n",
-        "if (mix_method ==  \"Subtract\"):\n",
-        "  tmp = (A/_A) - (C/_C)\n",
-        "  _tmp = LA.vector_norm(tmp, ord=2)\n",
-        "  A = tmp*((w*_A + (1-w)*_C)/_tmp)\n",
-        "  _A = LA.vector_norm(A, ord=2)\n",
-        "  print(\"Tokenized prompt tensor A has been recalculated as A = (w*_A + (1-w)*_C) * norm(w*A - (1-w)*C) , where C is the tokenized prompt 'mix_with' tensor C\")\n",
-        "\n",
-        "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor"
-      ],
-      "metadata": {
-        "id": "oXbNSRSKPgRr",
-        "collapsed": true,
-        "cellView": "form"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "\n",
-        "# @title 🥢->🧾🥢 Find Similiar Tokens to ID at index 1 from above result\n",
-        "dots = torch.zeros(NUM_TOKENS)\n",
-        "for index in range(NUM_TOKENS):\n",
-        "  id_B = index\n",
-        "  B = token[id_B]\n",
-        "  _B = LA.vector_norm(B, ord=2)\n",
-        "  result = torch.dot(A,B)/(_A*_B)\n",
-        "  #result = absolute_value(result.item())\n",
-        "  result = result.item()\n",
-        "  dots[index] = result\n",
-        "\n",
-        "name_A = \"A of random type\"\n",
-        "if (id_A>-1):\n",
-        "  name_A = vocab[id_A]\n",
-        "\n",
-        "name_C = \"token C of random type\"\n",
-        "if (id_C>-1):\n",
-        "  name_C = vocab[id_C]\n",
-        "\n",
-        "\n",
-        "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
-        "#----#\n",
-        "if (mix_method ==  \"Average\"):\n",
-        "  print(f'Calculated all cosine-similarities between the average of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
-        "if (mix_method ==  \"Subtract\"):\n",
-        "  print(f'Calculated all cosine-similarities between the subtract of token {name_A} and {name_C} with Id_A = {id_A} and mixed Id_C = {id_C} as a 1x{sorted.shape[0]} tensor')\n",
-        "if (mix_method ==  \"None\"):\n",
-        "  print(f'Calculated all cosine-similarities between the token {name_A} with Id_A = {id_A} with the the rest of the {NUM_TOKENS} tokens as a 1x{sorted.shape[0]} tensor')\n",
-        "\n",
-        "#Produce a list id IDs that are most similiar to the prompt ID at positiion 1 based on above result"
-      ],
-      "metadata": {
-        "id": "juxsvco9B0iV",
-        "collapsed": true,
-        "cellView": "form"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "markdown",
-      "source": [],
-      "metadata": {
-        "id": "cYYu5C5C6MHH"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "# @title 🥢🧾 -> 🖨️ Print Result from the 'Similiar Tokens' list from above result\n",
-        "list_size = 100 # @param {type:'number'}\n",
-        "print_ID = False # @param {type:\"boolean\"}\n",
-        "print_Similarity = True # @param {type:\"boolean\"}\n",
-        "print_Name = True # @param {type:\"boolean\"}\n",
-        "print_Divider = True # @param {type:\"boolean\"}\n",
-        "\n",
-        "for index in range(list_size):\n",
-        "  id = indices[index].item()\n",
-        "  if (print_Name):\n",
-        "    print(f'{vocab[id]}') # vocab item\n",
-        "  if (print_ID):\n",
-        "    print(f'ID = {id}') # IDs\n",
-        "  if (print_Similarity):\n",
-        "    print(f'similiarity = {round(sorted[index].item()*100,2)} %') # % value\n",
-        "  if (print_Divider):\n",
-        "    print('--------')\n",
-        "\n",
-        "#Print the sorted list from above result"
-      ],
-      "metadata": {
-        "id": "YIEmLAzbHeuo",
-        "collapsed": true,
-        "cellView": "form"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "\n",
-        "# @title 🆔 Get similarity % of two token IDs\n",
-        "id_for_token_A = 4567 # @param {type:'number'}\n",
-        "id_for_token_B = 4343 # @param {type:'number'}\n",
-        "\n",
-        "similarity_str =  'The similarity between tokens A and B is ' + similarity(id_for_token_A , id_for_token_B)\n",
-        "\n",
-        "print(similarity_str)\n",
-        "\n",
-        "#Valid ID ranges for id_for_token_A / id_for_token_B are between 0 and 49407"
-      ],
-      "metadata": {
-        "id": "MwmOdC9cNZty",
-        "collapsed": true,
-        "cellView": "form"
-      },
-      "execution_count": null,
-      "outputs": []
-    },
     {
       "cell_type": "markdown",
       "source": [

       "cell_type": "code",
       "source": [
         "# @title ⚡ Get similiar tokens\n",
+        "import torch\n",
         "from transformers import AutoTokenizer\n",
         "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
         "\n",
         "# @markdown Write name of token to match against\n",
         "prompt= \"banana\" # @param {type:'string',\"placeholder\":\"leave empty for random value token\"}\n",
         "# @markdown (optional) Mix the token with something else\n",
         "mix_with = \"\" # @param {\"type\":\"string\",\"placeholder\":\"leave empty for random value token\"}\n",
         "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
         "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
         "# @markdown Limit char size of included token\n",
         "min_char_size = 3 # @param {type:\"slider\", min:0, max: 50, step:1}\n",
         "char_range = 5 # @param {type:\"slider\", min:0, max: 50, step:1}\n",
         "\n",
+        "tokenizer_output = tokenizer(text = prompt)\n",
+        "input_ids = tokenizer_output['input_ids']\n",
+        "id_A = input_ids[1]\n",
+        "A = torch.tensor(token[id_A])\n",
+        "A = A/A.norm(p=2, dim=-1, keepdim=True)\n",
+        "#-----#\n",
         "tokenizer_output = tokenizer(text = mix_with)\n",
         "input_ids = tokenizer_output['input_ids']\n",
         "id_C = input_ids[1]\n",
+        "C = torch.tensor(token[id_C])\n",
+        "C = C/C.norm(p=2, dim=-1, keepdim=True)\n",
+        "#-----#\n",
+        "sim_AC = torch.dot(A,C)\n",
+        "#-----#\n",
+        "print(input_ids)\n",
+        "#-----#\n",
+        "\n",
+        "#if no imput exists we just randomize the entire thing\n",
+        "if (prompt == \"\"):\n",
+        "  id_A = -1\n",
+        "  print(\"Tokenized prompt tensor A is a random valued tensor with no ID\")\n",
+        "  R = torch.rand(A.shape)\n",
+        "  R = R/R.norm(p=2, dim=-1, keepdim=True)\n",
+        "  A = R\n",
+        "  name_A = 'random_A'\n",
         "\n",
         "#if no imput exists we just randomize the entire thing\n",
         "if (mix_with == \"\"):\n",
         "  id_C = -1\n",
         "  print(\"Tokenized prompt  'mix_with' tensor C is a random valued tensor with no ID\")\n",
+        "  R = torch.rand(A.shape)\n",
+        "  R = R/R.norm(p=2, dim=-1, keepdim=True)\n",
+        "  C = R\n",
         "  name_C = 'random_C'\n",
         "\n",
         "name_A = \"A of random type\"\n",
         "if (id_C>-1):\n",
         "  name_C = vocab[id_C]\n",
         "\n",
+        "print(f\"The similarity between A '{name_A}' and C '{name_C}' is {round(sim_AC.item()*100,2)} %\")\n",
         "\n",
         "if (mix_method ==  \"None\"):\n",
         "  print(\"No operation\")\n",
         "\n",
         "if (mix_method ==  \"Subtract\"):\n",
         "  tmp =  w*A - (1-w)*C\n",
+        "  # w*A - (1-w)*C is the zero vector when both terms cancel (e.g. A and C are the same token and w = 0.5).\n",
+        "  # Dividing by a zero norm would fill A with NaN and silently poison every similarity computed afterwards.\n",
+        "  tmp_norm = tmp.norm(p=2, dim=-1, keepdim=True)\n",
+        "  if (tmp_norm.item() > 0):\n",
+        "    tmp = tmp/tmp_norm\n",
+        "    A = tmp\n",
+        "    #//---//\n",
+        "    print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = norm(w*A  - (1-w)*C) , where C is '{name_C}' token , for w = {w} \")\n",
+        "  else:\n",
+        "    print(f\"Tokenized prompt tensor A '{name_A}' was left unchanged : w*A  - (1-w)*C is the zero vector for C '{name_C}' and w = {w} \")\n",
         "\n",
         "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor\n",
         "dots = torch.zeros(NUM_TOKENS)\n",
         "for index in range(NUM_TOKENS):\n",
         "  id_B = index\n",
+        "  B = torch.tensor(token[id_B])\n",
+        "  B = B/B.norm(p=2, dim=-1, keepdim=True)\n",
+        "  sim_AB = torch.dot(A,B)\n",
+        "  dots[index] = sim_AB\n",
         "\n",
         "\n",
         "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
         "  if (print_Divider):\n",
         "    print('--------')\n",
         "\n",
+        "#Print the sorted list from above result\n",
+        "\n",
+        "#The prompt will be enclosed with the <|start-of-text|> and <|end-of-text|> tokens, which is why output will be [49406, ... , 49407].\n",
+        "\n",
+        "#You can leave the 'prompt' field empty to get a random value tensor. Since the tensor is random valued, it will not correspond to any tensor in the vocab.json list , and thus it will have no ID."
       ],
       "metadata": {
+        "id": "iWeFnT1gAx6A"
       },
       "execution_count": null,
       "outputs": []
         "\n",
         "for index in range(RANGE):\n",
         "  id_C = START + index\n",
         "  name_C = vocab[id_C]\n",
         "  is_Prefix = 0\n",
         "\n",
         "for index in range(NUM_PERMUTATIONS):\n",
         "  print(names[indices[index].item()])\n",
         "  print(f'similiarity = {round(sorted[index].item(),2)} %')\n",
+        "  print('------')"
       ],
       "metadata": {
         "collapsed": true,
       "cell_type": "code",
       "source": [
         "# @title 💫 Compare Text encodings\n",
         "prompt_A = \"banana\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
+        "prompt_B = \"bike \" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
+        "use_token_padding = True # param {type:\"boolean\"} <----- Enabled by default\n",
+        "#-----#\n",
+        "from transformers import AutoTokenizer\n",
+        "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\",\n",
+        "clean_up_tokenization_spaces = False)\n",
+        "#-----#\n",
         "from transformers import  CLIPProcessor, CLIPModel\n",
         "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
         "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
+        "#----#\n",
+        "inputs = tokenizer(text = prompt_A, padding=True, return_tensors=\"pt\")\n",
+        "text_features_A = model.get_text_features(**inputs)\n",
+        "text_features_A = text_features_A / text_features_A.norm(p=2, dim=-1, keepdim=True)\n",
+        "name_A = prompt_A\n",
+        "#----#\n",
+        "inputs = tokenizer(text = prompt_B, padding=True, return_tensors=\"pt\")\n",
+        "text_features_B = model.get_text_features(**inputs)\n",
+        "text_features_B = text_features_B / text_features_B.norm(p=2, dim=-1, keepdim=True)\n",
+        "name_B = prompt_B\n",
+        "#----#\n",
+        "import torch\n",
+        "sim_AB = torch.nn.functional.cosine_similarity(text_features_A, text_features_B)\n",
+        "#----#\n",
+        "print(f'The similarity between the text_encoding for A:\"{prompt_A}\" and B: \"{prompt_B}\" is {round(sim_AB.item()*100,2)} %')"
       ],
       "metadata": {
         "id": "QQOjh5BvnG8M",
+        "collapsed": true
       },
       "execution_count": null,
       "outputs": []
         "id": "hyK423TQCRup"
       }
     },
     {
       "cell_type": "markdown",
       "source": [