codeShare
/

JupyterNotebooks

Model card Files Files and versions

xet

Community

codeShare committed on Sep 9, 2024

Commit

540a0c2

verified ·

1 Parent(s): a78db43

Upload sd_token_similarity_calculator.ipynb

Browse files

Files changed (1) hide show

sd_token_similarity_calculator.ipynb +179 -12

sd_token_similarity_calculator.ipynb CHANGED Viewed

@@ -116,10 +116,23 @@
       "metadata": {
         "id": "Ch9puvwKH1s3",
         "collapsed": true,
-        "cellView": "form"
       },
-      "execution_count": null,
-      "outputs": []
     },
     {
       "cell_type": "code",
@@ -128,7 +141,8 @@
         "from transformers import AutoTokenizer\n",
         "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
         "\n",
-        "prompt= \"banana\" # @param {type:'string'}\n",
         "\n",
         "tokenizer_output = tokenizer(text = prompt)\n",
         "input_ids = tokenizer_output['input_ids']\n",
@@ -152,11 +166,15 @@
         "  A = R*(_A/_R)\n",
         "  name_A = 'random_A'\n",
         "\n",
-        "\n",
-        "mix_with = \"\" # @param {\"type\":\"string\",\"placeholder\":\"(optional) write something else\"}\n",
         "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
         "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
         "\n",
         "tokenizer_output = tokenizer(text = mix_with)\n",
         "input_ids = tokenizer_output['input_ids']\n",
         "id_C = input_ids[1]\n",
@@ -205,7 +223,7 @@
         "  A = (_A/_tmp)*tmp\n",
         "  #//---//\n",
         "  _A = LA.vector_norm(A, ord=2)\n",
-        "  print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = _A * norm(w*A  - (1-w)*C) , where C is '{name_C}' token , for w = {w} \")\n",
         "\n",
         "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor\n",
         "\n",
@@ -231,6 +249,7 @@
         "\n",
         "#Produce a list id IDs that are most similiar to the prompt ID at positiion 1 based on above result\n",
         "\n",
         "list_size = 100 # @param {type:'number'}\n",
         "print_ID = False # @param {type:\"boolean\"}\n",
         "print_Similarity = True # @param {type:\"boolean\"}\n",
@@ -259,8 +278,7 @@
         "#Print the sorted list from above result"
       ],
       "metadata": {
-        "id": "iWeFnT1gAx6A",
-        "cellView": "form"
       },
       "execution_count": null,
       "outputs": []
@@ -270,7 +288,7 @@
       "source": [
         "# @title 💫 Compare Text encodings\n",
         "\n",
-        "prompt_A = \"\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
         "prompt_B = \"\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
         "use_token_padding = True # @param {type:\"boolean\"}\n",
         "\n",
@@ -283,6 +301,7 @@
         "ids_A = processor.tokenizer(text=prompt_A, padding=use_token_padding, return_tensors=\"pt\")\n",
         "text_encoding_A = model.get_text_features(**ids_A)\n",
         "\n",
         "ids_B = processor.tokenizer(text=prompt_B, padding=use_token_padding, return_tensors=\"pt\")\n",
         "text_encoding_B = model.get_text_features(**ids_B)\n",
         "\n",
@@ -296,8 +315,156 @@
       ],
       "metadata": {
         "id": "QQOjh5BvnG8M",
-        "collapsed": true,
-        "cellView": "form"
       },
       "execution_count": null,
       "outputs": []

       "metadata": {
         "id": "Ch9puvwKH1s3",
         "collapsed": true,
+        "cellView": "form",
+        "outputId": "9a9d4274-a633-464b-e1fb-06a33f3dd873",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        }
       },
+      "execution_count": 59,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "fatal: destination path 'sd_tokens' already exists and is not an empty directory.\n",
+            "/content/sd_tokens\n"
+          ]
+        }
+      ]
     },
     {
       "cell_type": "code",
         "from transformers import AutoTokenizer\n",
         "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
         "\n",
+        "# @markdown Write name of token to match against\n",
+        "prompt= \"banana\" # @param {type:'string',\"placeholder\":\"leave empty for random value token\"}\n",
         "\n",
         "tokenizer_output = tokenizer(text = prompt)\n",
         "input_ids = tokenizer_output['input_ids']\n",
         "  A = R*(_A/_R)\n",
         "  name_A = 'random_A'\n",
         "\n",
+        "# @markdown (optional) Mix the token with something else\n",
+        "mix_with = \"\" # @param {\"type\":\"string\",\"placeholder\":\"leave empty for random value token\"}\n",
         "mix_method = \"None\" # @param [\"None\" , \"Average\", \"Subtract\"] {allow-input: true}\n",
         "w = 0.5 # @param {type:\"slider\", min:0, max:1, step:0.01}\n",
         "\n",
+        "# @markdown Limit char size of included token\n",
+        "min_char_size = 3 # @param {type:\"slider\", min:0, max: 50, step:1}\n",
+        "char_range = 5 # @param {type:\"slider\", min:0, max: 50, step:1}\n",
+        "\n",
         "tokenizer_output = tokenizer(text = mix_with)\n",
         "input_ids = tokenizer_output['input_ids']\n",
         "id_C = input_ids[1]\n",
         "  A = (_A/_tmp)*tmp\n",
         "  #//---//\n",
         "  _A = LA.vector_norm(A, ord=2)\n",
+        "  print(f\"Tokenized prompt tensor A '{name_A}' token has been recalculated as A = _A*norm(w*A  - (1-w)*C) , where C is '{name_C}' token , for w = {w} \")\n",
         "\n",
         "#OPTIONAL : Add/subtract + normalize above result with another token. Leave field empty to get a random value tensor\n",
         "\n",
         "\n",
         "#Produce a list id IDs that are most similiar to the prompt ID at positiion 1 based on above result\n",
         "\n",
+        "# @markdown Set print options\n",
         "list_size = 100 # @param {type:'number'}\n",
         "print_ID = False # @param {type:\"boolean\"}\n",
         "print_Similarity = True # @param {type:\"boolean\"}\n",
         "#Print the sorted list from above result"
       ],
       "metadata": {
+        "id": "iWeFnT1gAx6A"
       },
       "execution_count": null,
       "outputs": []
       "source": [
         "# @title 💫 Compare Text encodings\n",
         "\n",
+        "prompt_A = \"banana\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
         "prompt_B = \"\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
         "use_token_padding = True # @param {type:\"boolean\"}\n",
         "\n",
         "ids_A = processor.tokenizer(text=prompt_A, padding=use_token_padding, return_tensors=\"pt\")\n",
         "text_encoding_A = model.get_text_features(**ids_A)\n",
         "\n",
+        "\n",
         "ids_B = processor.tokenizer(text=prompt_B, padding=use_token_padding, return_tensors=\"pt\")\n",
         "text_encoding_B = model.get_text_features(**ids_B)\n",
         "\n",
       ],
       "metadata": {
         "id": "QQOjh5BvnG8M",
+        "collapsed": true
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# @title 🪐 Find similar prompt\n",
+        "# @markdown Prompt A to match against\n",
+        "prompt_A = \"photo of a banana\" # @param {\"type\":\"string\",\"placeholder\":\"Write a prompt\"}\n",
+        "# @markdown Set conditions for the output\n",
+        "must_start_with = \"bendy \" # @param {\"type\":\"string\",\"placeholder\":\"write a text\"}\n",
+        "must_contain = \"yellow\" # @param {\"type\":\"string\",\"placeholder\":\"write a text\"}\n",
+        "must_end_with = \" on a table\" # @param {\"type\":\"string\",\"placeholder\":\"write a text\"}\n",
+        "\n",
+        "token_B = must_contain\n",
+        "\n",
+        "# @markdown Limit the search\n",
+        "use_token_padding = True # @param {type:\"boolean\"}\n",
+        "start_search_at_ID = 12500 # @param {type:\"slider\", min:0, max: 49407, step:100}\n",
+        "search_range = 500 # @param {type:\"slider\", min:0, max: 2000, step:100}\n",
+        "restrictions = 'Suffix only' # @param [\"None\", \"Suffix only\", \"Prefix only\"]\n",
+        "\n",
+        "# @markdown Limit char size of included token\n",
+        "min_char_size = 3 # @param {type:\"slider\", min:0, max: 50, step:1}\n",
+        "char_range = 5 # @param {type:\"slider\", min:0, max: 50, step:1}\n",
+        "\n",
+        "\n",
+        "#Tokenize input B\n",
+        "from transformers import AutoTokenizer\n",
+        "tokenizer = AutoTokenizer.from_pretrained(\"openai/clip-vit-large-patch14\", clean_up_tokenization_spaces = False)\n",
+        "tokenizer_output = tokenizer(text = token_B)\n",
+        "input_ids = tokenizer_output['input_ids']\n",
+        "#-----#\n",
+        "name_B = must_contain\n",
+        "#-----#\n",
+        "\n",
+        "from transformers import  CLIPProcessor, CLIPModel\n",
+        "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-large-patch14\" , clean_up_tokenization_spaces = True)\n",
+        "model = CLIPModel.from_pretrained(\"openai/clip-vit-large-patch14\")\n",
+        "#-------#\n",
+        "ids_A = processor.tokenizer(text=prompt_A, padding=use_token_padding, return_tensors=\"pt\")\n",
+        "text_encoding_A = model.get_text_features(**ids_A)\n",
+        "A =  text_encoding_A[0]\n",
+        "_A = LA.vector_norm(A, ord=2)\n",
+        "name_A = prompt_A\n",
+        "print(f'a text_encoding was created for the prompt \"{prompt_A}\" ')\n",
+        "print('')\n",
+        "#----#\n",
+        "\n",
+        "# Search the vocab IDs START .. START + RANGE - 1 for the token C that, combined\n",
+        "# with name_B between must_start_with and must_end_with, gives the text encoding\n",
+        "# closest (cosine similarity) to the encoding A of prompt_A.\n",
+        "START = start_search_at_ID\n",
+        "RANGE =  min(search_range , 49407 - start_search_at_ID)\n",
+        "\n",
+        "# Best similarity per candidate offset. Entries start at -inf so that tokens\n",
+        "# rejected by the filters below sort last and are never printed as results.\n",
+        "dots = torch.full((RANGE,), float('-inf'))\n",
+        "# is_BC[i] is 1 when the order name_B + name_C scored higher than name_C + name_B\n",
+        "is_BC = torch.zeros(RANGE)\n",
+        "\n",
+        "# Inference only: no autograd graph is needed for the repeated encoder calls\n",
+        "with torch.no_grad():\n",
+        "  for index in range(RANGE):\n",
+        "    id_C = START + index\n",
+        "    name_C = vocab[id_C]\n",
+        "\n",
+        "    # Decide if we should process prefix/suffix tokens\n",
+        "    # NOTE(review): with restrictions == \"None\" prefix tokens are still skipped,\n",
+        "    # so \"None\" currently behaves like \"Suffix only\" - confirm this is intended\n",
+        "    if name_C.find('</w>')<=-1:\n",
+        "      if restrictions != \"Prefix only\":\n",
+        "        continue\n",
+        "    else:\n",
+        "      if restrictions == \"Prefix only\":\n",
+        "        continue\n",
+        "    #-----#\n",
+        "\n",
+        "    # Decide if char-size is within range\n",
+        "    # NOTE(review): len(name_C) includes the '</w>' marker of suffix tokens - confirm\n",
+        "    if len(name_C) < min_char_size:\n",
+        "      continue\n",
+        "    if len(name_C) > min_char_size + char_range:\n",
+        "      continue\n",
+        "    #-----#\n",
+        "\n",
+        "    # Candidate with C placed before B\n",
+        "    # NOTE(review): the raw vocab string (including any '</w>') is pasted into the prompt - confirm\n",
+        "    name_CB = must_start_with + name_C + name_B + must_end_with\n",
+        "    if restrictions == \"Prefix only\":\n",
+        "      name_CB = must_start_with +  name_C + '-' + name_B + must_end_with\n",
+        "    #-----#\n",
+        "    ids_CB = processor.tokenizer(text=name_CB, padding=use_token_padding, return_tensors=\"pt\")\n",
+        "    text_encoding_CB = model.get_text_features(**ids_CB)\n",
+        "    CB = text_encoding_CB[0]\n",
+        "    _CB =  LA.vector_norm(CB, ord=2)\n",
+        "    sim_CB = torch.dot(A,CB)/(_A*_CB)\n",
+        "    #-----#\n",
+        "    # Prefix tokens are only tried in front of B, so C + '-' + B is the only candidate\n",
+        "    if restrictions == \"Prefix only\":\n",
+        "      dots[index] = sim_CB.item()\n",
+        "      continue\n",
+        "    #-----#\n",
+        "    # Candidate with B placed before C\n",
+        "    name_BC = must_start_with + name_B + name_C + must_end_with\n",
+        "    ids_BC = processor.tokenizer(text=name_BC, padding=use_token_padding, return_tensors=\"pt\")\n",
+        "    text_encoding_BC = model.get_text_features(**ids_BC)\n",
+        "    BC = text_encoding_BC[0]\n",
+        "    _BC =  LA.vector_norm(BC, ord=2)\n",
+        "    sim_BC = torch.dot(A,BC)/(_A*_BC)\n",
+        "    #-----#\n",
+        "\n",
+        "    # Keep the better of the two orderings and remember which one it was\n",
+        "    result = sim_CB\n",
+        "    if(sim_BC > sim_CB):\n",
+        "      is_BC[index] = 1\n",
+        "      result = sim_BC\n",
+        "\n",
+        "    dots[index] = result.item()\n",
+        "#----#\n",
+        "\n",
+        "\n",
+        "\n",
+        "# indices[k] is the candidate offset (relative to START) of the k-th best score\n",
+        "sorted, indices = torch.sort(dots,dim=0 , descending=True)\n",
+        "\n",
+        "# @markdown Print options\n",
+        "list_size = 100 # @param {type:'number'}\n",
+        "print_ID = False # @param {type:\"boolean\"}\n",
+        "print_Similarity = True # @param {type:\"boolean\"}\n",
+        "print_Name = True # @param {type:\"boolean\"}\n",
+        "print_Divider = True # @param {type:\"boolean\"}\n",
+        "\n",
+        "\n",
+        "if (print_Divider):\n",
+        "  print('//---//')\n",
+        "\n",
+        "print('')\n",
+        "print(f'These token pairings within the range ID = {START} to ID = {START + RANGE} most closely match the text_encoding for the prompt \"{prompt_A}\" : ')\n",
+        "print('')\n",
+        "\n",
+        "for index in range(min(list_size,RANGE)):\n",
+        "  # Everything from the first -inf onwards was filtered out and never scored\n",
+        "  if sorted[index].item() == float('-inf'):\n",
+        "    break\n",
+        "  # is_BC is stored per candidate offset, so look it up by offset and not by rank\n",
+        "  offset = indices[index].item()\n",
+        "  id = START + offset\n",
+        "  if (print_Name):\n",
+        "    if(is_BC[offset]>0):\n",
+        "      print(must_start_with +  name_B + vocab[id] + must_end_with)\n",
+        "    else:\n",
+        "      if restrictions == \"Prefix only\":\n",
+        "        print(must_start_with +   vocab[id] + '-'  + name_B + must_end_with)\n",
+        "      else:\n",
+        "        print(must_start_with +   vocab[id] + name_B + must_end_with)\n",
+        "  if (print_ID):\n",
+        "    print(f'ID = {id}') # IDs\n",
+        "  if (print_Similarity):\n",
+        "    print(f'similiarity = {round(sorted[index].item()*100,2)} %')\n",
+        "  if (print_Divider):\n",
+        "    print('--------')"
+      ],
+      "metadata": {
+        "id": "uDtcm-l8UCJk"
       },
       "execution_count": null,
       "outputs": []