Automatically add EOS via Tokenizer, integrate Sentence Transformers (#2)
Browse files
- Automatically add EOS via Tokenizer, integrate Sentence Transformers (4ee1aa535260ccb3eb883af36bac4314e0c3bbf7)
- Add "device_map": "auto" to automatically move the model to CUDA if possible (2f6ecfd654caa7ef022fa415b53ede194322d068)
- Remove eod_id line from README (2653833944d8c3c22a0139b6174051f7ce72879f)
- 1_Pooling/config.json +10 -0
- README.md +54 -11
- config_sentence_transformers.json +8 -0
- modules.json +20 -0
- tokenizer.json +2 -2
    	
        1_Pooling/config.json
    ADDED
    
    | @@ -0,0 +1,10 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
                "word_embedding_dimension": 1024,
         | 
| 3 | 
            +
                "pooling_mode_cls_token": false,
         | 
| 4 | 
            +
                "pooling_mode_mean_tokens": false,
         | 
| 5 | 
            +
                "pooling_mode_max_tokens": false,
         | 
| 6 | 
            +
                "pooling_mode_mean_sqrt_len_tokens": false,
         | 
| 7 | 
            +
                "pooling_mode_weightedmean_tokens": false,
         | 
| 8 | 
            +
                "pooling_mode_lasttoken": true,
         | 
| 9 | 
            +
                "include_prompt": true
         | 
| 10 | 
            +
            }
         | 
    	
        README.md
    CHANGED
    
    | @@ -2,7 +2,11 @@ | |
| 2 | 
             
            license: apache-2.0
         | 
| 3 | 
             
            base_model:
         | 
| 4 | 
             
            - Qwen/Qwen3-0.6B-Base
         | 
| 5 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
| 6 | 
             
            ---
         | 
| 7 | 
             
            # Qwen3-Embedding-0.6B
         | 
| 8 |  | 
| @@ -54,6 +58,47 @@ With Transformers versions earlier than 4.51.0, you may encounter the following | |
| 54 | 
             
            KeyError: 'qwen3'
         | 
| 55 | 
             
            ```
         | 
| 56 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 57 | 
             
            ### Transformers Usage
         | 
| 58 |  | 
| 59 | 
             
            ```python
         | 
| @@ -80,14 +125,6 @@ def last_token_pool(last_hidden_states: Tensor, | |
| 80 | 
             
            def get_detailed_instruct(task_description: str, query: str) -> str:
         | 
| 81 | 
             
                return f'Instruct: {task_description}\nQuery:{query}'
         | 
| 82 |  | 
| 83 | 
            -
            def tokenize(tokenizer, input_texts, eod_id, max_length):
         | 
| 84 | 
            -
                batch_dict = tokenizer(input_texts, padding=False, truncation=True, max_length=max_length-2)
         | 
| 85 | 
            -
                for seq, att in zip(batch_dict["input_ids"], batch_dict["attention_mask"]):
         | 
| 86 | 
            -
                    seq.append(eod_id)
         | 
| 87 | 
            -
                    att.append(1)
         | 
| 88 | 
            -
                batch_dict = tokenizer.pad(batch_dict, padding=True, return_tensors="pt")
         | 
| 89 | 
            -
                return batch_dict
         | 
| 90 | 
            -
             | 
| 91 | 
             
            # Each query must come with a one-sentence instruction that describes the task
         | 
| 92 | 
             
            task = 'Given a web search query, retrieve relevant passages that answer the query'
         | 
| 93 |  | 
| @@ -108,11 +145,16 @@ model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B') | |
| 108 | 
             
            # We recommend enabling flash_attention_2 for better acceleration and memory saving.
         | 
| 109 | 
             
            # model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
         | 
| 110 |  | 
| 111 | 
            -
            eod_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
         | 
| 112 | 
             
            max_length = 8192
         | 
| 113 |  | 
| 114 | 
             
            # Tokenize the input texts
         | 
| 115 | 
            -
            batch_dict = tokenize(tokenizer, input_texts, eod_id, max_length)
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 116 | 
             
            batch_dict.to(model.device)
         | 
| 117 | 
             
            outputs = model(**batch_dict)
         | 
| 118 | 
             
            embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
         | 
| @@ -121,6 +163,7 @@ embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_ma | |
| 121 | 
             
            embeddings = F.normalize(embeddings, p=2, dim=1)
         | 
| 122 | 
             
            scores = (embeddings[:2] @ embeddings[2:].T)
         | 
| 123 | 
             
            print(scores.tolist())
         | 
|  | |
| 124 | 
             
            ```
         | 
| 125 | 
             
            📌 **Tip**: We recommend that developers customize the `instruct` according to their specific scenarios, tasks, and languages. Our tests have shown that in most retrieval scenarios, not using an `instruct` on the query side can lead to a drop in retrieval performance by approximately 1% to 5%.
         | 
| 126 |  | 
|  | |
| 2 | 
             
            license: apache-2.0
         | 
| 3 | 
             
            base_model:
         | 
| 4 | 
             
            - Qwen/Qwen3-0.6B-Base
         | 
| 5 | 
            +
            tags:
         | 
| 6 | 
            +
            - transformers
         | 
| 7 | 
            +
            - sentence-transformers
         | 
| 8 | 
            +
            - sentence-similarity
         | 
| 9 | 
            +
            - feature-extraction
         | 
| 10 | 
             
            ---
         | 
| 11 | 
             
            # Qwen3-Embedding-0.6B
         | 
| 12 |  | 
|  | |
| 58 | 
             
            KeyError: 'qwen3'
         | 
| 59 | 
             
            ```
         | 
| 60 |  | 
| 61 | 
            +
            ### Sentence Transformers Usage
         | 
| 62 | 
            +
             | 
| 63 | 
            +
            ```python
         | 
| 64 | 
            +
            # Requires transformers>=4.51.0
         | 
| 65 | 
            +
             | 
| 66 | 
            +
            from sentence_transformers import SentenceTransformer
         | 
| 67 | 
            +
             | 
| 68 | 
            +
            # Load the model
         | 
| 69 | 
            +
            model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
         | 
| 70 | 
            +
             | 
| 71 | 
            +
            # We recommend enabling flash_attention_2 for better acceleration and memory saving,
         | 
| 72 | 
            +
            # together with setting `padding_side` to "left":
         | 
| 73 | 
            +
            # model = SentenceTransformer(
         | 
| 74 | 
            +
            #     "Qwen/Qwen3-Embedding-0.6B",
         | 
| 75 | 
            +
            #     model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
         | 
| 76 | 
            +
            #     tokenizer_kwargs={"padding_side": "left"},
         | 
| 77 | 
            +
            # )
         | 
| 78 | 
            +
             | 
| 79 | 
            +
            # The queries and documents to embed
         | 
| 80 | 
            +
            queries = [
         | 
| 81 | 
            +
                "What is the capital of China?",
         | 
| 82 | 
            +
                "Explain gravity",
         | 
| 83 | 
            +
            ]
         | 
| 84 | 
            +
            documents = [
         | 
| 85 | 
            +
                "The capital of China is Beijing.",
         | 
| 86 | 
            +
                "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
         | 
| 87 | 
            +
            ]
         | 
| 88 | 
            +
             | 
| 89 | 
            +
            # Encode the queries and documents. Note that queries benefit from using a prompt.
         | 
| 90 | 
            +
            # Here we use the prompt called "query" stored under `model.prompts`, but you can
         | 
| 91 | 
            +
            # also pass your own prompt via the `prompt` argument
         | 
| 92 | 
            +
            query_embeddings = model.encode(queries, prompt_name="query")
         | 
| 93 | 
            +
            document_embeddings = model.encode(documents)
         | 
| 94 | 
            +
             | 
| 95 | 
            +
            # Compute the (cosine) similarity between the query and document embeddings
         | 
| 96 | 
            +
            similarity = model.similarity(query_embeddings, document_embeddings)
         | 
| 97 | 
            +
            print(similarity)
         | 
| 98 | 
            +
            # tensor([[0.7646, 0.1414],
         | 
| 99 | 
            +
            #         [0.1355, 0.6000]])
         | 
| 100 | 
            +
            ```
         | 
| 101 | 
            +
             | 
| 102 | 
             
            ### Transformers Usage
         | 
| 103 |  | 
| 104 | 
             
            ```python
         | 
|  | |
| 125 | 
             
            def get_detailed_instruct(task_description: str, query: str) -> str:
         | 
| 126 | 
             
                return f'Instruct: {task_description}\nQuery:{query}'
         | 
| 127 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 128 | 
             
            # Each query must come with a one-sentence instruction that describes the task
         | 
| 129 | 
             
            task = 'Given a web search query, retrieve relevant passages that answer the query'
         | 
| 130 |  | 
|  | |
| 145 | 
             
            # We recommend enabling flash_attention_2 for better acceleration and memory saving.
         | 
| 146 | 
             
            # model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
         | 
| 147 |  | 
|  | |
| 148 | 
             
            max_length = 8192
         | 
| 149 |  | 
| 150 | 
             
            # Tokenize the input texts
         | 
| 151 | 
            +
            batch_dict = tokenizer(
         | 
| 152 | 
            +
                input_texts,
         | 
| 153 | 
            +
                padding=True,
         | 
| 154 | 
            +
                truncation=True,
         | 
| 155 | 
            +
                max_length=max_length,
         | 
| 156 | 
            +
                return_tensors="pt",
         | 
| 157 | 
            +
            )
         | 
| 158 | 
             
            batch_dict.to(model.device)
         | 
| 159 | 
             
            outputs = model(**batch_dict)
         | 
| 160 | 
             
            embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
         | 
|  | |
| 163 | 
             
            embeddings = F.normalize(embeddings, p=2, dim=1)
         | 
| 164 | 
             
            scores = (embeddings[:2] @ embeddings[2:].T)
         | 
| 165 | 
             
            print(scores.tolist())
         | 
| 166 | 
            +
            # [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]
         | 
| 167 | 
             
            ```
         | 
| 168 | 
             
            📌 **Tip**: We recommend that developers customize the `instruct` according to their specific scenarios, tasks, and languages. Our tests have shown that in most retrieval scenarios, not using an `instruct` on the query side can lead to a drop in retrieval performance by approximately 1% to 5%.
         | 
| 169 |  | 
    	
        config_sentence_transformers.json
    ADDED
    
    | @@ -0,0 +1,8 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            {
         | 
| 2 | 
            +
              "prompts": {
         | 
| 3 | 
            +
                "query": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:",
         | 
| 4 | 
            +
                "document": ""
         | 
| 5 | 
            +
              },
         | 
| 6 | 
            +
              "default_prompt_name": null,
         | 
| 7 | 
            +
              "similarity_fn_name": "cosine"
         | 
| 8 | 
            +
            }
         | 
    	
        modules.json
    ADDED
    
    | @@ -0,0 +1,20 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            [
         | 
| 2 | 
            +
              {
         | 
| 3 | 
            +
                "idx": 0,
         | 
| 4 | 
            +
                "name": "0",
         | 
| 5 | 
            +
                "path": "",
         | 
| 6 | 
            +
                "type": "sentence_transformers.models.Transformer"
         | 
| 7 | 
            +
              },
         | 
| 8 | 
            +
              {
         | 
| 9 | 
            +
                "idx": 1,
         | 
| 10 | 
            +
                "name": "1",
         | 
| 11 | 
            +
                "path": "1_Pooling",
         | 
| 12 | 
            +
                "type": "sentence_transformers.models.Pooling"
         | 
| 13 | 
            +
              },
         | 
| 14 | 
            +
              {
         | 
| 15 | 
            +
                "idx": 2,
         | 
| 16 | 
            +
                "name": "2",
         | 
| 17 | 
            +
                "path": "2_Normalize",
         | 
| 18 | 
            +
                "type": "sentence_transformers.models.Normalize"
         | 
| 19 | 
            +
              }
         | 
| 20 | 
            +
            ]
         | 
    	
        tokenizer.json
    CHANGED
    
    | @@ -1,3 +1,3 @@ | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            -
            oid sha256: | 
| 3 | 
            -
            size  | 
|  | |
| 1 | 
             
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:def76fb086971c7867b829c23a26261e38d9d74e02139253b38aeb9df8b4b50a
         | 
| 3 | 
            +
            size 11423705
         | 

