Automatically add EOS via Tokenizer, integrate Sentence Transformers (#2)

- Automatically add EOS via Tokenizer, integrate Sentence Transformers (4ee1aa535260ccb3eb883af36bac4314e0c3bbf7)
- Add "device_map": "auto" to automatically move the model to CUDA if possible (2f6ecfd654caa7ef022fa415b53ede194322d068)
- Remove eod_id line from README (2653833944d8c3c22a0139b6174051f7ce72879f)

Files changed (5)

1_Pooling/config.json +10 -0
README.md +54 -11
config_sentence_transformers.json +8 -0
modules.json +20 -0
tokenizer.json +2 -2

1_Pooling/config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "word_embedding_dimension": 1024,
+    "pooling_mode_cls_token": false,
+    "pooling_mode_mean_tokens": false,
+    "pooling_mode_max_tokens": false,
+    "pooling_mode_mean_sqrt_len_tokens": false,
+    "pooling_mode_weightedmean_tokens": false,
+    "pooling_mode_lasttoken": true,
+    "include_prompt": true
+}

README.md CHANGED Viewed

@@ -2,7 +2,11 @@
 license: apache-2.0
 base_model:
 - Qwen/Qwen3-0.6B-Base
-library_name: transformers
 ---
 # Qwen3-Embedding-0.6B
@@ -54,6 +58,47 @@ With Transformers versions earlier than 4.51.0, you may encounter the following
 KeyError: 'qwen3'
 ```
 ### Transformers Usage
 ```python
@@ -80,14 +125,6 @@ def last_token_pool(last_hidden_states: Tensor,
 def get_detailed_instruct(task_description: str, query: str) -> str:
     return f'Instruct: {task_description}\nQuery:{query}'
-def tokenize(tokenizer, input_texts, eod_id, max_length):
-    batch_dict = tokenizer(input_texts, padding=False, truncation=True, max_length=max_length-2)
-    for seq, att in zip(batch_dict["input_ids"], batch_dict["attention_mask"]):
-        seq.append(eod_id)
-        att.append(1)
-    batch_dict = tokenizer.pad(batch_dict, padding=True, return_tensors="pt")
-    return batch_dict
 # Each query must come with a one-sentence instruction that describes the task
 task = 'Given a web search query, retrieve relevant passages that answer the query'
@@ -108,11 +145,16 @@ model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B')
 # We recommend enabling flash_attention_2 for better acceleration and memory saving.
 # model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
-eod_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
 max_length = 8192
 # Tokenize the input texts
-batch_dict = tokenize(tokenizer, input_texts, eod_id, max_length)
 batch_dict.to(model.device)
 outputs = model(**batch_dict)
 embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
@@ -121,6 +163,7 @@ embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_ma
 embeddings = F.normalize(embeddings, p=2, dim=1)
 scores = (embeddings[:2] @ embeddings[2:].T)
 print(scores.tolist())
 ```
 📌 **Tip**: We recommend that developers customize the `instruct` according to their specific scenarios, tasks, and languages. Our tests have shown that in most retrieval scenarios, omitting the `instruct` on the query side can reduce retrieval performance by approximately 1% to 5%.

 license: apache-2.0
 base_model:
 - Qwen/Qwen3-0.6B-Base
+tags:
+- transformers
+- sentence-transformers
+- sentence-similarity
+- feature-extraction
 ---
 # Qwen3-Embedding-0.6B
 KeyError: 'qwen3'
 ```
+### Sentence Transformers Usage
+```python
+# Requires transformers>=4.51.0
+from sentence_transformers import SentenceTransformer
+# Load the model
+model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
+# We recommend enabling flash_attention_2 for better acceleration and memory saving,
+# together with setting `padding_side` to "left":
+# model = SentenceTransformer(
+#     "Qwen/Qwen3-Embedding-0.6B",
+#     model_kwargs={"attn_implementation": "flash_attention_2", "device_map": "auto"},
+#     tokenizer_kwargs={"padding_side": "left"},
+# )
+# The queries and documents to embed
+queries = [
+    "What is the capital of China?",
+    "Explain gravity",
+]
+documents = [
+    "The capital of China is Beijing.",
+    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
+]
+# Encode the queries and documents. Note that queries benefit from using a prompt.
+# Here we use the prompt called "query" stored under `model.prompts`, but you can
+# also pass your own prompt via the `prompt` argument.
+query_embeddings = model.encode(queries, prompt_name="query")
+document_embeddings = model.encode(documents)
+# Compute the (cosine) similarity between the query and document embeddings
+similarity = model.similarity(query_embeddings, document_embeddings)
+print(similarity)
+# tensor([[0.7646, 0.1414],
+#         [0.1355, 0.6000]])
+```
 ### Transformers Usage
 ```python
 def get_detailed_instruct(task_description: str, query: str) -> str:
     return f'Instruct: {task_description}\nQuery:{query}'
 # Each query must come with a one-sentence instruction that describes the task
 task = 'Given a web search query, retrieve relevant passages that answer the query'
 # We recommend enabling flash_attention_2 for better acceleration and memory saving.
 # model = AutoModel.from_pretrained('Qwen/Qwen3-Embedding-0.6B', attn_implementation="flash_attention_2", torch_dtype=torch.float16).cuda()
 max_length = 8192
 # Tokenize the input texts
+batch_dict = tokenizer(
+    input_texts,
+    padding=True,
+    truncation=True,
+    max_length=max_length,
+    return_tensors="pt",
+)
 batch_dict.to(model.device)
 outputs = model(**batch_dict)
 embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
 embeddings = F.normalize(embeddings, p=2, dim=1)
 scores = (embeddings[:2] @ embeddings[2:].T)
 print(scores.tolist())
+# [[0.7645568251609802, 0.14142508804798126], [0.13549736142158508, 0.5999549627304077]]
 ```
 📌 **Tip**: We recommend that developers customize the `instruct` according to their specific scenarios, tasks, and languages. Our tests have shown that in most retrieval scenarios, omitting the `instruct` on the query side can reduce retrieval performance by approximately 1% to 5%.

config_sentence_transformers.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "prompts": {
+    "query": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery:",
+    "document": ""
+  },
+  "default_prompt_name": null,
+  "similarity_fn_name": "cosine"
+}

modules.json ADDED Viewed

	@@ -0,0 +1,20 @@

+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]

tokenizer.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
-size 11422654

 version https://git-lfs.github.com/spec/v1
+oid sha256:def76fb086971c7867b829c23a26261e38d9d74e02139253b38aeb9df8b4b50a
+size 11423705