Update README.md
Browse files
README.md
CHANGED
|
@@ -18,6 +18,7 @@ A model for mapping abstract sentence descriptions to sentences that fit the des
|
|
| 18 |
from transformers import AutoTokenizer, AutoModel
|
| 19 |
import torch
|
| 20 |
from typing import List
|
|
|
|
| 21 |
|
| 22 |
def load_finetuned_model():
|
| 23 |
|
|
@@ -36,4 +37,56 @@ def encode_batch(model, tokenizer, sentences: List[str], device: str):
|
|
| 36 |
features = torch.sum(features[:,1:,:] * input_ids["attention_mask"][:,1:].unsqueeze(-1), dim=1) / torch.clamp(torch.sum(input_ids["attention_mask"][:,1:], dim=1, keepdims=True), min=1e-9)
|
| 37 |
return features
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
```
|
|
|
|
| 18 |
from transformers import AutoTokenizer, AutoModel
|
| 19 |
import torch
|
| 20 |
from typing import List
|
| 21 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 22 |
|
| 23 |
def load_finetuned_model():
|
| 24 |
|
|
|
|
| 37 |
features = torch.sum(features[:,1:,:] * input_ids["attention_mask"][:,1:].unsqueeze(-1), dim=1) / torch.clamp(torch.sum(input_ids["attention_mask"][:,1:], dim=1, keepdims=True), min=1e-9)
|
| 38 |
return features
|
| 39 |
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
Usage example (ranks candidate sentences by their cosine similarity to a query description):
|
| 43 |
+
|
| 44 |
+
```python
|
| 45 |
+
tokenizer, query_encoder, sentence_encoder = load_finetuned_model()
|
| 46 |
+
relevant_sentences = ["Fingersoft's parent company is the Finger Group.",
|
| 47 |
+
"WHIRC – a subsidiary company of Wright-Hennepin",
|
| 48 |
+
"CK Life Sciences International (Holdings) Inc. (), or CK Life Sciences, is a subsidiary of CK Hutchison Holdings",
|
| 49 |
+
"EM Microelectronic-Marin (subsidiary of The Swatch Group).",
|
| 50 |
+
"The company is currently a division of the corporate group Jam Industries.",
|
| 51 |
+
"Volt Technical Resources is a business unit of Volt Workforce Solutions, a subsidiary of Volt Information Sciences (currently trading over-the-counter as VISI.)."
|
| 52 |
+
]
|
| 53 |
+
|
| 54 |
+
irrelevant_sentences = ["The second company is deemed to be a subsidiary of the parent company.",
|
| 55 |
+
"The company has gone through more than one incarnation.",
|
| 56 |
+
"The company is owned by its employees.",
|
| 57 |
+
"Larger companies compete for market share by acquiring smaller companies that may own a particular market sector.",
|
| 58 |
+
"A parent company is a company that owns 51% or more voting stock in another firm (or subsidiary).",
|
| 59 |
+
"It is a holding company that provides services through its subsidiaries in the following areas: oil and gas, industrial and infrastructure, government and power."
|
| 60 |
+
]
|
| 61 |
+
|
| 62 |
+
all_sentences = relevant_sentences + irrelevant_sentences
|
| 63 |
+
query = "<query>: A company is a part of a larger company."
|
| 64 |
+
|
| 65 |
+
embeddings = encode_batch(sentence_encoder, tokenizer, all_sentences, "cpu").detach().cpu().numpy()
|
| 66 |
+
query_embedding = encode_batch(query_encoder, tokenizer, [query], "cpu").detach().cpu().numpy()
|
| 67 |
+
|
| 68 |
+
sims = cosine_similarity(query_embedding, embeddings)[0]
|
| 69 |
+
sentences_sims = list(zip(all_sentences, sims))
|
| 70 |
+
sentences_sims.sort(key=lambda x: x[1], reverse=True)
|
| 71 |
+
|
| 72 |
+
for s, sim in sentences_sims:
|
| 73 |
+
print(s, sim)
|
| 74 |
+
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
Expected output (each sentence followed by its cosine similarity to the query, sorted from highest to lowest):
|
| 78 |
+
|
| 79 |
+
```
|
| 80 |
+
WHIRC – a subsidiary company of Wright-Hennepin 0.9396286
|
| 81 |
+
EM Microelectronic-Marin (subsidiary of The Swatch Group). 0.93929046
|
| 82 |
+
Fingersoft's parent company is the Finger Group. 0.936247
|
| 83 |
+
CK Life Sciences International (Holdings) Inc. (), or CK Life Sciences, is a subsidiary of CK Hutchison Holdings 0.9350312
|
| 84 |
+
The company is currently a division of the corporate group Jam Industries. 0.9273489
|
| 85 |
+
Volt Technical Resources is a business unit of Volt Workforce Solutions, a subsidiary of Volt Information Sciences (currently trading over-the-counter as VISI.). 0.9005086
|
| 86 |
+
The second company is deemed to be a subsidiary of the parent company. 0.6723645
|
| 87 |
+
It is a holding company that provides services through its subsidiaries in the following areas: oil and gas, industrial and infrastructure, government and power. 0.60081375
|
| 88 |
+
A parent company is a company that owns 51% or more voting stock in another firm (or subsidiary). 0.59490484
|
| 89 |
+
The company is owned by its employees. 0.55286574
|
| 90 |
+
The company has gone through more than one incarnation. 0.38889483
|
| 91 |
+
Larger companies compete for market share by acquiring smaller companies that may own a particular market sector. 0.25472647
|
| 92 |
```
|