| from sentence_transformers import SentenceTransformer | |
| from transformers import AutoTokenizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| from warnings import filterwarnings | |
# Suppress all warnings process-wide (e.g. HuggingFace "sequence longer than
# the model max length" warnings triggered by truncation=False below).
# NOTE(review): this silences EVERY warning in the process, not just the
# tokenizer's — consider narrowing to specific warning categories.
filterwarnings("ignore")
# Registry of supported sentence-transformer models: approximate download
# size, Hugging Face hub id, a rough efficiency rating, and the token chunk
# size used when splitting long texts for embedding.
models_info = {
    "MPNet-base-v2": {
        "model_size": "420MB",
        "model_url": "sentence-transformers/all-mpnet-base-v2",
        "efficiency": "Moderate",
        "chunk_size": 512,
    },
    "DistilRoBERTa-v1": {
        "model_size": "263MB",
        "model_url": "sentence-transformers/all-distilroberta-v1",
        "efficiency": "High",
        "chunk_size": 512,
    },
    "MiniLM-L12-v2": {
        "model_size": "118MB",
        "model_url": "sentence-transformers/all-MiniLM-L12-v2",
        "efficiency": "High",
        "chunk_size": 512,
    },
    "MiniLM-L6-v2": {
        "model_size": "82MB",
        "model_url": "sentence-transformers/all-MiniLM-L6-v2",
        "efficiency": "Very High",
        "chunk_size": 512,
    },
}

# Derived from the registry keys (insertion order preserved) instead of being
# maintained as a separate hand-written list, so the two can never drift apart.
models = list(models_info)
class Description_Validator:
    """Score the semantic similarity of two text descriptions.

    Texts longer than the model's chunk size are split into token chunks,
    each chunk is embedded, and the chunk embeddings are averaged before
    computing cosine similarity.
    """

    def __init__(self, model_name=None):
        """Load the sentence-transformer model and its tokenizer.

        Args:
            model_name: A key of ``models_info``. Defaults to
                "DistilRoBERTa-v1" when None (explicit None from callers is
                still accepted for backward compatibility).

        Raises:
            KeyError: If ``model_name`` is not a key of ``models_info``.
        """
        if model_name is None:
            model_name = "DistilRoBERTa-v1"
        self.model_info = models_info[model_name]
        model_url = self.model_info["model_url"]
        self.model = SentenceTransformer(model_url)
        self.tokenizer = AutoTokenizer.from_pretrained(model_url)
        # NOTE(review): chunk_size equals the model max length; decoded chunks
        # are re-tokenized by model.encode() WITH special tokens, so the last
        # couple of tokens of a full chunk may be truncated — confirm whether
        # chunk_size should leave headroom for special tokens.
        self.chunk_size = self.model_info["chunk_size"]

    def tokenize_and_chunk(self, text):
        """Tokenize ``text`` and split the ids into chunks of at most
        ``self.chunk_size`` tokens.

        Special tokens are excluded so chunk boundaries fall on real content.
        Returns an empty list when ``text`` tokenizes to nothing.
        """
        # truncation=False keeps the full sequence; padding is pointless for a
        # single un-batched string, so it is not requested.
        token_ids = self.tokenizer(
            text, truncation=False, add_special_tokens=False
        )["input_ids"]
        return [
            token_ids[i:i + self.chunk_size]
            for i in range(0, len(token_ids), self.chunk_size)
        ]

    def get_average_embedding(self, text):
        """Return the mean embedding over all token chunks of ``text``.

        Falls back to embedding the raw text when it produces no tokens
        (previously this path returned NaN via ``np.mean`` of an empty list).
        """
        token_chunks = self.tokenize_and_chunk(text)
        if not token_chunks:
            # Empty / whitespace-only input: embed the raw string directly.
            return self.model.encode(text, show_progress_bar=False)
        # Decode chunks back to text and encode them as one batch; encode()
        # batches natively, which is faster than a per-chunk Python loop and
        # yields the same per-chunk embeddings.
        chunk_texts = [self.tokenizer.decode(chunk) for chunk in token_chunks]
        chunk_embeddings = self.model.encode(chunk_texts, show_progress_bar=False)
        return np.mean(chunk_embeddings, axis=0)

    def similarity_score(self, desc1, desc2):
        """Return the cosine similarity (scalar) between the average
        embeddings of ``desc1`` and ``desc2``."""
        embedding1 = self.get_average_embedding(desc1).reshape(1, -1)
        embedding2 = self.get_average_embedding(desc2).reshape(1, -1)
        # cosine_similarity returns a 1x1 matrix; unwrap to a plain scalar.
        return cosine_similarity(embedding1, embedding2)[0][0]