File size: 1,046 Bytes
c891946
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 19 14:45:37 2023

@author: Hua
"""

import pandas as pd
import json
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
import numpy as np


def _parse_embedding(raw):
    """Decode one JSON-encoded embedding and cast it to float32."""
    values = json.loads(raw)
    return np.float32(np.array(values))


# Read the embeddings table from disk
df = pd.read_csv('RAMEmbeddings.csv')

# Pull out the GPT2Embeddings column (a pd.Series) and decode every entry
gpt2bds = df.GPT2Embeddings
gpt2list = list(map(_parse_embedding, gpt2bds))  # list of float32 arrays

# define the search function
_gpt2_model = None  # lazily-loaded SentenceTransformer, shared across calls


def _get_gpt2_model():
    """Return the sentence-embedding model, loading it on first use only."""
    global _gpt2_model
    if _gpt2_model is None:
        _gpt2_model = SentenceTransformer('sembeddings/model_gpt_trained')
    return _gpt2_model


def search(inputs, top_k=5):
    """Return the rows of ``df`` whose stored embeddings best match *inputs*.

    Args:
        inputs: Query handed to the model's ``encode``. It must encode to a
            single embedding (e.g. one string).
        top_k: Maximum number of rows to return. Defaults to 5.

    Returns:
        A DataFrame holding up to ``top_k`` rows of ``df``, ordered from the
        most to the least similar, with a fresh 0-based index. Fewer rows
        are returned when fewer stored embeddings exist.

    Raises:
        ValueError: If *inputs* encodes to more than one embedding.
    """
    # Never ask for more rows than there are stored embeddings.
    k = min(top_k, len(gpt2list))
    if k <= 0:
        return df.iloc[:0].reset_index(drop=True)

    # GPT2 embedding of the query
    embeddings = _get_gpt2_model().encode(inputs)

    # One vectorised call gives a (num_queries, num_rows) similarity matrix
    # instead of one cosine-similarity call per stored embedding.
    scores = util.pytorch_cos_sim(embeddings, np.stack(gpt2list))
    if scores.shape[0] != 1:
        raise ValueError(
            f"search() expects a single query, got {scores.shape[0]}"
        )
    sims = scores[0].cpu().numpy()

    # argpartition selects the k best in arbitrary order, so rank those k
    # explicitly to return the best match first.
    top = np.argpartition(sims, -k)[-k:]
    top = top[np.argsort(sims[top])[::-1]]

    # Positions in ``sims`` are row positions in ``df``, hence iloc.
    return df.iloc[top].reset_index(drop=True)