#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import gzip
import itertools
import multiprocessing
import csv
import pickle
import random
from sklearn.metrics.pairwise import cosine_similarity as cosine
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
from multiprocessing import Manager, Pool
from scipy.spatial.distance import cdist
from numpy.linalg import norm
from scipy.stats import spearmanr, pearsonr
from functools import partial

# similarity_list and proteinListNew are global, Manager-backed lists shared with the
# worker processes: similarity_list collects the computed similarity tuples and
# proteinListNew holds the proteins of the similarity matrix currently being processed.
manager = Manager()
similarity_list = manager.list()
proteinListNew = manager.list()

# Module-level configuration; these placeholders are expected to be set by the calling
# script before calculate_all_correlations() is invoked.
representation_dataframe = ""
protein_names = ""
representation_name = ""
similarity_tasks = ""
detailed_output = False
def parallelSimilarity(paramList):
    # Worker function: computes cosine, normalised Manhattan and normalised Euclidean
    # similarities between one pair of protein vectors and appends them, together with
    # the ground-truth semantic similarity, to the shared similarity_list.
    protein_embedding_dataframe = representation_dataframe
    i = paramList[0]
    j = paramList[1]
    aspect = paramList[2]
    if j > i:
        protein1 = proteinListNew[i]
        protein2 = proteinListNew[j]
        if protein1 in protein_names and protein2 in protein_names:
            prot1vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein1")['Vector'].item())
            prot2vec = np.asarray(protein_embedding_dataframe.query("Entry == @protein2")['Vector'].item())
            # cosine_similarity returns a matrix shaped by the first dimension of its
            # inputs, hence the reshape to (1, -1) and .item() to extract the scalar
            cos = cosine(prot1vec.reshape(1, -1), prot2vec.reshape(1, -1)).item()
            # Manhattan (cityblock) distance, normalised by the sum of the L1 norms and
            # converted to a similarity
            manhattanDist = cdist(prot1vec.reshape(1, -1), prot2vec.reshape(1, -1), 'cityblock')
            manhattanDistNorm = manhattanDist / (norm(prot1vec, 1) + norm(prot2vec, 1))
            manhattanSim = 1 - manhattanDistNorm.item()
            if norm(prot1vec, 1) == 0 and norm(prot2vec, 1) == 0:
                # both vectors are all-zero; treat them as identical instead of 0/0
                manhattanSim = 1.0
            # Euclidean distance, normalised by the sum of the L2 norms
            euclideanDist = cdist(prot1vec.reshape(1, -1), prot2vec.reshape(1, -1), 'euclidean')
            euclideanDistNorm = euclideanDist / (norm(prot1vec, 2) + norm(prot2vec, 2))
            euclidianSim = 1 - euclideanDistNorm.item()
            if norm(prot1vec, 1) == 0 and norm(prot2vec, 1) == 0:
                euclidianSim = 1.0
            real = paramList[3]
            # The ground-truth and computed similarities are appended as a single tuple
            # so they stay aligned; they are decoupled into separate lists later.
            similarity_list.append((real, cos, manhattanSim, euclidianSim))
    return similarity_list
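
# Illustration only (not part of the original pipeline): the normalised distances above
# follow sim = 1 - d(x, y) / (||x|| + ||y||), so identical vectors score 1 and larger
# distances score lower. A minimal, self-contained sketch using the imports above;
# the vectors and the helper name are made up for demonstration.
def _similarity_example():
    x = np.array([1.0, 0.0, 2.0])
    y = np.array([0.0, 1.0, 2.0])
    cos_sim = cosine(x.reshape(1, -1), y.reshape(1, -1)).item()                              # 0.8
    man_sim = 1 - cdist(x.reshape(1, -1), y.reshape(1, -1), 'cityblock').item() / (norm(x, 1) + norm(y, 1))  # ~0.667
    euc_sim = 1 - cdist(x.reshape(1, -1), y.reshape(1, -1), 'euclidean').item() / (norm(x, 2) + norm(y, 2))  # ~0.684
    return cos_sim, man_sim, euc_sim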
def calculateCorrelationforOntology(aspect, matrix_type):
    # Compute the Spearman correlation between the ground-truth GO-based semantic
    # similarities and the vector-space similarities over all protein pairs of the
    # selected similarity matrix.
    print("\n\nSemantic similarity correlation calculation for aspect: " + aspect +
          " using matrix/dataset: " + matrix_type + " ...\n")
    # Clear the shared lists before each aspect
    similarity_list[:] = []
    proteinListNew[:] = []
    # Map the matrix/dataset name to the corresponding pre-computed similarity matrix file
    similarityMatrixNameDict = {}
    similarityMatrixNameDict["All"] = "../data/preprocess/human_"+aspect+"_proteinSimilarityMatrix.csv"
    similarityMatrixNameDict["500"] = "../data/preprocess/human_"+aspect+"_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"
    similarityMatrixNameDict["Sparse"] = "../data/preprocess/human_"+aspect+"_proteinSimilarityMatrix_for_highest_annotated_500_proteins.csv"
    similarityMatrixNameDict["200"] = "../data/preprocess/human_"+aspect+"_proteinSimilarityMatrix_for_highest_annotated_200_proteins.csv"
    similarityMatrixFileName = similarityMatrixNameDict[matrix_type]
    human_proteinSimilarityMatrix = pd.read_csv(similarityMatrixFileName)
    human_proteinSimilarityMatrix.set_index(human_proteinSimilarityMatrix.columns, inplace=True)
    proteinList = human_proteinSimilarityMatrix.columns
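    # Assumed input layout (inferred from read_csv + set_index(columns), not stated in
    # this file): each *_proteinSimilarityMatrix*.csv is a square matrix whose header
    # row lists the protein accessions and whose i-th data row holds the similarities
    # of the i-th protein in the same order, e.g.
    #
    #   P12345,Q67890,O11111
    #   1.0,0.42,0.13
    #   0.42,1.0,0.27
    #   0.13,0.27,1.0
    #
    # so that set_index(columns) yields a symmetric, accession-indexed DataFrame.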
    # proteinListNew is shared with the worker processes through the Manager
    for prot in proteinList:
        proteinListNew.append(prot)
    if matrix_type == "Sparse":
        # For the sparse benchmark only a pre-selected subset of pairs is evaluated;
        # their (i, j) coordinates are loaded from the auxiliary input file.
        sparsified_similarity_coordinates = np.load("../data/auxilary_input/SparsifiedSimilarityCoordinates_"+aspect+"_for_highest_500.npy")
        protParamList = sparsified_similarity_coordinates
    else:
        i = range(len(proteinList))
        j = range(len(proteinList))
        protParamList = list(itertools.product(i, j))
    protParamListNew = []
    # Prepare the parameter tuples (i, j, aspect, ground-truth similarity) that will be
    # consumed concurrently by the worker processes
    for tup in tqdm(protParamList):
        i = tup[0]
        j = tup[1]
        if matrix_type == "Sparse":
            protein1 = proteinListNew[i]
            protein2 = proteinListNew[j]
            real = human_proteinSimilarityMatrix.loc[protein1, protein2]
            tupNew = (tup[0], tup[1], aspect, real)
            protParamListNew.append(tupNew)
        else:
            if j > i:
                protein1 = proteinListNew[i]
                protein2 = proteinListNew[j]
                real = human_proteinSimilarityMatrix.loc[protein1, protein2]
                tupNew = (tup[0], tup[1], aspect, real)
                protParamListNew.append(tupNew)
    total_task_num = len(protParamListNew)
    pool = Pool()
    similarity_listRet = []
    # imap_unordered distributes the pairs across the pool; each call returns the shared
    # similarity_list, so the value left in similarity_listRet holds all appended tuples
    for similarity_listRet in tqdm(pool.imap_unordered(parallelSimilarity, protParamListNew),
                                   total=total_task_num, position=0, leave=True):
        pass
    pool.close()
    pool.join()
    # Decouple the collected tuples into one list per similarity measure
    real_distance_list = [value[0] for value in similarity_listRet]
    cosine_distance_list = [value[1] for value in similarity_listRet]
    manhattan_distance_list = [value[2] for value in similarity_listRet]
    euclidian_distance_list = [value[3] for value in similarity_listRet]
    distance_lists = [real_distance_list, cosine_distance_list, manhattan_distance_list, euclidian_distance_list]
    if detailed_output:
        report_detailed_distance_scores(representation_name, matrix_type, aspect, distance_lists)
    # Spearman rank correlation of each vector-space similarity against the ground truth;
    # each result is a (correlation, p-value) pair
    cosineCorr = spearmanr(real_distance_list, cosine_distance_list)
    manhattanCorr = spearmanr(real_distance_list, manhattan_distance_list)
    euclidianCorr = spearmanr(real_distance_list, euclidian_distance_list)
    return (cosineCorr, manhattanCorr, euclidianCorr)
def report_detailed_distance_scores(representation_name, similarity_matrix_type, aspect, distance_lists):
    # Optionally dump the raw per-pair similarity lists for later inspection
    saveFileName = "../results/Semantic_sim_inference_detailed_distance_scores"+aspect+"_"+similarity_matrix_type+"_"+representation_name+".pkl"
    with open(saveFileName, "wb") as f:
        pickle.dump(distance_lists, f)
def calculate_all_correlations():
    # Run the correlation analysis for every requested matrix type and every GO aspect,
    # writing one CSV of Spearman correlations (with p-values) per matrix type.
    for similarity_matrix_type in similarity_tasks:
        saveFileName = "../results/Semantic_sim_inference_"+similarity_matrix_type+"_"+representation_name+".csv"
        # write the header once, then append one row per GO aspect as results arrive
        with open(saveFileName, 'w') as f:
            f.write("Semantic Aspect,CosineSim_Correlation,CosineSim_Correlation p-value,"
                    "ManhattanSim_Correlation,ManhattanSim_Correlation p-value,"
                    "EuclidianSim_Correlation,EuclidianSim_Correlation p-value\n")
        for aspect in ["MF", "BP", "CC"]:
            corr = calculateCorrelationforOntology(aspect, similarity_matrix_type)
            row = (aspect + "," +
                   str(round(corr[0][0], 5)) + "," + str(round(corr[0][1], 5)) + "," +
                   str(round(corr[1][0], 5)) + "," + str(round(corr[1][1], 5)) + "," +
                   str(round(corr[2][0], 5)) + "," + str(round(corr[2][1], 5)) + "\n")
            with open(saveFileName, 'a') as f:
                f.write(row)
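
# Minimal usage sketch (an assumption about the calling convention, not part of the
# original script): the module-level names above are expected to be filled in by a
# driver script before calculate_all_correlations() is called. The module and file
# names below are hypothetical.
#
#   import pandas as pd
#   import semantic_similarity_infer as ssi              # hypothetical module name
#
#   # 'Entry' holds UniProt accessions, 'Vector' holds the per-protein embedding
#   rep_df = pd.read_csv("my_representation.csv")        # hypothetical input file
#   rep_df['Vector'] = rep_df['Vector'].apply(eval)      # if vectors are stored as strings
#
#   ssi.representation_dataframe = rep_df
#   ssi.protein_names = rep_df['Entry'].tolist()
#   ssi.representation_name = "my_representation"
#   ssi.similarity_tasks = ["Sparse", "200", "500"]      # keys of similarityMatrixNameDict
#   ssi.detailed_output = False
#   ssi.calculate_all_correlations()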