Spaces:
Runtime error
Runtime error
| from calendar import c | |
| from transformers import AlbertForMaskedLM, AlbertTokenizer, pipeline | |
| import numpy as np | |
| import gradio as gr | |
## Load the model
# Tokenizer and masked-language-model weights both come from the
# "Rostlab/prot_albert" checkpoint; lower-casing is disabled on the tokenizer.
tokenizer = AlbertTokenizer.from_pretrained(
    "Rostlab/prot_albert",
    do_lower_case=False,
)
model = AlbertForMaskedLM.from_pretrained("Rostlab/prot_albert")

# Fill-mask pipeline; top_k=21 requests the 21 highest-scoring candidates
# for each masked position.
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=21,
)
## Initialization
# One-letter residue codes in profile row order; 'X' is the last entry.
codes = [
    'V', 'L', 'I', 'M', 'F', 'W', 'Y', 'G', 'A', 'P',
    'S', 'T', 'C', 'H', 'R', 'K', 'Q', 'E', 'N', 'D',
    'X',
]

# Column names: three leading fields, the 20 residue codes (everything in
# `codes` except 'X'), then the trailing fields.
header = [
    'SeqNo', 'PDB', 'No',
    *codes[:-1],
    'NOCC', 'NDEL', 'NINS', 'ENTROPY', 'RELENT', 'WEIGHT', 'CHAIN', 'AUTHCHAIN',
]

# Sequence positions to leave out of prediction; empty by default.
rem = []

# Maps each residue code to its row index in the predicted profile.
Hash1 = {code: row for row, code in enumerate(codes)}
def ReadfastaFile(filename):
    """Read a FASTA file and return its sequence data as one upper-case string.

    Every line that contains ">" is treated as a header line and skipped.
    All other lines have trailing whitespace removed and are concatenated
    in file order.

    Args:
        filename: Path of the file to read.

    Returns:
        The concatenated, upper-cased sequence text ("" if the file has no
        sequence lines).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if reading raises.
    with open(filename, "r") as handle:
        chunks = [line.rstrip() for line in handle if ">" not in line]
    return "".join(chunks).upper()
def Predict_profile(sequence, header=header, rem=rem, Hash1=Hash1):
    """Predict a per-position residue profile, stored as truncated percentages.

    Each position of ``sequence`` (except those listed in ``rem``) is replaced
    in turn by ``[MASK]`` and the space-separated sequence is passed to the
    module-level ``fill_mask`` pipeline. For every returned candidate whose
    token (with the "▁" marker removed) is a key of ``Hash1``, the value
    ``int(score * 100)`` is written to ``profile[Hash1[token]][position]``.
    Candidates whose token is not in ``Hash1`` are printed and ignored.

    Args:
        sequence: Sequence to profile; each character is one position.
        header: Unused; kept so existing callers that pass it keep working.
        rem: Position indices to skip. Their columns are deleted from the
            returned array.
        Hash1: Mapping from residue code to row index.

    Returns:
        A NumPy float array of shape ``(len(Hash1), len(sequence))`` with the
        columns listed in ``rem`` removed.
    """
    # Set membership avoids rescanning `rem` for every position.
    skipped = set(rem)
    residues = list(sequence)
    pred_Profile = np.zeros((len(Hash1), len(residues)))

    for i in range(len(residues)):
        if i in skipped:
            continue

        # Mask exactly one position and query the model.
        masked = list(residues)
        masked[i] = '[MASK]'
        for candidate in fill_mask(" ".join(masked)):
            token = candidate['token_str'].replace("▁", "")
            if token in Hash1:
                pred_Profile[Hash1[token]][i] = int(candidate['score'] * 100)
            else:
                # Token outside the residue alphabet: report it and move on.
                print(i, token)
        print(i)  # progress indicator

    # len() rather than truthiness so that array-like `rem` values also work.
    if len(rem) != 0:
        pred_Profile = np.delete(pred_Profile, rem, 1)
    return pred_Profile
def Predict_profile1(sequence, header=header, rem=rem, Hash1=Hash1):
    """Predict a per-position residue profile using the raw model scores.

    Each position of ``sequence`` (except those listed in ``rem``) is replaced
    in turn by ``[MASK]`` and the space-separated sequence is passed to the
    module-level ``fill_mask`` pipeline. The score of every returned candidate
    is added to the row of its token (with the "▁" marker removed); tokens
    that are not keys of ``Hash1`` are pooled into the ``'X'`` row.

    Args:
        sequence: Sequence to profile; each character is one position.
        header: Unused; kept so existing callers that pass it keep working.
        rem: Position indices to skip. Their columns are deleted from the
            returned array.
        Hash1: Mapping from residue code to row index. It needs an ``'X'``
            entry for pooling tokens outside the mapping.

    Returns:
        A NumPy float array of shape ``(len(Hash1), len(sequence))`` with the
        columns listed in ``rem`` removed.

    Raises:
        KeyError: If a token outside ``Hash1`` is predicted and ``Hash1`` has
            no ``'X'`` entry.
    """
    # Set membership avoids rescanning `rem` for every position.
    skipped = set(rem)
    residues = list(sequence)
    pred_Profile = np.zeros((len(Hash1), len(residues)))

    for i in range(len(residues)):
        if i in skipped:
            continue

        # Mask exactly one position and query the model.
        masked = list(residues)
        masked[i] = '[MASK]'
        for candidate in fill_mask(" ".join(masked)):
            token = candidate['token_str'].replace("▁", "")
            # Rows are addressed by integer index; unknown tokens go to the
            # 'X' row (indexing the array with the string 'X' raises).
            row = Hash1[token] if token in Hash1 else Hash1['X']
            # Accumulate so that pooled unknown-token scores and a literal
            # 'X' prediction add up regardless of the order they arrive in.
            pred_Profile[row][i] += candidate['score']
        print(i)  # progress indicator

    # len() rather than truthiness so that array-like `rem` values also work.
    if len(rem) != 0:
        pred_Profile = np.delete(pred_Profile, rem, 1)
    return pred_Profile
def print_func(sequence):
    """Return the predicted profile of ``sequence`` as plain text.

    The input is normalised first: all whitespace is removed and letters are
    upper-cased, so pasted multi-line or lower-case sequences are accepted.
    The profile from ``Predict_profile1`` is then rendered one row per line:
    the residue code from ``codes`` followed by that row's values, separated
    by single spaces.

    Values are formatted individually instead of through ``str(ndarray)``,
    which wraps long rows with embedded newlines, leaves empty fields from
    column padding, and abbreviates rows over 1000 values with "...".

    Args:
        sequence: Sequence text entered by the user.

    Returns:
        A newline-separated string with one line per profile row.
    """
    cleaned = "".join(sequence.split()).upper()
    profile = Predict_profile1(cleaned)

    lines = []
    for code, row in zip(codes, profile):
        fields = [str(code)]
        fields.extend("{:.6g}".format(value) for value in row)
        lines.append(" ".join(fields))
    return "\n".join(lines)
# Texts shown on the Gradio page.
title = "Protein sequence profile prediction using ProtAlbert transformer"
description = """Please enter the sequence.
* Prediction process can take longer for long sequences.
"""

# One text input, one text output; print_func produces the result.
iface = gr.Interface(
    fn=print_func,
    inputs=["text"],
    outputs="text",
    title=title,
    description=description,
)
iface.launch()