Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import os | |
| import gc | |
| import json | |
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| from collections import Counter | |
| from sklearn.preprocessing import LabelEncoder | |
| from keras.models import Model | |
| from keras.regularizers import l2 | |
| from keras.constraints import max_norm | |
| from keras.utils import to_categorical | |
| from keras.preprocessing.text import Tokenizer | |
| from keras.utils import pad_sequences | |
| from keras.callbacks import EarlyStopping | |
| from keras.layers import Input, Dense, Dropout, Flatten, Activation | |
| from keras.layers import Conv1D, Add, MaxPooling1D, BatchNormalization | |
| from keras.layers import Embedding, Bidirectional, LSTM, CuDNNLSTM, GlobalMaxPooling1D | |
| import tensorflow as tf | |
| from huggingface_hub import hf_hub_url, cached_download | |
class Sequence:
    """Convert a raw amino-acid string into the arrays fed to the two models."""

    # Integer code (1..20) for each of the 20 amino-acid letters handled here.
    # Code 0 is shared by padding and by every character not in this table.
    codes = {c: i for i, c in enumerate('ACDEFGHIKLMNPQRSTVWY', start=1)}

    @classmethod
    def integer_encoding(cls, data):
        """
        Encode a sequence of residue letters as integer codes.

        data: iterable of single-character residue codes (typically a str).

        Returns a NumPy array with one entry per character: the value from
        ``codes`` for the 20 listed amino acids and 0 for anything else.
        """
        return np.array([cls.codes.get(code, 0) for code in data])

    @classmethod
    def prepare(cls, sequence):
        """
        Build both model inputs for one sequence.

        sequence: amino-acid string; surrounding whitespace is stripped and
            letters are upper-cased before encoding.

        Returns ``(padded_ie, one_hot)`` where ``padded_ie`` is the integer
        encoding padded/truncated at the end to 100 positions (a batch of
        one), and ``one_hot`` is its one-hot expansion over 21 classes.
        """
        sequence = sequence.strip().upper()
        ie = cls.integer_encoding(sequence)
        max_length = 100
        padded_ie = pad_sequences([ie], maxlen=max_length, padding='post', truncating='post')
        # Fix the one-hot width explicitly (20 residues + the 0 class) so the
        # result is always 21 wide, even when the input does not contain the
        # highest code.
        num_classes = len(cls.codes) + 1
        return padded_ie, to_categorical(padded_ie, num_classes=num_classes)
def residual_block(data, filters, d_rate):
    """
    Residual unit: two BatchNorm -> ReLU -> Conv1D stages whose output is
    added back onto the untouched input.

    data: input tensor; it is also the skip branch, so it presumably needs
        `filters` channels for the final Add to be shape-compatible
    filters: number of filters used by both convolutions
    d_rate: dilation rate given to the first convolution
    """
    # stage 1: kernel size 1, dilated by d_rate
    stage = BatchNormalization()(data)
    stage = Activation('relu')(stage)
    stage = Conv1D(
        filters,
        1,
        dilation_rate=d_rate,
        padding='same',
        kernel_regularizer=l2(0.001),
    )(stage)

    # stage 2: kernel size 3, no dilation
    stage = BatchNormalization()(stage)
    stage = Activation('relu')(stage)
    stage = Conv1D(
        filters,
        3,
        padding='same',
        kernel_regularizer=l2(0.001),
    )(stage)

    # skip connection: convolved branch + original input
    return Add()([stage, data])
def get_model():
    """
    Build the convolutional classifier and load its weights.

    The network takes a (100, 21) one-hot input, applies a 1x1 convolution,
    two residual blocks (dilation rates 2 and 3), max-pooling, dropout and a
    1000-way softmax layer. Weights come from 'model2.h5' in the
    "jonathang/Protein_Family_Models" hub repository.

    Returns the compiled Keras model with weights loaded.
    """
    inputs = Input(shape=(100, 21))

    # initial 1x1 convolution lifts the 21 one-hot channels to 128
    features = Conv1D(128, 1, padding='same')(inputs)

    # two residual blocks, created in the same order as before
    for rate in (2, 3):
        features = residual_block(features, 128, rate)

    features = MaxPooling1D(3)(features)
    features = Dropout(0.5)(features)

    # softmax classifier over the flattened features
    features = Flatten()(features)
    outputs = Dense(1000, activation='softmax', kernel_regularizer=l2(0.0001))(features)

    cnn = Model(inputs=inputs, outputs=outputs)
    cnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # NOTE(review): cached_download may be unavailable in newer
    # huggingface_hub releases - confirm against the pinned version.
    weights_url = hf_hub_url("jonathang/Protein_Family_Models", 'model2.h5')
    cnn.load_weights(cached_download(weights_url))
    return cnn
def get_lstm_model():
    """
    Build the bidirectional-LSTM classifier and load its weights.

    The network takes 100 integer codes, embeds them (21 -> 128), runs a
    bidirectional LSTM with 64 units per direction, applies dropout and a
    1000-way softmax layer. Weights come from 'model1.h5' in the
    "jonathang/Protein_Family_Models" hub repository.

    Returns the compiled Keras model with weights loaded.
    """
    inputs = Input(shape=(100,))
    embedded = Embedding(21, 128, input_length=100)(inputs)

    recurrent = LSTM(
        64,
        kernel_regularizer=l2(0.01),
        recurrent_regularizer=l2(0.01),
        bias_regularizer=l2(0.01),
    )
    encoded = Bidirectional(recurrent)(embedded)
    # bi_rnn = CuDNNLSTM(64, kernel_regularizer=l2(0.01), recurrent_regularizer=l2(0.01), bias_regularizer=l2(0.01))(emb)
    encoded = Dropout(0.3)(encoded)

    # softmax classifier
    outputs = Dense(1000, activation='softmax')(encoded)

    rnn = Model(inputs=inputs, outputs=outputs)
    rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # NOTE(review): cached_download may be unavailable in newer
    # huggingface_hub releases - confirm against the pinned version.
    weights_url = hf_hub_url("jonathang/Protein_Family_Models", 'model1.h5')
    rnn.load_weights(cached_download(weights_url))
    return rnn
# Build both classifiers once at import time so every request reuses them.
cnn_model = get_model()
lstm_model = get_lstm_model()

# Lookup tables read by greet(): 'id2fam_asc' (class index as a string ->
# family accession) and 'fam_asc2fam_id' (family accession -> family ID).
mappings_path = cached_download(hf_hub_url("jonathang/Protein_Family_Models", 'prot_mappings.json'))
# Decode as UTF-8 explicitly rather than relying on the platform's locale
# default encoding.
with open(mappings_path, encoding="utf-8") as f:
    prot_mappings = json.load(f)
def greet(Amino_Acid_Sequence):
    """
    Classify one amino-acid sequence with the CNN, the LSTM and their blend.

    Amino_Acid_Sequence: the sequence text entered in the UI.

    Returns a 4-tuple:
      - ensemble probabilities (0.7 * CNN + 0.3 * LSTM) as {label: float}
      - CNN probabilities as {label: float}
      - LSTM probabilities as {label: float}
      - a plain-text report of the input, the top family chosen by each
        model, and the raw ensemble vector
    where each label is "<family accession> <family id>".
    """
    padded_seq, processed_seq = Sequence.prepare(Amino_Acid_Sequence)

    # Each predict() call returns a batch of one; keep the single row.
    cnn_raw_prediction = cnn_model.predict(processed_seq)[0]
    lstm_raw_prediction = lstm_model.predict(padded_seq)[0]
    joined_prediction = cnn_raw_prediction*0.7 + lstm_raw_prediction*0.3

    # Fetch the two lookup tables once instead of on every access.
    id2fam_asc = prot_mappings['id2fam_asc']
    fam_asc2fam_id = prot_mappings['fam_asc2fam_id']

    def top_family(scores):
        # Accession and ID of the highest-scoring class.
        asc = id2fam_asc[str(scores.argmax())]
        return asc, fam_asc2fam_id[asc]

    def label_probs(scores):
        # Map "<accession> <id>" -> probability for every class index.
        table = {}
        for i, score in enumerate(scores):
            asc = id2fam_asc[str(i)]
            table[asc + ' ' + fam_asc2fam_id[asc]] = float(score)
        return table

    cnn_fam_asc, cnn_fam_id = top_family(cnn_raw_prediction)
    lstm_fam_asc, lstm_fam_id = top_family(lstm_raw_prediction)
    fam_asc, fam_id = top_family(joined_prediction)

    joined_probs = label_probs(joined_prediction)
    cnn_probs = label_probs(cnn_raw_prediction)
    lstm_probs = label_probs(lstm_raw_prediction)

    gc.collect()
    return joined_probs, cnn_probs, lstm_probs, f"""
Input is {Amino_Acid_Sequence}.
Processed input is:
{processed_seq}
CNN says: Family Accession={cnn_fam_asc} and ID={cnn_fam_id}
LSTM says: Family Accession={lstm_fam_asc} and ID={lstm_fam_id}
0.7 * cnn and 0.3 * lstm ensemble model makes prediction which maps to:
Family Accession={fam_asc} and ID={fam_id}
Raw Joined Prediction:
{joined_prediction}
"""
# Web UI: one text box in; three top-5 label panels (ensemble, CNN, LSTM)
# plus the plain-text report out, in the order greet() returns them.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs=[
        gr.Label(num_top_classes=5, label="Ensemble Family Predictions"),
        gr.Label(num_top_classes=5, label="CNN Family Predictions"),
        gr.Label(num_top_classes=5, label="LSTM Family Predictions"),
        "text",
    ],
)
iface.launch()