#ref: https://huggingface.co/blog/AmelieSchreiber/esmbind

import gradio as gr
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
#import wandb
import numpy as np
import torch
import torch.nn as nn
import pickle
import xml.etree.ElementTree as ET
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score,
    matthews_corrcoef
)
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from peft import PeftModel
from datasets import Dataset
from accelerate import Accelerator

# Imports specific to the custom PEFT LoRA model
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType

from plot_pdb import plot_struc
def suggest(option):
    if option == "Plastic degradation protein":
        suggestion = "MGSSHHHHHHSSGLVPRGSHMRGPNPTAASLEASAGPFTVRSFTVSRPSGYGAGTVYYPTNAGGTVGAIAIVPGYTARQSSIKWWGPRLASHGFVVITIDTNSTLDQPSSRSSQQMAALRQVASLNGTSSSPIYGKVDTARMGVMGWSMGGGGSLISAANNPSLKAAAPQAPWDSSTNFSSVTVPTLIFACENDSIAPVNSSALPIYDSMSRNAKQFLEINGGSHSCANSGNSNQALIGKKGVAWMKRFMDNDTRYSTFACENPNSTRVSDFRTANCSLEDPAANKARKEAELAAATAEQ"
    elif option == "Default protein":
        #suggestion = "MAPLRKTYVLKLYVAGNTPNSVRALKTLNNILEKEFKGVYALKVIDVLKNPQLAEEDKILATPTLAKVLPPPVRRIIGDLSNREKVLIGLDLLYEEIGDQAEDDLGLE"
        suggestion = "MAVPETRPNHTIYINNLNEKIKKDELKKSLHAIFSRFGQILDILVSRSLKMRGQAFVIFKEVSSATNALRSMQGFPFYDKPMRIQYAKTDSDIIAKMKGT"
    elif option == "Antifreeze protein":
        suggestion = "QCTGGADCTSCTGACTGCGNCPNAVTCTNSQHCVKANTCTGSTDCNTAQTCTNSKDCFEANTCTDSTNCYKATACTNSSGCPGH"
    elif option == "AI Generated protein":
        suggestion = "MSGMKKLYEYTVTTLDEFLEKLKEFILNTSKDKIYKLTITNPKLIKDIGKAIAKAAEIADVDPKEIEEMIKAVEENELTKLVITIEQTDDKYVIKVELENEDGLVHSFEIYFKNKEEMEKFLELLEKLISKLSGS"
    elif option == "7-bladed propeller fold":
        suggestion = "VKLAGNSSLCPINGWAVYSKDNSIRIGSKGDVFVIREPFISCSHLECRTFFLTQGALLNDKHSNGTVKDRSPHRTLMSCPVGEAPSPYNSRFESVAWSASACHDGTSWLTIGISGPDNGAVAVLKYNGIITDTIKSWRNNILRTQESECACVNGSCFTVMTDGPSNGQASYKIFKMEKGKVVKSVELDAPNYHYEECSCYPNAGEITCVCRDNWHGSNRPWVSFNQNLEYQIGYICSGVFGDNPRPNDGTGSCGPVSSNGAYGVKGFSFKYGNGVWIGRTKSTNSRSGFEMIWDPNGWTETDSSFSVKQDIVAITDWSGYSGSFVQHPELTGLDCIRPCFWVELIRGRPKESTIWTSGSSISFCGVNSDTVGWSWPDGAELPFTIDK"
    else:
        suggestion = ""

    return suggestion
# Helper Functions and Data Preparation
def truncate_labels(labels, max_length):
    """Truncate labels to the specified max_length."""
    return [label[:max_length] for label in labels]
def compute_metrics(p):
    """Compute metrics for evaluation."""
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove padding (-100 labels)
    predictions = predictions[labels != -100].flatten()
    labels = labels[labels != -100].flatten()

    # Compute accuracy
    accuracy = accuracy_score(labels, predictions)

    # Compute precision, recall, F1 score, and AUC
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    auc = roc_auc_score(labels, predictions)

    # Compute MCC
    mcc = matthews_corrcoef(labels, predictions)

    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1, 'auc': auc, 'mcc': mcc}
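# Note on the metrics above: every score is computed only over real residues
# (positions whose label is not -100), and the AUC is derived from the hard
# argmax predictions rather than class probabilities, so it is a coarse estimate.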
def compute_loss(model, inputs):
    """Custom compute_loss function."""
    logits = model(**inputs).logits
    labels = inputs["labels"]
    loss_fct = nn.CrossEntropyLoss(weight=class_weights)
    active_loss = inputs["attention_mask"].view(-1) == 1
    active_logits = logits.view(-1, model.config.num_labels)
    active_labels = torch.where(
        active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
    )
    loss = loss_fct(active_logits, active_labels)
    return loss
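# How the masking above works, in brief: positions where attention_mask == 0
# (padding) have their label replaced by CrossEntropyLoss's ignore_index (-100),
# so only real residues contribute to the class-weighted loss. For example, a
# batch row with attention_mask = [1, 1, 1, 0, 0] keeps its first three labels
# and ignores the last two. (Illustrative values, not taken from the dataset.)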
# Define Custom Trainer Class
# Because the dataset is imbalanced (far more non-binding residues than binding
# residues), we use class weights and therefore need a custom weighted Trainer.
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        loss = compute_loss(model, inputs)
        return (loss, outputs) if return_outputs else loss
# Predict binding sites with a fine-tuned PEFT model
def predict_bind(base_model_path, PEFT_model_path, input_seq):
    # Load the base model and wrap it with the LoRA adapter
    base_model = AutoModelForTokenClassification.from_pretrained(base_model_path)
    loaded_model = PeftModel.from_pretrained(base_model, PEFT_model_path)

    # Ensure the model is in evaluation mode
    loaded_model.eval()

    # Tokenization
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)

    # Tokenize the sequence
    inputs = tokenizer(input_seq, return_tensors="pt", truncation=True, max_length=1024, padding='max_length')

    # Run the model
    with torch.no_grad():
        logits = loaded_model(**inputs).logits

    # Get predictions
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])  # Convert input ids back to tokens
    predictions = torch.argmax(logits, dim=2)

    binding_site = []
    pos = 0
    # Print the predicted label for each token and collect the predicted binding sites
    for token, prediction in zip(tokens, predictions[0].numpy()):
        if token not in ['<pad>', '<cls>', '<eos>']:
            pos += 1
            print((pos, token, id2label[prediction]))
            if prediction == 1:
                print((pos, token, id2label[prediction]))
                binding_site.append([pos, token, id2label[prediction]])

    return binding_site
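# Minimal usage sketch (not executed here; model names are taken from the
# option lists defined below, and the sequence is abbreviated for illustration):
#   predict_bind("facebook/esm2_t12_35M_UR50D",
#                "AmelieSchreiber/esm2_t12_35M_lora_binding_sites_v2_cp3",
#                "MAVPETRPNHTIYINNLNEKIKKDELKK...")
# returns a list of [position, residue, label] entries for predicted binding sites.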
# Fine-tuning function
def train_function_no_sweeps(base_model_path):  #, train_dataset, test_dataset):
    # Set the LoRA config
    config = {
        "lora_alpha": 1,  # try 0.5, 1, 2, ..., 16
        "lora_dropout": 0.2,
        "lr": 5.701568055793089e-04,
        "lr_scheduler_type": "cosine",
        "max_grad_norm": 0.5,
        "num_train_epochs": 1,  #3, jw 20240628
        "per_device_train_batch_size": 12,
        "r": 2,
        "weight_decay": 0.2,
        # Add other hyperparameters as needed
    }
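    # Note (general LoRA behaviour, not something configured in this repo): with
    # PEFT's standard scaling the adapter update is multiplied by lora_alpha / r,
    # so the values above (lora_alpha=1, r=2) give an effective scaling of 0.5;
    # raising lora_alpha or lowering r strengthens the adapter's contribution.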
    base_model = AutoModelForTokenClassification.from_pretrained(base_model_path, num_labels=len(id2label), id2label=id2label, label2id=label2id)

    # Tokenization
    tokenizer = AutoTokenizer.from_pretrained(base_model_path)  #("facebook/esm2_t12_35M_UR50D")
    train_tokenized = tokenizer(train_sequences, padding=True, truncation=True, max_length=max_sequence_length, return_tensors="pt", is_split_into_words=False)
    test_tokenized = tokenizer(test_sequences, padding=True, truncation=True, max_length=max_sequence_length, return_tensors="pt", is_split_into_words=False)

    train_dataset = Dataset.from_dict({k: v for k, v in train_tokenized.items()}).add_column("labels", train_labels)
    test_dataset = Dataset.from_dict({k: v for k, v in test_tokenized.items()}).add_column("labels", test_labels)

    # Convert the model into a PeftModel
    peft_config = LoraConfig(
        task_type=TaskType.TOKEN_CLS,
        inference_mode=False,
        r=config["r"],
        lora_alpha=config["lora_alpha"],
        target_modules=["query", "key", "value"],  # also try "dense_h_to_4h" and "dense_4h_to_h"
        lora_dropout=config["lora_dropout"],
        bias="none"  # or "all" or "lora_only"
    )
    base_model = get_peft_model(base_model, peft_config)
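    # Optional sanity check (standard PEFT utility; left commented out so behaviour is unchanged):
    # base_model.print_trainable_parameters()  # prints how few parameters LoRA leaves trainable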
    # Use the accelerator
    base_model = accelerator.prepare(base_model)
    train_dataset = accelerator.prepare(train_dataset)
    test_dataset = accelerator.prepare(test_dataset)

    model_name_base = base_model_path.split("/")[1]
    timestamp = datetime.now().strftime('%Y-%m-%d_%H')
    save_path = f"{model_name_base}-lora-binding-sites_{timestamp}"

    # Training setup
    training_args = TrainingArguments(
        output_dir=save_path,  #f"{model_name_base}-lora-binding-sites_{timestamp}",
        learning_rate=config["lr"],
        lr_scheduler_type=config["lr_scheduler_type"],
        gradient_accumulation_steps=1,
        max_grad_norm=config["max_grad_norm"],
        per_device_train_batch_size=config["per_device_train_batch_size"],
        per_device_eval_batch_size=config["per_device_train_batch_size"],
        num_train_epochs=config["num_train_epochs"],
        weight_decay=config["weight_decay"],
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        push_to_hub=True,  #jw 20240701 False,
        logging_dir=None,
        logging_first_step=False,
        logging_steps=200,
        save_total_limit=7,
        no_cuda=False,
        seed=8893,
        fp16=True,
        #report_to='wandb'
        report_to="none",  # "none" disables reporting integrations; None would report to all installed ones
        hub_token=HF_TOKEN,  #jw 20240701
    )

    # Initialize Trainer
    trainer = WeightedTrainer(
        model=base_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    # Train and save the model (push_to_hub handles uploading checkpoints)
    trainer.train()
    return save_path
# Constants & Globals
HF_TOKEN = os.environ.get("HF_token")
print("HF_TOKEN is set:", HF_TOKEN is not None)  # avoid printing the secret itself

MODEL_OPTIONS = [
    "facebook/esm2_t6_8M_UR50D",
    "facebook/esm2_t12_35M_UR50D",
    "facebook/esm2_t33_650M_UR50D",
]  # models users can choose from

PEFT_MODEL_OPTIONS = [
    "wangjin2000/esm2_t6_8M-lora-binding-sites_2024-07-02_09-26-54",
    "AmelieSchreiber/esm2_t12_35M_lora_binding_sites_v2_cp3",
]  # fine-tuned models

# Load the data from pickle files (replace with your local paths)
with open("./datasets/train_sequences_chunked_by_family.pkl", "rb") as f:
    train_sequences = pickle.load(f)

with open("./datasets/test_sequences_chunked_by_family.pkl", "rb") as f:
    test_sequences = pickle.load(f)

with open("./datasets/train_labels_chunked_by_family.pkl", "rb") as f:
    train_labels = pickle.load(f)

with open("./datasets/test_labels_chunked_by_family.pkl", "rb") as f:
    test_labels = pickle.load(f)
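# Expected pickle contents (inferred from how they are used below, not verified
# against the files themselves): each *_sequences file holds a list of protein
# sequence strings, and each *_labels file holds a matching list of per-residue
# 0/1 lists (1 = binding residue).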
max_sequence_length = 1000

# Directly truncate the entire list of labels
train_labels = truncate_labels(train_labels, max_sequence_length)
test_labels = truncate_labels(test_labels, max_sequence_length)

# Compute Class Weights
classes = [0, 1]
flat_train_labels = [label for sublist in train_labels for label in sublist]
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=flat_train_labels)

accelerator = Accelerator()
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(accelerator.device)
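# For reference, sklearn's 'balanced' heuristic assigns each class the weight
# n_samples / (n_classes * n_c). As a purely illustrative example, if 90% of
# residues were non-binding, class 0 would get a weight of about 0.56 and
# class 1 about 5.0, so the rare binding residues count more in the loss.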
# Define labels and model
id2label = {0: "No binding site", 1: "Binding site"}
label2id = {v: k for k, v in id2label.items()}

'''
# debug result
dubug_result = saved_path  #predictions #class_weights
'''
demo = gr.Blocks(title="DEMO FOR ESM2Bind")

with demo:
    gr.Markdown("# DEMO FOR ESM2Bind")
    #gr.Textbox(dubug_result)

    with gr.Column():
        gr.Markdown("## Select a base model and a corresponding PEFT fine-tuned model")
        with gr.Row():
            with gr.Column(scale=5, variant="compact"):
                base_model_name = gr.Dropdown(
                    choices=MODEL_OPTIONS,
                    value=MODEL_OPTIONS[0],
                    label="Base Model Name",
                    interactive=True,
                )
                PEFT_model_name = gr.Dropdown(
                    choices=PEFT_MODEL_OPTIONS,
                    value=PEFT_MODEL_OPTIONS[0],
                    label="PEFT Model Name",
                    interactive=True,
                )
            with gr.Column(scale=5, variant="compact"):
                name = gr.Dropdown(
                    label="Choose a Sample Protein",
                    value="Default protein",
                    choices=["Default protein", "Antifreeze protein", "Plastic degradation protein", "AI Generated protein", "7-bladed propeller fold", "custom"]
                )

    gr.Markdown(
        "## Predict binding sites and plot the structure for the selected protein sequence:"
    )
    with gr.Row():
        with gr.Column(variant="compact", scale=8):
            input_seq = gr.Textbox(
                lines=1,
                max_lines=12,
                label="Protein sequence to be predicted:",
                value="MAVPETRPNHTIYINNLNEKIKKDELKKSLHAIFSRFGQILDILVSRSLKMRGQAFVIFKEVSSATNALRSMQGFPFYDKPMRIQYAKTDSDIIAKMKGT",
                placeholder="Paste your protein sequence here...",
                interactive=True,
            )
            text_pos = gr.Textbox(
                lines=1,
                max_lines=12,
                label="Sequence position:",
                placeholder=
                "012345678911234567892123456789312345678941234567895123456789612345678971234567898123456789912345678901234567891123456789",
                interactive=False,
            )
        with gr.Column(variant="compact", scale=2):
            predict_btn = gr.Button(
                value="Predict binding site",
                interactive=True,
                variant="primary",
            )
            plot_struc_btn = gr.Button(value="Plot ESMFold Predicted Structure", variant="primary")
    with gr.Row():
        with gr.Column(variant="compact", scale=5):
            output_text = gr.Textbox(
                lines=1,
                max_lines=12,
                label="Output",
                placeholder="Output",
            )
        with gr.Column(variant="compact", scale=5):
            finetune_button = gr.Button(
                value="Finetune Pre-trained Model",
                interactive=True,
                variant="primary",
            )

    with gr.Row():
        output_viewer = gr.HTML()
        output_file = gr.File(
            label="Download as Text File",
            file_count="single",
            type="filepath",
            interactive=False,
        )
    # Select a protein sample
    name.change(fn=suggest, inputs=name, outputs=input_seq)

    # "Predict binding site" actions
    predict_btn.click(
        fn=predict_bind,
        inputs=[base_model_name, PEFT_model_name, input_seq],
        outputs=[output_text],
    )

    # "Finetune Pre-trained Model" actions
    finetune_button.click(
        fn=train_function_no_sweeps,
        inputs=[base_model_name],
        outputs=[output_text],
    )

    # Plot protein structure
    plot_struc_btn.click(fn=plot_struc, inputs=input_seq, outputs=[output_file, output_viewer])

demo.launch()