Update app.py
app.py CHANGED

```diff
@@ -98,13 +98,49 @@ def train_function_no_sweeps(base_model_path): #, train_dataset, test_dataset)
         # Add other hyperparameters as needed
     }
     # The base model you will train a LoRA on top of
-    base_model_path = "facebook/esm2_t12_35M_UR50D"
+    #base_model_path = "facebook/esm2_t12_35M_UR50D"
 
     # Define labels and model
     id2label = {0: "No binding site", 1: "Binding site"}
     label2id = {v: k for k, v in id2label.items()}
     base_model = AutoModelForTokenClassification.from_pretrained(base_model_path, num_labels=len(id2label), id2label=id2label, label2id=label2id)
 
+
+    # Load the data from pickle files (replace with your local paths)
+    with open("./datasets/train_sequences_chunked_by_family.pkl", "rb") as f:
+        train_sequences = pickle.load(f)
+
+    with open("./datasets/test_sequences_chunked_by_family.pkl", "rb") as f:
+        test_sequences = pickle.load(f)
+
+    with open("./datasets/train_labels_chunked_by_family.pkl", "rb") as f:
+        train_labels = pickle.load(f)
+
+    with open("./datasets/test_labels_chunked_by_family.pkl", "rb") as f:
+        test_labels = pickle.load(f)
+
+    # Tokenization
+    tokenizer = AutoTokenizer.from_pretrained(base_model_path) #("facebook/esm2_t12_35M_UR50D")
+    max_sequence_length = 1000
+
+    train_tokenized = tokenizer(train_sequences, padding=True, truncation=True, max_length=max_sequence_length, return_tensors="pt", is_split_into_words=False)
+    test_tokenized = tokenizer(test_sequences, padding=True, truncation=True, max_length=max_sequence_length, return_tensors="pt", is_split_into_words=False)
+
+    # Directly truncate the entire list of labels
+    train_labels = truncate_labels(train_labels, max_sequence_length)
+    test_labels = truncate_labels(test_labels, max_sequence_length)
+
+    train_dataset = Dataset.from_dict({k: v for k, v in train_tokenized.items()}).add_column("labels", train_labels)
+    test_dataset = Dataset.from_dict({k: v for k, v in test_tokenized.items()}).add_column("labels", test_labels)
+
+
+    # Compute Class Weights
+    classes = [0, 1]
+    flat_train_labels = [label for sublist in train_labels for label in sublist]
+    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=flat_train_labels)
+    accelerator = Accelerator()
+    class_weights = torch.tensor(class_weights, dtype=torch.float32).to(accelerator.device)
+
     # Convert the model into a PeftModel
     peft_config = LoraConfig(
         task_type=TaskType.TOKEN_CLS,
```
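The hunk above calls truncate_labels, which is not defined anywhere in this diff. Judging from how it is used, clipping each per-residue label list to the tokenizer's max_length before the labels are attached to the datasets, a minimal sketch could be:

```python
# Hypothetical sketch of the truncate_labels helper called in the hunk above;
# the app's actual implementation is not shown in this diff.
def truncate_labels(labels, max_length):
    """Clip each per-residue label list to at most max_length entries."""
    return [label[:max_length] for label in labels]
```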
```diff
@@ -178,6 +214,7 @@ MODEL_OPTIONS = [
     "facebook/esm2_t33_650M_UR50D",
 ] # models users can choose from
 
+'''
 # Load the data from pickle files (replace with your local paths)
 with open("./datasets/train_sequences_chunked_by_family.pkl", "rb") as f:
     train_sequences = pickle.load(f)
```
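This hunk opens a ''' string right before the module-level copy of the data-loading code, disabling it now that the first hunk has moved that work inside train_function_no_sweeps. The function is therefore self-contained and can be invoked with any entry from MODEL_OPTIONS. A hypothetical Gradio wiring (the Blocks layout and component names here are assumptions, not the Space's actual UI):

```python
import gradio as gr

# Hypothetical UI wiring illustrating why train_function_no_sweeps now takes
# base_model_path as its argument; the Space's real layout is not in this diff.
with gr.Blocks() as demo:
    model_choice = gr.Dropdown(choices=MODEL_OPTIONS, label="Base model")
    train_button = gr.Button("Train LoRA")
    status = gr.Textbox(label="Status")
    train_button.click(fn=train_function_no_sweeps, inputs=model_choice, outputs=status)

demo.launch()
```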
```diff
@@ -213,7 +250,6 @@ class_weights = compute_class_weight(class_weight='balanced', classes=classes, y
 accelerator = Accelerator()
 class_weights = torch.tensor(class_weights, dtype=torch.float32).to(accelerator.device)
 
-'''
 # inference
 # Path to the saved LoRA model
 model_path = "AmelieSchreiber/esm2_t12_35M_lora_binding_sites_v2_cp3"
```
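No consumer of the class_weights tensor is visible in the diff. For imbalanced token-classification labels like binding sites, the usual pattern, and presumably what the training loop here does, is a Trainer subclass with a weighted cross-entropy loss; a sketch under that assumption:

```python
import torch
from transformers import Trainer

# Sketch of one common way class_weights is consumed; the app's actual
# training loop is not visible in this diff.
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Weighted cross-entropy; ignore_index=-100 skips padding positions.
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)
        loss = loss_fct(logits.view(-1, logits.shape[-1]), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
```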
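The last hunk keeps the path to the saved LoRA checkpoint for inference. Loading the adapter back on top of the base model follows the standard peft pattern; a sketch (paths and labels taken from the diff, the input sequence is made up for illustration):

```python
import torch
from peft import PeftModel
from transformers import AutoModelForTokenClassification, AutoTokenizer

base_model_path = "facebook/esm2_t12_35M_UR50D"
model_path = "AmelieSchreiber/esm2_t12_35M_lora_binding_sites_v2_cp3"

id2label = {0: "No binding site", 1: "Binding site"}
label2id = {v: k for k, v in id2label.items()}

# Rebuild the base token-classification head, then attach the LoRA adapter.
base_model = AutoModelForTokenClassification.from_pretrained(
    base_model_path, num_labels=len(id2label), id2label=id2label, label2id=label2id
)
model = PeftModel.from_pretrained(base_model, model_path)
tokenizer = AutoTokenizer.from_pretrained(base_model_path)

sequence = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"  # made-up example sequence
inputs = tokenizer(sequence, return_tensors="pt", truncation=True, max_length=1000)
with torch.no_grad():
    logits = model(**inputs).logits
# Per-residue predictions: 0 = "No binding site", 1 = "Binding site"
predictions = logits.argmax(dim=-1).squeeze().tolist()
```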