Spaces:

Knowles-Lab
/

tiger

Running on CPU Upgrade

App Files Files Community

Andrew Stirn committed on Jul 5, 2023

Commit

7932f13

1 Parent(s): 7233b48

cleanup off-target table

Browse files

Files changed (1) hide show

tiger.py +26 -23

tiger.py CHANGED Viewed

@@ -9,9 +9,10 @@ from Bio import SeqIO
 # column names
 ID_COL = 'Transcript ID'
-SEQ_COL = 'Sequence'
 TARGET_COL = 'Target Sequence'
 GUIDE_COL = 'Guide Sequence'
 SCORE_COL = 'Guide Score'
 # nucleotide tokens
@@ -224,27 +225,28 @@ def find_off_targets(top_guides: pd.DataFrame, status_bar, status_text):
             # log off-targets
             dict_off_targets = pd.DataFrame({
-                'On-target ID': top_guides.iloc[loc_off_targets[:, 2]]['On-target ID'],
-                'Guide': top_guides.iloc[loc_off_targets[:, 2]]['Guide'],
-                'Off-target ID': df_batch.index.values[loc_off_targets[:, 0]],
-                'Target': df_batch[SEQ_COL].values[loc_off_targets[:, 0]],
-                'Mismatches': tf.gather_nd(num_mismatches, loc_off_targets).numpy().astype(int),
-                'Midpoint': loc_off_targets[:, 1],
             }).to_dict('records')
             # trim transcripts to targets
             for row in dict_off_targets:
-                start_location = row['Midpoint'] - (GUIDE_LEN // 2)
                 if start_location < CONTEXT_5P:
-                    row['Target'] = row['Target'][0:GUIDE_LEN + CONTEXT_3P]
-                    row['Target'] = 'N' * (TARGET_LEN - len(row['Target'])) + row['Target']
-                elif start_location + GUIDE_LEN + CONTEXT_3P > len(row['Target']):
-                    row['Target'] = row['Target'][start_location - CONTEXT_5P:]
-                    row['Target'] = row['Target'] + 'N' * (TARGET_LEN - len(row['Target']))
                 else:
-                    row['Target'] = row['Target'][start_location - CONTEXT_5P:start_location + GUIDE_LEN + CONTEXT_3P]
-                if row['Mismatches'] == 0 and 'N' not in row['Target']:
-                    assert row['Guide'] == sequence_complement([row['Target'][CONTEXT_5P:TARGET_LEN - CONTEXT_3P]])[0]
             # append new off-targets
             off_targets = pd.concat([off_targets, pd.DataFrame(dict_off_targets)])
@@ -265,14 +267,15 @@ def predict_off_target(off_targets: pd.DataFrame, model: tf.keras.Model):
     if len(off_targets) == 0:
         return pd.DataFrame()
-    # append predictions off-target predictions
     model_inputs = tf.concat([
-        tf.reshape(one_hot_encode_sequence(off_targets['Target'], add_context_padding=False), [len(off_targets), -1]),
-        tf.reshape(one_hot_encode_sequence(off_targets['Guide'], add_context_padding=True), [len(off_targets), -1]),
         ], axis=-1)
-    off_targets[SCORE_COL] = model.predict(model_inputs, batch_size=BATCH_SIZE_COMPUTE, verbose=False)
-    return off_targets.sort_values(SCORE_COL)
 def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool, status_bar=None, status_text=None):
@@ -291,7 +294,7 @@ def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool,
     off_target_predictions = pd.DataFrame()
     if mode == 'all':
-        return on_target_predictions, off_target_predictions
     elif mode == 'titration':  # TODO: and titration candidates
         on_target_predictions = top_guides_per_transcript(on_target_predictions)
@@ -301,7 +304,7 @@ def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool,
     # check off-target effects for top guides
     if check_off_targets:
-        off_targets = find_off_targets(on_target_predictions,  status_bar, status_text)
         off_target_predictions = predict_off_target(off_targets, model=tiger)
     # reverse guide sequences

 # column names
 ID_COL = 'Transcript ID'
+SEQ_COL = 'Transcript Sequence'
 TARGET_COL = 'Target Sequence'
 GUIDE_COL = 'Guide Sequence'
+MM_COL = 'Number of Mismatches'
 SCORE_COL = 'Guide Score'
 # nucleotide tokens
             # log off-targets
             dict_off_targets = pd.DataFrame({
+                'On-target ' + ID_COL: top_guides.iloc[loc_off_targets[:, 2]][ID_COL],
+                GUIDE_COL: top_guides.iloc[loc_off_targets[:, 2]][GUIDE_COL],
+                'Off-target ' + ID_COL: df_batch.index.values[loc_off_targets[:, 0]],
+                'Guide Midpoint': loc_off_targets[:, 1],
+                SEQ_COL: df_batch[SEQ_COL].values[loc_off_targets[:, 0]],
+                MM_COL: tf.gather_nd(num_mismatches, loc_off_targets).numpy().astype(int),
             }).to_dict('records')
             # trim transcripts to targets
             for row in dict_off_targets:
+                start_location = row['Guide Midpoint'] - (GUIDE_LEN // 2)
+                del row['Guide Midpoint']
                 if start_location < CONTEXT_5P:
+                    row[SEQ_COL] = row[SEQ_COL][0:GUIDE_LEN + CONTEXT_3P]
+                    row[SEQ_COL] = 'N' * (TARGET_LEN - len(row[SEQ_COL])) + row[SEQ_COL]
+                elif start_location + GUIDE_LEN + CONTEXT_3P > len(row[SEQ_COL]):
+                    row[SEQ_COL] = row[SEQ_COL][start_location - CONTEXT_5P:]
+                    row[SEQ_COL] = row[SEQ_COL] + 'N' * (TARGET_LEN - len(row[SEQ_COL]))
                 else:
+                    row[SEQ_COL] = row[SEQ_COL][start_location - CONTEXT_5P:start_location + GUIDE_LEN + CONTEXT_3P]
+                if row[MM_COL] == 0 and 'N' not in row[SEQ_COL]:
+                    assert row[GUIDE_COL] == sequence_complement([row[SEQ_COL][CONTEXT_5P:TARGET_LEN - CONTEXT_3P]])[0]
             # append new off-targets
             off_targets = pd.concat([off_targets, pd.DataFrame(dict_off_targets)])
     if len(off_targets) == 0:
         return pd.DataFrame()
+    # compute off-target predictions
     model_inputs = tf.concat([
+        tf.reshape(one_hot_encode_sequence(off_targets[SEQ_COL], add_context_padding=False), [len(off_targets), -1]),
+        tf.reshape(one_hot_encode_sequence(off_targets[GUIDE_COL], add_context_padding=True), [len(off_targets), -1]),
         ], axis=-1)
+    lfc_estimate = model.predict(model_inputs, batch_size=BATCH_SIZE_COMPUTE, verbose=False)
+    off_targets[SCORE_COL] = prediction_transform(tf.squeeze(lfc_estimate).numpy())
+    return off_targets.sort_values(SCORE_COL, ascending=False).reset_index(drop=True)
 def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool, status_bar=None, status_text=None):
     off_target_predictions = pd.DataFrame()
     if mode == 'all':
+        pass
     elif mode == 'titration':  # TODO: and titration candidates
         on_target_predictions = top_guides_per_transcript(on_target_predictions)
     # check off-target effects for top guides
     if check_off_targets:
+        off_targets = find_off_targets(on_target_predictions, status_bar, status_text)
         off_target_predictions = predict_off_target(off_targets, model=tiger)
     # reverse guide sequences