Andrew Stirn committed on
Commit
7932f13
·
1 Parent(s): 7233b48

cleanup off-target table

Browse files
Files changed (1) hide show
  1. tiger.py +26 -23
tiger.py CHANGED
@@ -9,9 +9,10 @@ from Bio import SeqIO
9
 
10
  # column names
11
  ID_COL = 'Transcript ID'
12
- SEQ_COL = 'Sequence'
13
  TARGET_COL = 'Target Sequence'
14
  GUIDE_COL = 'Guide Sequence'
 
15
  SCORE_COL = 'Guide Score'
16
 
17
  # nucleotide tokens
@@ -224,27 +225,28 @@ def find_off_targets(top_guides: pd.DataFrame, status_bar, status_text):
224
 
225
  # log off-targets
226
  dict_off_targets = pd.DataFrame({
227
- 'On-target ID': top_guides.iloc[loc_off_targets[:, 2]]['On-target ID'],
228
- 'Guide': top_guides.iloc[loc_off_targets[:, 2]]['Guide'],
229
- 'Off-target ID': df_batch.index.values[loc_off_targets[:, 0]],
230
- 'Target': df_batch[SEQ_COL].values[loc_off_targets[:, 0]],
231
- 'Mismatches': tf.gather_nd(num_mismatches, loc_off_targets).numpy().astype(int),
232
- 'Midpoint': loc_off_targets[:, 1],
233
  }).to_dict('records')
234
 
235
  # trim transcripts to targets
236
  for row in dict_off_targets:
237
- start_location = row['Midpoint'] - (GUIDE_LEN // 2)
 
238
  if start_location < CONTEXT_5P:
239
- row['Target'] = row['Target'][0:GUIDE_LEN + CONTEXT_3P]
240
- row['Target'] = 'N' * (TARGET_LEN - len(row['Target'])) + row['Target']
241
- elif start_location + GUIDE_LEN + CONTEXT_3P > len(row['Target']):
242
- row['Target'] = row['Target'][start_location - CONTEXT_5P:]
243
- row['Target'] = row['Target'] + 'N' * (TARGET_LEN - len(row['Target']))
244
  else:
245
- row['Target'] = row['Target'][start_location - CONTEXT_5P:start_location + GUIDE_LEN + CONTEXT_3P]
246
- if row['Mismatches'] == 0 and 'N' not in row['Target']:
247
- assert row['Guide'] == sequence_complement([row['Target'][CONTEXT_5P:TARGET_LEN - CONTEXT_3P]])[0]
248
 
249
  # append new off-targets
250
  off_targets = pd.concat([off_targets, pd.DataFrame(dict_off_targets)])
@@ -265,14 +267,15 @@ def predict_off_target(off_targets: pd.DataFrame, model: tf.keras.Model):
265
  if len(off_targets) == 0:
266
  return pd.DataFrame()
267
 
268
- # append predictions off-target predictions
269
  model_inputs = tf.concat([
270
- tf.reshape(one_hot_encode_sequence(off_targets['Target'], add_context_padding=False), [len(off_targets), -1]),
271
- tf.reshape(one_hot_encode_sequence(off_targets['Guide'], add_context_padding=True), [len(off_targets), -1]),
272
  ], axis=-1)
273
- off_targets[SCORE_COL] = model.predict(model_inputs, batch_size=BATCH_SIZE_COMPUTE, verbose=False)
 
274
 
275
- return off_targets.sort_values(SCORE_COL)
276
 
277
 
278
  def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool, status_bar=None, status_text=None):
@@ -291,7 +294,7 @@ def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool,
291
  off_target_predictions = pd.DataFrame()
292
 
293
  if mode == 'all':
294
- return on_target_predictions, off_target_predictions
295
 
296
  elif mode == 'titration': # TODO: and titration candidates
297
  on_target_predictions = top_guides_per_transcript(on_target_predictions)
@@ -301,7 +304,7 @@ def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool,
301
 
302
  # check off-target effects for top guides
303
  if check_off_targets:
304
- off_targets = find_off_targets(on_target_predictions, status_bar, status_text)
305
  off_target_predictions = predict_off_target(off_targets, model=tiger)
306
 
307
  # reverse guide sequences
 
9
 
10
  # column names
11
  ID_COL = 'Transcript ID'
12
+ SEQ_COL = 'Transcript Sequence'
13
  TARGET_COL = 'Target Sequence'
14
  GUIDE_COL = 'Guide Sequence'
15
+ MM_COL = 'Number of Mismatches'
16
  SCORE_COL = 'Guide Score'
17
 
18
  # nucleotide tokens
 
225
 
226
  # log off-targets
227
  dict_off_targets = pd.DataFrame({
228
+ 'On-target ' + ID_COL: top_guides.iloc[loc_off_targets[:, 2]][ID_COL],
229
+ GUIDE_COL: top_guides.iloc[loc_off_targets[:, 2]][GUIDE_COL],
230
+ 'Off-target ' + ID_COL: df_batch.index.values[loc_off_targets[:, 0]],
231
+ 'Guide Midpoint': loc_off_targets[:, 1],
232
+ SEQ_COL: df_batch[SEQ_COL].values[loc_off_targets[:, 0]],
233
+ MM_COL: tf.gather_nd(num_mismatches, loc_off_targets).numpy().astype(int),
234
  }).to_dict('records')
235
 
236
  # trim transcripts to targets
237
  for row in dict_off_targets:
238
+ start_location = row['Guide Midpoint'] - (GUIDE_LEN // 2)
239
+ del row['Guide Midpoint']
240
  if start_location < CONTEXT_5P:
241
+ row[SEQ_COL] = row[SEQ_COL][0:GUIDE_LEN + CONTEXT_3P]
242
+ row[SEQ_COL] = 'N' * (TARGET_LEN - len(row[SEQ_COL])) + row[SEQ_COL]
243
+ elif start_location + GUIDE_LEN + CONTEXT_3P > len(row[SEQ_COL]):
244
+ row[SEQ_COL] = row[SEQ_COL][start_location - CONTEXT_5P:]
245
+ row[SEQ_COL] = row[SEQ_COL] + 'N' * (TARGET_LEN - len(row[SEQ_COL]))
246
  else:
247
+ row[SEQ_COL] = row[SEQ_COL][start_location - CONTEXT_5P:start_location + GUIDE_LEN + CONTEXT_3P]
248
+ if row[MM_COL] == 0 and 'N' not in row[SEQ_COL]:
249
+ assert row[GUIDE_COL] == sequence_complement([row[SEQ_COL][CONTEXT_5P:TARGET_LEN - CONTEXT_3P]])[0]
250
 
251
  # append new off-targets
252
  off_targets = pd.concat([off_targets, pd.DataFrame(dict_off_targets)])
 
267
  if len(off_targets) == 0:
268
  return pd.DataFrame()
269
 
270
+ # compute off-target predictions
271
  model_inputs = tf.concat([
272
+ tf.reshape(one_hot_encode_sequence(off_targets[SEQ_COL], add_context_padding=False), [len(off_targets), -1]),
273
+ tf.reshape(one_hot_encode_sequence(off_targets[GUIDE_COL], add_context_padding=True), [len(off_targets), -1]),
274
  ], axis=-1)
275
+ lfc_estimate = model.predict(model_inputs, batch_size=BATCH_SIZE_COMPUTE, verbose=False)
276
+ off_targets[SCORE_COL] = prediction_transform(tf.squeeze(lfc_estimate).numpy())
277
 
278
+ return off_targets.sort_values(SCORE_COL, ascending=False).reset_index(drop=True)
279
 
280
 
281
  def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool, status_bar=None, status_text=None):
 
294
  off_target_predictions = pd.DataFrame()
295
 
296
  if mode == 'all':
297
+ pass
298
 
299
  elif mode == 'titration': # TODO: and titration candidates
300
  on_target_predictions = top_guides_per_transcript(on_target_predictions)
 
304
 
305
  # check off-target effects for top guides
306
  if check_off_targets:
307
+ off_targets = find_off_targets(on_target_predictions, status_bar, status_text)
308
  off_target_predictions = predict_off_target(off_targets, model=tiger)
309
 
310
  # reverse guide sequences