Spaces: Running on CPU Upgrade
Andrew Stirn committed · 7932f13
Parent(s): 7233b48

cleanup off-target table
tiger.py CHANGED
@@ -9,9 +9,10 @@ from Bio import SeqIO
 
 # column names
 ID_COL = 'Transcript ID'
-SEQ_COL = 'Sequence'
+SEQ_COL = 'Transcript Sequence'
 TARGET_COL = 'Target Sequence'
 GUIDE_COL = 'Guide Sequence'
+MM_COL = 'Number of Mismatches'
 SCORE_COL = 'Guide Score'
 
 # nucleotide tokens
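For orientation, the renamed SEQ_COL and the new MM_COL are plain module-level constants, and the later hunks index the off-target table through them instead of hard-coded strings. A minimal sketch of that usage (not part of the commit; the toy DataFrame is invented for illustration):

import pandas as pd

SEQ_COL = 'Transcript Sequence'   # renamed from 'Sequence' in this commit
MM_COL = 'Number of Mismatches'   # new off-target table column

# toy table just to show the constants in use
off_target_table = pd.DataFrame({SEQ_COL: ['ACGTACGT'], MM_COL: [1]})
print(off_target_table.loc[0, MM_COL])  # -> 1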
@@ -224,27 +225,28 @@ def find_off_targets(top_guides: pd.DataFrame, status_bar, status_text):
 
         # log off-targets
         dict_off_targets = pd.DataFrame({
-            'On-target
-
-            'Off-target
-            '
-
-
+            'On-target ' + ID_COL: top_guides.iloc[loc_off_targets[:, 2]][ID_COL],
+            GUIDE_COL: top_guides.iloc[loc_off_targets[:, 2]][GUIDE_COL],
+            'Off-target ' + ID_COL: df_batch.index.values[loc_off_targets[:, 0]],
+            'Guide Midpoint': loc_off_targets[:, 1],
+            SEQ_COL: df_batch[SEQ_COL].values[loc_off_targets[:, 0]],
+            MM_COL: tf.gather_nd(num_mismatches, loc_off_targets).numpy().astype(int),
         }).to_dict('records')
 
         # trim transcripts to targets
         for row in dict_off_targets:
-            start_location = row['Midpoint'] - (GUIDE_LEN // 2)
+            start_location = row['Guide Midpoint'] - (GUIDE_LEN // 2)
+            del row['Guide Midpoint']
             if start_location < CONTEXT_5P:
-                row[
-                row[
-            elif start_location + GUIDE_LEN + CONTEXT_3P > len(row[
-                row[
-                row[
+                row[SEQ_COL] = row[SEQ_COL][0:GUIDE_LEN + CONTEXT_3P]
+                row[SEQ_COL] = 'N' * (TARGET_LEN - len(row[SEQ_COL])) + row[SEQ_COL]
+            elif start_location + GUIDE_LEN + CONTEXT_3P > len(row[SEQ_COL]):
+                row[SEQ_COL] = row[SEQ_COL][start_location - CONTEXT_5P:]
+                row[SEQ_COL] = row[SEQ_COL] + 'N' * (TARGET_LEN - len(row[SEQ_COL]))
             else:
-                row[
-            if row[
-                assert row[
+                row[SEQ_COL] = row[SEQ_COL][start_location - CONTEXT_5P:start_location + GUIDE_LEN + CONTEXT_3P]
+            if row[MM_COL] == 0 and 'N' not in row[SEQ_COL]:
+                assert row[GUIDE_COL] == sequence_complement([row[SEQ_COL][CONTEXT_5P:TARGET_LEN - CONTEXT_3P]])[0]
 
         # append new off-targets
         off_targets = pd.concat([off_targets, pd.DataFrame(dict_off_targets)])
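The trimming loop above clamps each off-target site to a fixed-length target window around the guide, padding with 'N' when the guide lies near either end of the transcript. A standalone sketch of that logic follows; the constant values are invented for illustration (tiger.py defines the real GUIDE_LEN, CONTEXT_5P, CONTEXT_3P and TARGET_LEN), and the relationship TARGET_LEN = CONTEXT_5P + GUIDE_LEN + CONTEXT_3P is an assumption inferred from the slicing above.

# illustrative values only; the real constants live elsewhere in tiger.py
GUIDE_LEN, CONTEXT_5P, CONTEXT_3P = 23, 3, 0
TARGET_LEN = CONTEXT_5P + GUIDE_LEN + CONTEXT_3P

def trim_to_target(seq: str, guide_midpoint: int) -> str:
    """Extract a TARGET_LEN window around the guide, 'N'-padded at the transcript edges."""
    start_location = guide_midpoint - (GUIDE_LEN // 2)
    if start_location < CONTEXT_5P:                            # guide too close to the 5' end
        window = seq[0:GUIDE_LEN + CONTEXT_3P]
        window = 'N' * (TARGET_LEN - len(window)) + window
    elif start_location + GUIDE_LEN + CONTEXT_3P > len(seq):   # guide too close to the 3' end
        window = seq[start_location - CONTEXT_5P:]
        window = window + 'N' * (TARGET_LEN - len(window))
    else:                                                      # fully internal guide
        window = seq[start_location - CONTEXT_5P:start_location + GUIDE_LEN + CONTEXT_3P]
    return window

print(trim_to_target('A' * 100, guide_midpoint=5))    # left-padded with 'N'
print(trim_to_target('A' * 100, guide_midpoint=50))   # full-length internal window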
@@ -265,14 +267,15 @@ def predict_off_target(off_targets: pd.DataFrame, model: tf.keras.Model):
     if len(off_targets) == 0:
         return pd.DataFrame()
 
-    #
+    # compute off-target predictions
     model_inputs = tf.concat([
-        tf.reshape(one_hot_encode_sequence(off_targets[
-        tf.reshape(one_hot_encode_sequence(off_targets[
+        tf.reshape(one_hot_encode_sequence(off_targets[SEQ_COL], add_context_padding=False), [len(off_targets), -1]),
+        tf.reshape(one_hot_encode_sequence(off_targets[GUIDE_COL], add_context_padding=True), [len(off_targets), -1]),
     ], axis=-1)
-
+    lfc_estimate = model.predict(model_inputs, batch_size=BATCH_SIZE_COMPUTE, verbose=False)
+    off_targets[SCORE_COL] = prediction_transform(tf.squeeze(lfc_estimate).numpy())
 
-    return off_targets.sort_values(SCORE_COL)
+    return off_targets.sort_values(SCORE_COL, ascending=False).reset_index(drop=True)
 
 
 def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool, status_bar=None, status_text=None):
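One behavioural change worth noting in this hunk: sort_values defaults to ascending order, so the old return put the lowest Guide Score first and kept the original index, while the new return lists the highest-scoring off-targets first with a fresh 0..n-1 index. A toy pandas illustration (the scores and index values are invented):

import pandas as pd

SCORE_COL = 'Guide Score'
off_targets = pd.DataFrame({SCORE_COL: [0.2, 0.9, 0.5]}, index=[7, 3, 5])

old_order = off_targets.sort_values(SCORE_COL)                                          # 0.2, 0.5, 0.9 with index 7, 5, 3
new_order = off_targets.sort_values(SCORE_COL, ascending=False).reset_index(drop=True)  # 0.9, 0.5, 0.2 with index 0, 1, 2
print(new_order)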
@@ -291,7 +294,7 @@ def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool,
     off_target_predictions = pd.DataFrame()
 
     if mode == 'all':
-
+        pass
 
     elif mode == 'titration': # TODO: and titration candidates
         on_target_predictions = top_guides_per_transcript(on_target_predictions)
@@ -301,7 +304,7 @@ def tiger_exhibit(transcripts: pd.DataFrame, mode: str, check_off_targets: bool,
 
     # check off-target effects for top guides
     if check_off_targets:
-        off_targets = find_off_targets(on_target_predictions,
+        off_targets = find_off_targets(on_target_predictions, status_bar, status_text)
         off_target_predictions = predict_off_target(off_targets, model=tiger)
 
     # reverse guide sequences
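Finally, a hedged call sketch for the updated entry point. The input column names follow the constants defined at the top of the file, but this diff does not confirm what tiger_exhibit expects or returns, so the input frame below and the discarded return value are assumptions for illustration only.

import pandas as pd
from tiger import tiger_exhibit   # assumes tiger.py is importable as a module

# hypothetical input: one toy transcript using the ID/sequence column names above
transcripts = pd.DataFrame({
    'Transcript ID': ['DEMO_TRANSCRIPT_1'],
    'Transcript Sequence': ['ACGT' * 50],
})

# with check_off_targets=True, the run goes through find_off_targets(..., status_bar, status_text)
# and predict_off_target(...) as shown in the hunks above
tiger_exhibit(transcripts, mode='all', check_off_targets=True)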