Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Andrew Stirn
committed on
Commit
·
1ef81e0
1
Parent(s):
eac7d3f
off target scanning
Browse files
tiger.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
-
import
|
| 3 |
import pandas as pd
|
|
|
|
| 4 |
from Bio import SeqIO
|
| 5 |
|
| 6 |
GUIDE_LEN = 23
|
|
@@ -78,13 +79,68 @@ def tiger_predict(transcript_seq: str):
|
|
| 78 |
# get predictions
|
| 79 |
normalized_lfc = tiger.predict_step(model_inputs)
|
| 80 |
predictions = pd.DataFrame({'Guide': guide_seq, 'Normalized LFC': tf.squeeze(normalized_lfc).numpy()})
|
|
|
|
| 81 |
|
| 82 |
return predictions
|
| 83 |
|
| 84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
if __name__ == '__main__':
|
| 86 |
|
| 87 |
# simple test case
|
| 88 |
-
transcript_sequence = '
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
import numpy as np
|
| 3 |
import pandas as pd
|
| 4 |
+
import tensorflow as tf
|
| 5 |
from Bio import SeqIO
|
| 6 |
|
| 7 |
GUIDE_LEN = 23
|
|
|
|
| 79 |
# get predictions
|
| 80 |
normalized_lfc = tiger.predict_step(model_inputs)
|
| 81 |
predictions = pd.DataFrame({'Guide': guide_seq, 'Normalized LFC': tf.squeeze(normalized_lfc).numpy()})
|
| 82 |
+
predictions = predictions.set_index('Guide').sort_values('Normalized LFC')
|
| 83 |
|
| 84 |
return predictions
|
| 85 |
|
| 86 |
|
| 87 |
+
def find_off_targets(guides, batch_size=1000):
    """Scan the transcripts in 'gencode.v19.pc_transcripts.fa' for near matches to each guide.

    Every transcript is one-hot encoded and convolved with a filter built from the
    complemented guides; positions whose mismatch count is at most NUM_MISMATCHES
    are reported as off-targets.

    Args:
        guides: sequence of guide strings (it is passed to `sequence_complement`
            and indexed via `np.array(guides)`).
        batch_size: number of transcripts encoded and scanned per convolution call.

    Returns:
        A pandas DataFrame with one row per hit and the columns 'Guide', 'Isoform',
        'Mismatches', 'Midpoint' and 'Target'. 'Target' is trimmed to a window of
        TARGET_LEN characters around the hit. When nothing is found the frame is
        empty but still carries these columns.

    Raises:
        AssertionError: if a zero-mismatch hit does not reproduce its guide after trimming.
    """
    columns = ['Guide', 'Isoform', 'Mismatches', 'Midpoint', 'Target']

    # load all transcripts; the fifth '|'-delimited field of the FASTA id becomes the index
    # (presumably the transcript/isoform name -- confirm against the FASTA header layout)
    with open('gencode.v19.pc_transcripts.fa', 'r') as file:
        df_transcripts = pd.DataFrame([(t.id, str(t.seq)) for t in SeqIO.parse(file, 'fasta')], columns=['id', 'seq'])
    df_transcripts['id'] = df_transcripts['id'].apply(lambda s: s.split('|')[4])
    df_transcripts.set_index('id', inplace=True)
    num_transcripts = len(df_transcripts)

    # one-hot encode guides to form a filter; the transpose moves the guide axis last so that
    # each guide acts as one output channel of the convolution
    guide_filter = one_hot_encode_sequence(sequence_complement(guides), add_context_padding=False)
    guide_filter = tf.transpose(guide_filter, [1, 2, 0])

    # built once: used for fancy-indexing guide strings by output channel in every batch
    guide_array = np.array(guides)

    # loop over transcripts in batches, collecting per-batch hits and concatenating once at the end
    # (concatenating onto a growing frame inside the loop re-copies all earlier rows each time)
    print('Scanning for off-targets')
    batch_hits = []
    for start in range(0, num_transcripts, batch_size):
        # select batch (iloc clips a stop index that runs past the end)
        df_batch = df_transcripts.iloc[start:start + batch_size]

        # find and log off-targets
        transcripts = one_hot_encode_sequence(df_batch['seq'].values.tolist(), add_context_padding=False)
        # assumes the convolution output counts matching positions per window -- TODO confirm encoding
        num_mismatches = GUIDE_LEN - tf.nn.conv1d(transcripts, guide_filter, stride=1, padding='SAME')
        # rows of loc_off_targets are (transcript index in batch, position, guide index)
        loc_off_targets = tf.where(num_mismatches <= NUM_MISMATCHES).numpy()
        if len(loc_off_targets) > 0:
            batch_hits.append(pd.DataFrame({
                'Guide': guide_array[loc_off_targets[:, 2]],
                'Isoform': df_batch.index.values[loc_off_targets[:, 0]],
                'Mismatches': tf.gather_nd(num_mismatches, loc_off_targets).numpy().astype(int),
                'Midpoint': loc_off_targets[:, 1],
                'Target': df_batch['seq'].values[loc_off_targets[:, 0]],
            }))

        # progress update
        print('\rPercent complete: {:.2f}%'.format(100 * min((start + batch_size) / num_transcripts, 1)), end='')
    print('')

    # nothing found (or no transcripts at all): return an empty frame that still has the columns
    if not batch_hits:
        return pd.DataFrame(columns=columns)

    # trim transcripts to targets
    dict_off_targets = pd.concat(batch_hits, ignore_index=True).to_dict('records')
    for row in dict_off_targets:
        # NOTE(review): start_location can be negative for hits close to the start of a transcript
        # (SAME padding and/or CONTEXT_5P); a negative slice start wraps around in Python --
        # confirm whether such hits should be clipped or dropped.
        start_location = row['Midpoint'] - (GUIDE_LEN // 2) - CONTEXT_5P
        row['Target'] = row['Target'][start_location:start_location + TARGET_LEN]
        if row['Mismatches'] == 0:
            # sanity check: a perfect hit, stripped of its context, must complement back to the guide
            assert row['Guide'] == sequence_complement([row['Target'][CONTEXT_5P:TARGET_LEN-CONTEXT_3P]])[0]

    return pd.DataFrame(dict_off_targets, columns=columns)
|
| 132 |
+
|
| 133 |
+
|
| 134 |
if __name__ == '__main__':

    # smoke test: predict on a short, lower-cased stretch of a known transcript
    transcript_sequence = 'ATGCAGGACGCGGAGAACGTGGCGGTGCCCGAGGCGGCCGAGGAGCGCGC'.lower()  # first 50 from EIF3B-003's CDS
    all_predictions = tiger_predict(transcript_sequence)

    # keep and display only the leading NUM_TOP_GUIDES rows of the prediction table
    sorted_predictions = all_predictions.iloc[:NUM_TOP_GUIDES]
    print(sorted_predictions)

    # scan for off-targets of those guides; the table's index values are passed as the guide list
    top_guides = sorted_predictions.index.values.tolist()
    off_targets = find_off_targets(top_guides)
    print(off_targets)
|