Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Andrew Stirn
committed on
Commit
·
e38af10
1
Parent(s):
a690e02
off-targets for gencode.v19.lncRNA_transcripts.fa.gz
Browse files
tiger.py
CHANGED
|
@@ -14,6 +14,12 @@ NUCLEOTIDE_COMPLEMENT = dict(zip(['A', 'C', 'G', 'T'], ['T', 'G', 'C', 'A']))
|
|
| 14 |
NUM_TOP_GUIDES = 10
|
| 15 |
NUM_MISMATCHES = 3
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
def sequence_complement(sequence: list):
|
| 19 |
return [''.join([NUCLEOTIDE_COMPLEMENT[nt] for nt in list(seq)]) for seq in sequence]
|
|
@@ -78,17 +84,22 @@ def predict_on_target(transcript_seq: str, model: tf.keras.Model):
|
|
| 78 |
return predictions
|
| 79 |
|
| 80 |
|
| 81 |
-
def find_off_targets(guides, batch_size=1000):
|
| 82 |
|
| 83 |
# load reference transcripts
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
| 87 |
df_transcripts.set_index('id', inplace=True)
|
|
|
|
| 88 |
|
| 89 |
# one-hot encode guides to form a filter
|
| 90 |
guide_filter = one_hot_encode_sequence(sequence_complement(guides), add_context_padding=False)
|
| 91 |
guide_filter = tf.transpose(guide_filter, [1, 2, 0])
|
|
|
|
| 92 |
|
| 93 |
# loop over transcripts in batches
|
| 94 |
i = 0
|
|
@@ -101,6 +112,7 @@ def find_off_targets(guides, batch_size=1000):
|
|
| 101 |
|
| 102 |
# find and log off-targets
|
| 103 |
transcripts = one_hot_encode_sequence(df_batch['seq'].values.tolist(), add_context_padding=False)
|
|
|
|
| 104 |
num_mismatches = GUIDE_LEN - tf.nn.conv1d(transcripts, guide_filter, stride=1, padding='SAME')
|
| 105 |
loc_off_targets = tf.where(tf.round(num_mismatches) <= NUM_MISMATCHES).numpy()
|
| 106 |
df_off_targets = pd.concat([df_off_targets, pd.DataFrame({
|
|
|
|
| 14 |
NUM_TOP_GUIDES = 10
|
| 15 |
NUM_MISMATCHES = 3
|
| 16 |
|
| 17 |
+
# configure GPUs
|
| 18 |
+
for gpu in tf.config.list_physical_devices('GPU'):
|
| 19 |
+
tf.config.experimental.set_memory_growth(gpu, enable=True)
|
| 20 |
+
if len(tf.config.list_physical_devices('GPU')) > 0:
|
| 21 |
+
tf.config.experimental.set_visible_devices(tf.config.list_physical_devices('GPU')[0], 'GPU')
|
| 22 |
+
|
| 23 |
|
| 24 |
def sequence_complement(sequence: list):
|
| 25 |
return [''.join([NUCLEOTIDE_COMPLEMENT[nt] for nt in list(seq)]) for seq in sequence]
|
|
|
|
| 84 |
return predictions
|
| 85 |
|
| 86 |
|
| 87 |
+
def find_off_targets(guides, batch_size=500):
|
| 88 |
|
| 89 |
# load reference transcripts
|
| 90 |
+
df_transcripts = pd.DataFrame()
|
| 91 |
+
for transcripts in ['gencode.v19.pc_transcripts.fa.gz', 'gencode.v19.lncRNA_transcripts.fa.gz']:
|
| 92 |
+
with gzip.open(os.path.join('transcripts', transcripts), 'rt') as file:
|
| 93 |
+
df = pd.DataFrame([(t.id, str(t.seq)) for t in SeqIO.parse(file, 'fasta')], columns=['id', 'seq'])
|
| 94 |
+
df_transcripts = pd.concat([df_transcripts, df])
|
| 95 |
+
df_transcripts['id'] = df_transcripts['id'].apply(lambda s: s.split('|')[0])
|
| 96 |
df_transcripts.set_index('id', inplace=True)
|
| 97 |
+
assert not df_transcripts.index.has_duplicates
|
| 98 |
|
| 99 |
# one-hot encode guides to form a filter
|
| 100 |
guide_filter = one_hot_encode_sequence(sequence_complement(guides), add_context_padding=False)
|
| 101 |
guide_filter = tf.transpose(guide_filter, [1, 2, 0])
|
| 102 |
+
guide_filter = tf.cast(guide_filter, tf.float16)
|
| 103 |
|
| 104 |
# loop over transcripts in batches
|
| 105 |
i = 0
|
|
|
|
| 112 |
|
| 113 |
# find and log off-targets
|
| 114 |
transcripts = one_hot_encode_sequence(df_batch['seq'].values.tolist(), add_context_padding=False)
|
| 115 |
+
transcripts = tf.cast(transcripts, guide_filter.dtype)
|
| 116 |
num_mismatches = GUIDE_LEN - tf.nn.conv1d(transcripts, guide_filter, stride=1, padding='SAME')
|
| 117 |
loc_off_targets = tf.where(tf.round(num_mismatches) <= NUM_MISMATCHES).numpy()
|
| 118 |
df_off_targets = pd.concat([df_off_targets, pd.DataFrame({
|