Andrew Stirn committed on
Commit
e38af10
·
1 Parent(s): a690e02

off-targets for gencode.v19.lncRNA_transcripts.fa.gz

Browse files
Files changed (1) hide show
  1. tiger.py +16 -4
tiger.py CHANGED
@@ -14,6 +14,12 @@ NUCLEOTIDE_COMPLEMENT = dict(zip(['A', 'C', 'G', 'T'], ['T', 'G', 'C', 'A']))
14
  NUM_TOP_GUIDES = 10
15
  NUM_MISMATCHES = 3
16
 
 
 
 
 
 
 
17
 
18
  def sequence_complement(sequence: list):
19
  return [''.join([NUCLEOTIDE_COMPLEMENT[nt] for nt in list(seq)]) for seq in sequence]
@@ -78,17 +84,22 @@ def predict_on_target(transcript_seq: str, model: tf.keras.Model):
78
  return predictions
79
 
80
 
81
- def find_off_targets(guides, batch_size=1000):
82
 
83
  # load reference transcripts
84
- with gzip.open(os.path.join('transcripts', 'gencode.v19.pc_transcripts.fa.gz'), 'rt') as file:
85
- df_transcripts = pd.DataFrame([(t.id, str(t.seq)) for t in SeqIO.parse(file, 'fasta')], columns=['id', 'seq'])
86
- df_transcripts['id'] = df_transcripts['id'].apply(lambda s: s.split('|')[4])
 
 
 
87
  df_transcripts.set_index('id', inplace=True)
 
88
 
89
  # one-hot encode guides to form a filter
90
  guide_filter = one_hot_encode_sequence(sequence_complement(guides), add_context_padding=False)
91
  guide_filter = tf.transpose(guide_filter, [1, 2, 0])
 
92
 
93
  # loop over transcripts in batches
94
  i = 0
@@ -101,6 +112,7 @@ def find_off_targets(guides, batch_size=1000):
101
 
102
  # find and log off-targets
103
  transcripts = one_hot_encode_sequence(df_batch['seq'].values.tolist(), add_context_padding=False)
 
104
  num_mismatches = GUIDE_LEN - tf.nn.conv1d(transcripts, guide_filter, stride=1, padding='SAME')
105
  loc_off_targets = tf.where(tf.round(num_mismatches) <= NUM_MISMATCHES).numpy()
106
  df_off_targets = pd.concat([df_off_targets, pd.DataFrame({
 
14
  NUM_TOP_GUIDES = 10
15
  NUM_MISMATCHES = 3
16
 
17
+ # configure GPUs
18
+ for gpu in tf.config.list_physical_devices('GPU'):
19
+ tf.config.experimental.set_memory_growth(gpu, enable=True)
20
+ if len(tf.config.list_physical_devices('GPU')) > 0:
21
+ tf.config.experimental.set_visible_devices(tf.config.list_physical_devices('GPU')[0], 'GPU')
22
+
23
 
24
  def sequence_complement(sequence: list):
25
  return [''.join([NUCLEOTIDE_COMPLEMENT[nt] for nt in list(seq)]) for seq in sequence]
 
84
  return predictions
85
 
86
 
87
+ def find_off_targets(guides, batch_size=500):
88
 
89
  # load reference transcripts
90
+ df_transcripts = pd.DataFrame()
91
+ for transcripts in ['gencode.v19.pc_transcripts.fa.gz', 'gencode.v19.lncRNA_transcripts.fa.gz']:
92
+ with gzip.open(os.path.join('transcripts', transcripts), 'rt') as file:
93
+ df = pd.DataFrame([(t.id, str(t.seq)) for t in SeqIO.parse(file, 'fasta')], columns=['id', 'seq'])
94
+ df_transcripts = pd.concat([df_transcripts, df])
95
+ df_transcripts['id'] = df_transcripts['id'].apply(lambda s: s.split('|')[0])
96
  df_transcripts.set_index('id', inplace=True)
97
+ assert not df_transcripts.index.has_duplicates
98
 
99
  # one-hot encode guides to form a filter
100
  guide_filter = one_hot_encode_sequence(sequence_complement(guides), add_context_padding=False)
101
  guide_filter = tf.transpose(guide_filter, [1, 2, 0])
102
+ guide_filter = tf.cast(guide_filter, tf.float16)
103
 
104
  # loop over transcripts in batches
105
  i = 0
 
112
 
113
  # find and log off-targets
114
  transcripts = one_hot_encode_sequence(df_batch['seq'].values.tolist(), add_context_padding=False)
115
+ transcripts = tf.cast(transcripts, guide_filter.dtype)
116
  num_mismatches = GUIDE_LEN - tf.nn.conv1d(transcripts, guide_filter, stride=1, padding='SAME')
117
  loc_off_targets = tf.where(tf.round(num_mismatches) <= NUM_MISMATCHES).numpy()
118
  df_off_targets = pd.concat([df_off_targets, pd.DataFrame({