Andrew Stirn committed on
Commit
d78d0d1
·
1 Parent(s): 82425ea

load_transcripts function

Browse files
Files changed (1) hide show
  1. tiger.py +30 -11
tiger.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import gzip
3
  import numpy as np
@@ -13,6 +14,7 @@ NUCLEOTIDE_TOKENS = dict(zip(['A', 'C', 'G', 'T', 'N'], [0, 1, 2, 3, 255]))
13
  NUCLEOTIDE_COMPLEMENT = dict(zip(['A', 'C', 'G', 'T'], ['T', 'G', 'C', 'A']))
14
  NUM_TOP_GUIDES = 10
15
  NUM_MISMATCHES = 3
 
16
 
17
  # configure GPUs
18
  for gpu in tf.config.list_physical_devices('GPU'):
@@ -21,6 +23,30 @@ if len(tf.config.list_physical_devices('GPU')) > 0:
21
  tf.config.experimental.set_visible_devices(tf.config.list_physical_devices('GPU')[0], 'GPU')
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
def sequence_complement(sequence: list):
    """Return the nucleotide complement of every sequence in the list.

    :param sequence: list of nucleotide strings (characters must be keys of NUCLEOTIDE_COMPLEMENT)
    :return: list of complemented strings, in the same order
    """
    complements = []
    for seq in sequence:
        # map each nucleotide to its Watson-Crick complement
        complements.append(''.join(NUCLEOTIDE_COMPLEMENT[nt] for nt in seq))
    return complements
26
 
@@ -87,14 +113,7 @@ def predict_on_target(transcript_seq: str, model: tf.keras.Model):
87
  def find_off_targets(guides, batch_size=500):
88
 
89
  # load reference transcripts
90
- df_transcripts = pd.DataFrame()
91
- for transcripts in ['gencode.v19.pc_transcripts.fa.gz', 'gencode.v19.lncRNA_transcripts.fa.gz']:
92
- with gzip.open(os.path.join('transcripts', transcripts), 'rt') as file:
93
- df = pd.DataFrame([(t.id, str(t.seq)) for t in SeqIO.parse(file, 'fasta')], columns=['id', 'seq'])
94
- df_transcripts = pd.concat([df_transcripts, df])
95
- df_transcripts['id'] = df_transcripts['id'].apply(lambda s: s.split('|')[0])
96
- df_transcripts.set_index('id', inplace=True)
97
- assert not df_transcripts.index.has_duplicates
98
 
99
  # one-hot encode guides to form a filter
100
  guide_filter = one_hot_encode_sequence(sequence_complement(guides), add_context_padding=False)
@@ -105,9 +124,9 @@ def find_off_targets(guides, batch_size=500):
105
  i = 0
106
  print('Scanning for off-targets')
107
  df_off_targets = pd.DataFrame()
108
- while i < len(df_transcripts):
109
  # select batch
110
- df_batch = df_transcripts.iloc[i:min(i + batch_size, len(df_transcripts))]
111
  i += batch_size
112
 
113
  # find and log off-targets
@@ -124,7 +143,7 @@ def find_off_targets(guides, batch_size=500):
124
  })])
125
 
126
  # progress update
127
- print('\rPercent complete: {:.2f}%'.format(100 * min(i / len(df_transcripts), 1)), end='')
128
  print('')
129
 
130
  # trim transcripts to targets
 
1
+ import argparse
2
  import os
3
  import gzip
4
  import numpy as np
 
14
  NUCLEOTIDE_COMPLEMENT = dict(zip(['A', 'C', 'G', 'T'], ['T', 'G', 'C', 'A']))
15
  NUM_TOP_GUIDES = 10
16
  NUM_MISMATCHES = 3
17
+ REFERENCE_TRANSCRIPTS = ('gencode.v19.pc_transcripts.fa.gz', 'gencode.v19.lncRNA_transcripts.fa.gz')
18
 
19
  # configure GPUs
20
  for gpu in tf.config.list_physical_devices('GPU'):
 
23
  tf.config.experimental.set_visible_devices(tf.config.list_physical_devices('GPU')[0], 'GPU')
24
 
25
 
26
def load_transcripts(fasta_files):
    """Load transcript sequences from FASTA files into a DataFrame indexed by transcript ID.

    :param fasta_files: iterable of FASTA file paths; files ending in '.gz' are
        opened with gzip, all others as plain text
    :return: DataFrame with index 'id' (text before the first '|' in the record id,
        matching gencode's '|'-delimited headers) and column 'seq' (sequence string)
    """
    # load all transcripts from fasta files into a DataFrame
    transcripts = pd.DataFrame()
    for file in fasta_files:
        # pick the opener by extension; both support text-mode ('rt') reading.
        # BUG FIX: the original only opened gzip files and referenced an
        # undefined handle `f` in the non-gzip branch (NameError); a plain
        # FASTA file was never opened at all.
        opener = gzip.open if os.path.splitext(file)[1] == '.gz' else open
        try:
            with opener(file, 'rt') as f:
                df = pd.DataFrame([(t.id, str(t.seq)) for t in SeqIO.parse(f, 'fasta')], columns=['id', 'seq'])
        except Exception as e:
            # best-effort loading: report the failure and skip this file
            print(e, 'while loading', file)
            continue
        transcripts = pd.concat([transcripts, df])

    # set index (gencode record ids are '|'-delimited; keep only the transcript id).
    # NOTE(review): if every file failed to load, `transcripts` has no 'id'
    # column and this raises KeyError — same failure mode as the original.
    transcripts['id'] = transcripts['id'].apply(lambda s: s.split('|')[0])
    transcripts.set_index('id', inplace=True)
    assert not transcripts.index.has_duplicates, 'duplicate transcript ids in reference files'

    return transcripts
48
+
49
+
50
  def sequence_complement(sequence: list):
51
  return [''.join([NUCLEOTIDE_COMPLEMENT[nt] for nt in list(seq)]) for seq in sequence]
52
 
 
113
  def find_off_targets(guides, batch_size=500):
114
 
115
  # load reference transcripts
116
+ reference_transcripts = load_transcripts([os.path.join('transcripts', f) for f in REFERENCE_TRANSCRIPTS])
 
 
 
 
 
 
 
117
 
118
  # one-hot encode guides to form a filter
119
  guide_filter = one_hot_encode_sequence(sequence_complement(guides), add_context_padding=False)
 
124
  i = 0
125
  print('Scanning for off-targets')
126
  df_off_targets = pd.DataFrame()
127
+ while i < len(reference_transcripts):
128
  # select batch
129
+ df_batch = reference_transcripts.iloc[i:min(i + batch_size, len(reference_transcripts))]
130
  i += batch_size
131
 
132
  # find and log off-targets
 
143
  })])
144
 
145
  # progress update
146
+ print('\rPercent complete: {:.2f}%'.format(100 * min(i / len(reference_transcripts), 1)), end='')
147
  print('')
148
 
149
  # trim transcripts to targets