Mercurial > repos > galaxyp > translate_bed_sequences
comparison translate_bed_sequences.py @ 2:4221664a2bd0 draft default tip
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/translate_bed_sequences commit 2a470e2c775a7427aa530e058510e4dc7b6d8e80"
author | galaxyp |
---|---|
date | Tue, 07 Apr 2020 11:45:53 -0400 |
parents | d723eb657f1d |
children |
comparison
equal
deleted
inserted
replaced
1:6bbce76c78c1 | 2:4221664a2bd0 |
---|---|
8 # Author: | 8 # Author: |
9 # | 9 # |
10 # James E Johnson | 10 # James E Johnson |
11 # | 11 # |
12 #------------------------------------------------------------------------------ | 12 #------------------------------------------------------------------------------ |
13 """ | 13 |
14 | |
15 """ | |
16 Input: BED file (12 column) + 13th sequence column appended by extract_genomic_dna | 14 Input: BED file (12 column) + 13th sequence column appended by extract_genomic_dna |
17 Output: Fasta of 3-frame translations of the spliced sequence | 15 Output: Fasta of 3-frame translations of the spliced sequence |
18 | |
19 """ | 16 """ |
20 | 17 |
21 import sys,re,os.path | 18 import optparse |
19 import os.path | |
20 import sys | |
22 import tempfile | 21 import tempfile |
23 import optparse | 22 |
24 from optparse import OptionParser | 23 from Bio.Seq import ( |
25 from Bio.Seq import reverse_complement, transcribe, back_transcribe, translate | 24 reverse_complement, |
26 | 25 translate |
27 class BedEntry( object ): | 26 ) |
28 def __init__(self, line): | 27 |
29 self.line = line | 28 |
29 class BedEntry(object): | |
30 def __init__(self, line): | |
31 self.line = line | |
32 try: | |
33 fields = line.rstrip('\r\n').split('\t') | |
34 (chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts) = fields[0:12] | |
35 seq = fields[12] if len(fields) > 12 else None | |
36 self.chrom = chrom | |
37 self.chromStart = int(chromStart) | |
38 self.chromEnd = int(chromEnd) | |
39 self.name = name | |
40 self.score = int(score) | |
41 self.strand = strand | |
42 self.thickStart = int(thickStart) | |
43 self.thickEnd = int(thickEnd) | |
44 self.itemRgb = itemRgb | |
45 self.blockCount = int(blockCount) | |
46 self.blockSizes = [int(x) for x in blockSizes.split(',')] | |
47 self.blockStarts = [int(x) for x in blockStarts.split(',')] | |
48 self.seq = seq | |
49 except Exception as e: | |
50 sys.stderr.write("Unable to read Bed entry %s\n" % e) | |
51 exit(1) | |
52 | |
53 def __str__(self): | |
54 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s%s' % ( | |
55 self.chrom, self.chromStart, self.chromEnd, self.name, self.score, self.strand, self.thickStart, self.thickEnd, self.itemRgb, self.blockCount, | |
56 ','.join([str(x) for x in self.blockSizes]), | |
57 ','.join([str(x) for x in self.blockStarts]), | |
58 '\t%s' % self.seq if self.seq else '') | |
59 | |
60 def get_splice_junctions(self): | |
61 splice_juncs = [] | |
62 for i in range(self.blockCount - 1): | |
63 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i + 1]) | |
64 splice_juncs.append(splice_junc) | |
65 return splice_juncs | |
66 | |
67 def get_exon_seqs(self): | |
68 exons = [] | |
69 for i in range(self.blockCount): | |
70 # splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) | |
71 exons.append(self.seq[self.blockStarts[i]:self.blockStarts[i] + self.blockSizes[i]]) | |
72 if self.strand == '-': # reverse complement | |
73 exons.reverse() | |
74 for i, s in enumerate(exons): | |
75 exons[i] = reverse_complement(s) | |
76 return exons | |
77 | |
78 def get_spliced_seq(self): | |
79 seq = ''.join(self.get_exon_seqs()) | |
80 return seq | |
81 | |
82 def get_translation(self, sequence=None): | |
83 translation = None | |
84 seq = sequence if sequence else self.get_spliced_seq() | |
85 if seq: | |
86 seqlen = int(len(seq) / 3) * 3 | |
87 if seqlen >= 3: | |
88 translation = translate(seq[:seqlen]) | |
89 return translation | |
90 | |
91 def get_translations(self): | |
92 translations = [] | |
93 seq = self.get_spliced_seq() | |
94 if seq: | |
95 for i in range(3): | |
96 translation = self.get_translation(sequence=seq[i:]) | |
97 if translation: | |
98 translations.append(translation) | |
99 return translations | |
100 | |
101 def get_subrange(self, tstart, tstop): | |
102 """ | |
103 (start, end) | |
104 """ | |
105 chromStart = self.chromStart | |
106 chromEnd = self.chromEnd | |
107 r = range(self.blockCount) | |
108 if self.strand == '-': | |
109 r = list(r) | |
110 r.reverse() | |
111 bStart = 0 | |
112 for x in r: | |
113 bEnd = bStart + self.blockSizes[x] | |
114 if bStart <= tstart < bEnd: | |
115 if self.strand == '+': | |
116 chromStart = self.chromStart + self.blockStarts[x] + (tstart - bStart) | |
117 else: | |
118 chromEnd = self.chromStart + self.blockStarts[x] + (tstart - bStart) | |
119 if bStart <= tstop < bEnd: | |
120 if self.strand == '+': | |
121 chromEnd = self.chromStart + self.blockStarts[x] + (tstop - bStart) | |
122 else: | |
123 chromStart = self.chromStart + self.blockStarts[x] + self.blockSizes[x] - (tstop - bStart) | |
124 bStart += self.blockSizes[x] | |
125 return(chromStart, chromEnd) | |
126 | |
127 def get_blocks(self, chromStart, chromEnd): | |
128 """ | |
129 get the blocks for sub range | |
130 """ | |
131 tblockCount = 0 | |
132 tblockSizes = [] | |
133 tblockStarts = [] | |
134 for x in range(self.blockCount): | |
135 bStart = self.chromStart + self.blockStarts[x] | |
136 bEnd = bStart + self.blockSizes[x] | |
137 if bStart > chromEnd: | |
138 break | |
139 if bEnd < chromStart: | |
140 continue | |
141 cStart = max(chromStart, bStart) | |
142 tblockStarts.append(cStart - chromStart) | |
143 tblockSizes.append(min(chromEnd, bEnd) - cStart) | |
144 tblockCount += 1 | |
145 # print >> sys.stderr, "tblockCount: %d tblockStarts: %s tblockSizes: %s" % (tblockCount, tblockStarts, tblockSizes) | |
146 return (tblockCount, tblockSizes, tblockStarts) | |
147 | |
148 def get_filterd_translations(self, untrimmed=False, filtering=True, ignore_left_bp=0, ignore_right_bp=0, debug=False): | |
149 """ | |
150 [(start, end, seq, blockCount, blockSizes, blockStarts), (start, end, seq, blockCount, blockSizes, blockStarts), (start, end, seq, blockCount, blockSizes, blockStarts)] | |
151 filter: ignore translation if stop codon in first exon after ignore_left_bp | |
152 """ | |
153 translations = [None, None, None, None, None, None] | |
154 seq = self.get_spliced_seq() | |
155 ignore = int((ignore_left_bp if self.strand == '+' else ignore_right_bp) / 3) | |
156 block_sum = sum(self.blockSizes) | |
157 exon_sizes = [x for x in self.blockSizes] | |
158 if self.strand == '-': | |
159 exon_sizes.reverse() | |
160 splice_sites = [int(sum(exon_sizes[:x]) / 3) for x in range(1, len(exon_sizes))] | |
161 if debug: | |
162 sys.stderr.write("splice_sites: %s\n" % splice_sites) | |
163 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] | |
164 if seq: | |
165 for i in range(3): | |
166 translation = self.get_translation(sequence=seq[i:]) | |
167 if translation: | |
168 tstart = 0 | |
169 tstop = len(translation) | |
170 offset = (block_sum - i) % 3 | |
171 if debug: | |
172 sys.stderr.write("frame: %d\ttstart: %d tstop: %d offset: %d\t%s\n" % (i, tstart, tstop, offset, translation)) | |
173 if not untrimmed: | |
174 tstart = translation.rfind('*', 0, junc) + 1 | |
175 stop = translation.find('*', junc) | |
176 tstop = stop if stop >= 0 else len(translation) | |
177 offset = (block_sum - i) % 3 | |
178 trimmed = translation[tstart:tstop] | |
179 if debug: | |
180 sys.stderr.write("frame: %d\ttstart: %d tstop: %d offset: %d\t%s\n" % (i, tstart, tstop, offset, trimmed)) | |
181 if filtering and tstart > ignore: | |
182 continue | |
183 # get genomic locations for start and end | |
184 if self.strand == '+': | |
185 chromStart = self.chromStart + i + (tstart * 3) | |
186 chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3 | |
187 else: | |
188 chromStart = self.chromStart + offset + (len(translation) - tstop) * 3 | |
189 chromEnd = self.chromEnd - i - (tstart * 3) | |
190 # get the blocks for this translation | |
191 (tblockCount, tblockSizes, tblockStarts) = self.get_blocks(chromStart, chromEnd) | |
192 translations[i] = (chromStart, chromEnd, trimmed, tblockCount, tblockSizes, tblockStarts) | |
193 if debug: | |
194 sys.stderr.write("tblockCount: %d tblockStarts: %s tblockSizes: %s\n" % (tblockCount, tblockStarts, tblockSizes)) | |
195 # translations[i] = (chromStart, chromEnd, trimmed, tblockCount, tblockSizes, tblockStarts) | |
196 return translations | |
197 | |
198 def get_seq_id(self, seqtype='unk:unk', reference='', frame=None): | |
199 """ | |
200 # Ensembl fasta ID format | |
201 >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT | |
202 >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding | |
203 """ | |
204 frame_name = '' | |
205 chromStart = self.chromStart | |
206 chromEnd = self.chromEnd | |
207 strand = 1 if self.strand == '+' else -1 | |
208 if frame is not None: | |
209 block_sum = sum(self.blockSizes) | |
210 offset = (block_sum - frame) % 3 | |
211 frame_name = '_' + str(frame + 1) | |
212 if self.strand == '+': | |
213 chromStart += frame | |
214 chromEnd -= offset | |
215 else: | |
216 chromStart += offset | |
217 chromEnd -= frame | |
218 location = "chromosome:%s:%s:%s:%s:%s" % (reference, self.chrom, chromStart, chromEnd, strand) | |
219 seq_id = "%s%s %s %s" % (self.name, frame_name, seqtype, location) | |
220 return seq_id | |
221 | |
222 def get_line(self, start_offset=0, end_offset=0): | |
223 if start_offset or end_offset: | |
224 s_offset = start_offset if start_offset else 0 | |
225 e_offset = end_offset if end_offset else 0 | |
226 if s_offset > self.chromStart: | |
227 s_offset = self.chromStart | |
228 chrStart = self.chromStart - s_offset | |
229 chrEnd = self.chromEnd + e_offset | |
230 blkSizes = self.blockSizes | |
231 blkSizes[0] += s_offset | |
232 blkSizes[-1] += e_offset | |
233 blkStarts = self.blockStarts | |
234 for i in range(1, self.blockCount): | |
235 blkStarts[i] += s_offset | |
236 items = [str(x) for x in [self.chrom, chrStart, chrEnd, self.name, self.score, self.strand, self.thickStart, self.thickEnd, self.itemRgb, self.blockCount, ','.join([str(x) for x in blkSizes]), ','.join([str(x) for x in blkStarts])]] | |
237 return '\t'.join(items) + '\n' | |
238 return self.line | |
239 | |
240 | |
241 def __main__(): | |
242 # Parse Command Line | |
243 parser = optparse.OptionParser() | |
244 parser.add_option('-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added') | |
245 parser.add_option('-o', '--output', dest='output', help='Translations of spliced sequence') | |
246 parser.add_option('-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta') | |
247 parser.add_option('-D', '--fa_db', dest='fa_db', default=None, help='Prefix DB identifier for fasta ID line, e.g. generic') | |
248 parser.add_option('-s', '--fa_sep', dest='fa_sep', default='|', help='fasta ID separator defaults to pipe char, e.g. generic|ProtID|description') | |
249 parser.add_option('-B', '--bed', dest='bed', default=None, help='Output a bed file with added 13th column having translation') | |
250 parser.add_option('-G', '--gff3', dest='gff', default=None, help='Output translations to a GFF3 file') | |
251 parser.add_option('-S', '--seqtype', dest='seqtype', default='pep:splice', help='SEQTYPE:STATUS for fasta ID line') | |
252 parser.add_option('-P', '--id_prefix', dest='id_prefix', default='', help='prefix for the sequence ID') | |
253 parser.add_option('-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ') | |
254 parser.add_option('-r', '--refsource', dest='refsource', default=None, help='Source for Genome Reference, e.g. Ensembl, UCSC, or NCBI') | |
255 parser.add_option('-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ') | |
256 parser.add_option('-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering') | |
257 parser.add_option('-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering') | |
258 parser.add_option('-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon') | |
259 parser.add_option('-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon') | |
260 parser.add_option('-L', '--min_length', dest='min_length', type='int', default=None, help='Minimun length (to first stop codon)') | |
261 parser.add_option('-M', '--max_stop_codons', dest='max_stop_codons', type='int', default=None, help='Filter out translations with more than max_stop_codons') | |
262 parser.add_option('-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout') | |
263 (options, args) = parser.parse_args() | |
264 # Input files | |
265 if options.input is not None: | |
266 try: | |
267 inputPath = os.path.abspath(options.input) | |
268 inputFile = open(inputPath, 'r') | |
269 except Exception as e: | |
270 sys.stderr.write("failed: %s\n" % e) | |
271 exit(2) | |
272 else: | |
273 inputFile = sys.stdin | |
274 # Output files | |
275 bed_fh = None | |
276 gff_fh = None | |
277 gff_fa_file = None | |
278 gff_fa = None | |
279 outFile = None | |
280 if options.output is None: | |
281 # write to stdout | |
282 outFile = sys.stdout | |
283 if options.gff: | |
284 gff_fa_file = tempfile.NamedTemporaryFile(prefix='gff_fasta_', suffix=".fa", dir=os.getcwd()).name | |
285 gff_fa = open(gff_fa_file, 'w') | |
286 else: | |
287 try: | |
288 outPath = os.path.abspath(options.output) | |
289 outFile = open(outPath, 'w') | |
290 except Exception as e: | |
291 sys.stderr.write("failed: %s\n" % e) | |
292 exit(3) | |
293 if options.gff: | |
294 gff_fa_file = outPath | |
295 if options.bed: | |
296 bed_fh = open(options.bed, 'w') | |
297 bed_fh.write('track name="%s" description="%s" \n' % ('novel_junctioni_translations', 'test')) | |
298 if options.gff: | |
299 gff_fh = open(options.gff, 'w') | |
300 gff_fh.write("##gff-version 3.2.1\n") | |
301 if options.reference: | |
302 gff_fh.write("##genome-build %s %s\n" % (options.refsource if options.refsource else 'unknown', options.reference)) | |
303 leading_bp = 0 | |
304 trailing_bp = 0 | |
305 if options.leading_bp: | |
306 if options.leading_bp >= 0: | |
307 leading_bp = options.leading_bp | |
308 else: | |
309 sys.stderr.write("failed: leading_bp must be positive\n") | |
310 exit(5) | |
311 if options.trailing_bp: | |
312 if options.trailing_bp >= 0: | |
313 trailing_bp = options.trailing_bp | |
314 else: | |
315 sys.stderr.write("failed: trailing_bp must be positive\n") | |
316 exit(5) | |
317 # Scan bed file | |
30 try: | 318 try: |
31 fields = line.rstrip('\r\n').split('\t') | 319 for i, line in enumerate(inputFile): |
32 (chrom,chromStart,chromEnd,name,score,strand,thickStart,thickEnd,itemRgb,blockCount,blockSizes,blockStarts) = fields[0:12] | 320 if line.startswith('track'): |
33 seq = fields[12] if len(fields) > 12 else None | 321 if outFile and options.bed_format: |
34 self.chrom = chrom | 322 outFile.write(line) |
35 self.chromStart = int(chromStart) | 323 continue |
36 self.chromEnd = int(chromEnd) | 324 entry = BedEntry(line) |
37 self.name = name | 325 strand = 1 if entry.strand == '+' else -1 |
38 self.score = int(score) | 326 translations = entry.get_translations() |
39 self.strand = strand | 327 if options.debug: |
40 self.thickStart = int(thickStart) | 328 exon_seqs = entry.get_exon_seqs() |
41 self.thickEnd = int(thickEnd) | 329 exon_sizes = [len(seq) for seq in exon_seqs] |
42 self.itemRgb = itemRgb | 330 splice_sites = [int(sum(exon_sizes[:x]) / 3) for x in range(1, len(exon_sizes))] |
43 self.blockCount = int(blockCount) | 331 sys.stderr.write("%s\n" % entry.name) |
44 self.blockSizes = [int(x) for x in blockSizes.split(',')] | 332 sys.stderr.write("%s\n" % line.rstrip('\r\n')) |
45 self.blockStarts = [int(x) for x in blockStarts.split(',')] | 333 sys.stderr.write("exons: %s\n" % exon_seqs) |
46 self.seq = seq | 334 sys.stderr.write("%s\n" % splice_sites) |
47 except Exception, e: | 335 for i, translation in enumerate(translations): |
48 print >> sys.stderr, "Unable to read Bed entry" % e | 336 sys.stderr.write("frame %d: %s\n" % (i + 1, translation)) |
49 exit(1) | 337 sys.stderr.write("splice: %s\n" % (''.join(['^' if int(((j * 3) + i) / 3) in splice_sites else '-' for j in range(len(translation))]))) |
50 def __str__(self): | 338 sys.stderr.write("\n") |
51 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s%s' % ( | 339 if options.bed_format: |
52 self.chrom, self.chromStart, self.chromEnd, self.name, self.score, self.strand, self.thickStart, self.thickEnd, self.itemRgb, self.blockCount, | 340 tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'), '\t'.join(translations)) |
53 ','.join([str(x) for x in self.blockSizes]), | 341 outFile.write(tx_entry) |
54 ','.join([str(x) for x in self.blockStarts]), | 342 else: |
55 '\t%s' % self.seq if self.seq else '') | 343 translations = entry.get_filterd_translations(untrimmed=options.untrimmed, filtering=options.filtering, ignore_left_bp=leading_bp, ignore_right_bp=trailing_bp, debug=options.debug) |
56 def get_splice_junctions(self): | 344 for i, tx in enumerate(translations): |
57 splice_juncs = [] | 345 if tx: |
58 for i in range(self.blockCount - 1): | 346 (chromStart, chromEnd, translation, blockCount, blockSizes, blockStarts) = tx |
59 splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) | 347 if options.min_length is not None and len(translation) < options.min_length: |
60 splice_juncs.append(splice_junc) | 348 continue |
61 return splice_juncs | 349 if options.max_stop_codons is not None and translation.count('*') > options.max_stop_codons: |
62 def get_exon_seqs(self): | 350 continue |
63 exons = [] | 351 frame_name = '_%s' % (i + 1) |
64 for i in range(self.blockCount): | 352 pep_id = "%s%s%s" % (options.id_prefix, entry.name, frame_name) |
65 # splice_junc = "%s:%d_%d" % (self.chrom, self.chromStart + self.blockSizes[i], self.chromStart + self.blockStarts[i+1]) | 353 if bed_fh: |
66 exons.append(self.seq[self.blockStarts[i]:self.blockStarts[i] + self.blockSizes[i]]) | 354 bed_fh.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\n' % (str(entry.chrom), chromStart, chromEnd, pep_id, entry.score, entry.strand, chromStart, chromEnd, entry.itemRgb, blockCount, ','.join([str(x) for x in blockSizes]), ','.join([str(x) for x in blockStarts]), translation)) |
67 if self.strand == '-': #reverse complement | 355 location = "chromosome:%s:%s:%s:%s:%s" % (options.reference, entry.chrom, chromStart, chromEnd, strand) |
68 exons.reverse() | 356 score = " %s:%s" % (options.score_name, entry.score) if options.score_name else '' |
69 for i,s in enumerate(exons): | 357 seq_description = "%s %s%s" % (options.seqtype, location, score) |
70 exons[i] = reverse_complement(s) | 358 seq_id = "%s " % pep_id |
71 return exons | 359 if options.fa_db: |
72 def get_spliced_seq(self): | 360 seq_id = "%s%s%s%s" % (options.fa_db, options.fa_sep, pep_id, options.fa_sep) |
73 seq = ''.join(self.get_exon_seqs()) | 361 fa_id = "%s%s" % (seq_id, seq_description) |
74 return seq | 362 fa_entry = ">%s\n%s\n" % (fa_id, translation) |
75 def get_translation(self,sequence=None): | 363 outFile.write(fa_entry) |
76 translation = None | 364 if gff_fh: |
77 seq = sequence if sequence else self.get_spliced_seq() | 365 if gff_fa: |
78 if seq: | 366 gff_fa.write(fa_entry) |
79 seqlen = len(seq) / 3 * 3; | 367 gff_fh.write("##sequence-region %s %d %d\n" % (entry.chrom, chromStart + 1, chromEnd - 1)) |
80 if seqlen >= 3: | 368 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tID=%s\n" % (entry.chrom, 'splice_junc', 'gene', chromStart + 1, chromEnd - 1, entry.score, entry.strand, 0, pep_id)) |
81 translation = translate(seq[:seqlen]) | 369 for x in range(blockCount): |
82 return translation | 370 start = chromStart + blockStarts[x] + 1 |
83 def get_translations(self): | 371 end = start + blockSizes[x] - 1 |
84 translations = [] | 372 phase = (3 - sum(blockSizes[:x]) % 3) % 3 |
85 seq = self.get_spliced_seq() | 373 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tParent=%s;ID=%s_%d\n" % (entry.chrom, 'splice_junc', 'CDS', start, end, entry.score, entry.strand, phase, pep_id, pep_id, x)) |
86 if seq: | 374 # ##gff-version 3 |
87 for i in range(3): | 375 # ##sequence-region 19 1 287484 |
88 translation = self.get_translation(sequence=seq[i:]) | 376 # 19 MassSpec peptide 282299 287484 10.0 - 0 ID=TEARLSFYSGHSSFGMYCMVFLALYVQ |
89 if translation: | 377 # 19 MassSpec CDS 287474 287484 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812 |
90 translations.append(translation) | 378 # 19 MassSpec CDS 282752 282809 . - 1 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812 |
91 return translations | 379 # 19 MassSpec CDS 282299 282310 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812 |
92 ## (start,end) | 380 if bed_fh: |
93 def get_subrange(self,tstart,tstop): | 381 bed_fh.close() |
94 chromStart = self.chromStart | 382 if gff_fh: |
95 chromEnd = self.chromEnd | 383 if gff_fa: |
96 r = range(self.blockCount) | 384 gff_fa.close() |
97 if self.strand == '-': | 385 else: |
98 r.reverse() | 386 outFile.close() |
99 bStart = 0 | 387 gff_fa = open(gff_fa_file, 'r') |
100 for x in r: | 388 gff_fh.write("##FASTA\n") |
101 bEnd = bStart + self.blockSizes[x] | 389 for i, line in enumerate(gff_fa): |
102 if bStart <= tstart < bEnd: | 390 gff_fh.write(line) |
103 if self.strand == '+': | 391 gff_fh.close() |
104 chromStart = self.chromStart + self.blockStarts[x] + (tstart - bStart) | 392 except Exception as e: |
105 else: | 393 sys.stderr.write("failed: Error reading %s - %s\n" % (options.input if options.input else 'stdin', e)) |
106 chromEnd = self.chromStart + self.blockStarts[x] + (tstart - bStart) | 394 raise |
107 if bStart <= tstop < bEnd: | 395 exit(1) |
108 if self.strand == '+': | 396 |
109 chromEnd = self.chromStart + self.blockStarts[x] + (tstop - bStart) | 397 |
110 else: | 398 if __name__ == "__main__": |
111 chromStart = self.chromStart + self.blockStarts[x] + self.blockSizes[x] - (tstop - bStart) | 399 __main__() |
112 bStart += self.blockSizes[x] | |
113 return(chromStart,chromEnd) | |
114 #get the blocks for sub range | |
115 def get_blocks(self,chromStart,chromEnd): | |
116 tblockCount = 0 | |
117 tblockSizes = [] | |
118 tblockStarts = [] | |
119 for x in range(self.blockCount): | |
120 bStart = self.chromStart + self.blockStarts[x] | |
121 bEnd = bStart + self.blockSizes[x] | |
122 if bStart > chromEnd: | |
123 break | |
124 if bEnd < chromStart: | |
125 continue | |
126 cStart = max(chromStart,bStart) | |
127 tblockStarts.append(cStart - chromStart) | |
128 tblockSizes.append(min(chromEnd,bEnd) - cStart) | |
129 tblockCount += 1 | |
130 ## print >> sys.stderr, "tblockCount: %d tblockStarts: %s tblockSizes: %s" % (tblockCount,tblockStarts,tblockSizes) | |
131 return (tblockCount,tblockSizes,tblockStarts) | |
132 ## [(start,end,seq,blockCount,blockSizes,blockStarts),(start,end,seq,blockCount,blockSizes,blockStarts),(start,end,seq,blockCount,blockSizes,blockStarts)] | |
133 ## filter: ignore translation if stop codon in first exon after ignore_left_bp | |
134 def get_filterd_translations(self,untrimmed=False,filtering=True,ignore_left_bp=0,ignore_right_bp=0,debug=False): | |
135 translations = [None,None,None,None,None,None] | |
136 seq = self.get_spliced_seq() | |
137 ignore = (ignore_left_bp if self.strand == '+' else ignore_right_bp) / 3 | |
138 block_sum = sum(self.blockSizes) | |
139 exon_sizes = [x for x in self.blockSizes] | |
140 if self.strand == '-': | |
141 exon_sizes.reverse() | |
142 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))] | |
143 if debug: | |
144 print >> sys.stderr, "splice_sites: %s" % splice_sites | |
145 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0] | |
146 if seq: | |
147 for i in range(3): | |
148 translation = self.get_translation(sequence=seq[i:]) | |
149 if translation: | |
150 tstart = 0 | |
151 tstop = len(translation) | |
152 offset = (block_sum - i) % 3 | |
153 if debug: | |
154 print >> sys.stderr, "frame: %d\ttstart: %d tstop: %d offset: %d\t%s" % (i,tstart,tstop,offset,translation) | |
155 if not untrimmed: | |
156 tstart = translation.rfind('*',0,junc) + 1 | |
157 stop = translation.find('*',junc) | |
158 tstop = stop if stop >= 0 else len(translation) | |
159 offset = (block_sum - i) % 3 | |
160 trimmed = translation[tstart:tstop] | |
161 if debug: | |
162 print >> sys.stderr, "frame: %d\ttstart: %d tstop: %d offset: %d\t%s" % (i,tstart,tstop,offset,trimmed) | |
163 if filtering and tstart > ignore: | |
164 continue | |
165 #get genomic locations for start and end | |
166 if self.strand == '+': | |
167 chromStart = self.chromStart + i + (tstart * 3) | |
168 chromEnd = self.chromEnd - offset - (len(translation) - tstop) * 3 | |
169 else: | |
170 chromStart = self.chromStart + offset + (len(translation) - tstop) * 3 | |
171 chromEnd = self.chromEnd - i - (tstart * 3) | |
172 #get the blocks for this translation | |
173 (tblockCount,tblockSizes,tblockStarts) = self.get_blocks(chromStart,chromEnd) | |
174 translations[i] = (chromStart,chromEnd,trimmed,tblockCount,tblockSizes,tblockStarts) | |
175 if debug: | |
176 print >> sys.stderr, "tblockCount: %d tblockStarts: %s tblockSizes: %s" % (tblockCount,tblockStarts,tblockSizes) | |
177 # translations[i] = (chromStart,chromEnd,trimmed,tblockCount,tblockSizes,tblockStarts) | |
178 return translations | |
179 def get_seq_id(self,seqtype='unk:unk',reference='',frame=None): | |
180 ## Ensembl fasta ID format | |
181 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT | |
182 # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1 gene:ENSG00000158815:transcript:ENST00000328693 gene_biotype:protein_coding transcript_biotype:protein_coding | |
183 frame_name = '' | |
184 chromStart = self.chromStart | |
185 chromEnd = self.chromEnd | |
186 strand = 1 if self.strand == '+' else -1 | |
187 if frame != None: | |
188 block_sum = sum(self.blockSizes) | |
189 offset = (block_sum - frame) % 3 | |
190 frame_name = '_' + str(frame + 1) | |
191 if self.strand == '+': | |
192 chromStart += frame | |
193 chromEnd -= offset | |
194 else: | |
195 chromStart += offset | |
196 chromEnd -= frame | |
197 location = "chromosome:%s:%s:%s:%s:%s" % (reference,self.chrom,chromStart,chromEnd,strand) | |
198 seq_id = "%s%s %s %s" % (self.name,frame_name,seqtype,location) | |
199 return seq_id | |
200 def get_line(self, start_offset = 0, end_offset = 0): | |
201 if start_offset or end_offset: | |
202 s_offset = start_offset if start_offset else 0 | |
203 e_offset = end_offset if end_offset else 0 | |
204 if s_offset > self.chromStart: | |
205 s_offset = self.chromStart | |
206 chrStart = self.chromStart - s_offset | |
207 chrEnd = self.chromEnd + e_offset | |
208 blkSizes = self.blockSizes | |
209 blkSizes[0] += s_offset | |
210 blkSizes[-1] += e_offset | |
211 blkStarts = self.blockStarts | |
212 for i in range(1,self.blockCount): | |
213 blkStarts[i] += s_offset | |
214 items = [str(x) for x in [self.chrom,chrStart,chrEnd,self.name,self.score,self.strand,self.thickStart,self.thickEnd,self.itemRgb,self.blockCount,','.join([str(x) for x in blkSizes]),','.join([str(x) for x in blkStarts])]] | |
215 return '\t'.join(items) + '\n' | |
216 return self.line | |
217 | |
218 def __main__(): | |
219 #Parse Command Line | |
220 parser = optparse.OptionParser() | |
221 parser.add_option( '-i', '--input', dest='input', help='BED file (tophat junctions.bed) with sequence column added' ) | |
222 parser.add_option( '-o', '--output', dest='output', help='Translations of spliced sequence') | |
223 parser.add_option( '-b', '--bed_format', dest='bed_format', action='store_true', default=False, help='Append translations to bed file instead of fasta' ) | |
224 parser.add_option( '-D', '--fa_db', dest='fa_db', default=None, help='Prefix DB identifier for fasta ID line, e.g. generic' ) | |
225 parser.add_option( '-s', '--fa_sep', dest='fa_sep', default='|', help='fasta ID separator defaults to pipe char, e.g. generic|ProtID|description' ) | |
226 parser.add_option( '-B', '--bed', dest='bed', default=None, help='Output a bed file with added 13th column having translation' ) | |
227 parser.add_option( '-G', '--gff3', dest='gff', default=None, help='Output translations to a GFF3 file' ) | |
228 parser.add_option( '-S', '--seqtype', dest='seqtype', default='pep:splice', help='SEQTYPE:STATUS for fasta ID line' ) | |
229 parser.add_option( '-P', '--id_prefix', dest='id_prefix', default='', help='prefix for the sequence ID' ) | |
230 parser.add_option( '-R', '--reference', dest='reference', default=None, help='Genome Reference Name for fasta ID location ' ) | |
231 parser.add_option( '-r', '--refsource', dest='refsource', default=None, help='Source for Genome Reference, e.g. Ensembl, UCSC, or NCBI' ) | |
232 parser.add_option( '-Q', '--score_name', dest='score_name', default=None, help='include in the fasta ID line score_name:score ' ) | |
233 parser.add_option( '-l', '--leading_bp', dest='leading_bp', type='int', default=None, help='leading number of base pairs to ignore when filtering' ) | |
234 parser.add_option( '-t', '--trailing_bp', dest='trailing_bp', type='int', default=None, help='trailing number of base pairs to ignore when filtering' ) | |
235 parser.add_option( '-U', '--unfiltered', dest='filtering', action='store_false', default=True, help='Do NOT filterout translation with stop codon in the first exon' ) | |
236 parser.add_option( '-u', '--untrimmed', dest='untrimmed', action='store_true', default=False, help='Do NOT trim from splice site to stop codon' ) | |
237 parser.add_option( '-L', '--min_length', dest='min_length', type='int', default=None, help='Minimun length (to first stop codon)' ) | |
238 parser.add_option( '-M', '--max_stop_codons', dest='max_stop_codons', type='int', default=None, help='Filter out translations with more than max_stop_codons' ) | |
239 parser.add_option( '-d', '--debug', dest='debug', action='store_true', default=False, help='Turn on wrapper debugging to stdout' ) | |
240 (options, args) = parser.parse_args() | |
241 # Input files | |
242 if options.input != None: | |
243 try: | |
244 inputPath = os.path.abspath(options.input) | |
245 inputFile = open(inputPath, 'r') | |
246 except Exception, e: | |
247 print >> sys.stderr, "failed: %s" % e | |
248 exit(2) | |
249 else: | |
250 inputFile = sys.stdin | |
251 # Output files | |
252 bed_fh = None | |
253 gff_fh = None | |
254 gff_fa_file = None | |
255 gff_fa = None | |
256 outFile = None | |
257 if options.output == None: | |
258 #write to stdout | |
259 outFile = sys.stdout | |
260 if options.gff: | |
261 gff_fa_file = tempfile.NamedTemporaryFile(prefix='gff_fasta_',suffix=".fa",dir=os.getcwd()).name | |
262 gff_fa = open(gff_fa_file,'w') | |
263 else: | |
264 try: | |
265 outPath = os.path.abspath(options.output) | |
266 outFile = open(outPath, 'w') | |
267 except Exception, e: | |
268 print >> sys.stderr, "failed: %s" % e | |
269 exit(3) | |
270 if options.gff: | |
271 gff_fa_file = outPath | |
272 if options.bed: | |
273 bed_fh = open(options.bed,'w') | |
274 bed_fh.write('track name="%s" description="%s" \n' % ('novel_junctioni_translations','test')) | |
275 if options.gff: | |
276 gff_fh = open(options.gff,'w') | |
277 gff_fh.write("##gff-version 3.2.1\n") | |
278 if options.reference: | |
279 gff_fh.write("##genome-build %s %s\n" % (options.refsource if options.refsource else 'unknown', options.reference)) | |
280 leading_bp = 0 | |
281 trailing_bp = 0 | |
282 if options.leading_bp: | |
283 if options.leading_bp >= 0: | |
284 leading_bp = options.leading_bp | |
285 else: | |
286 print >> sys.stderr, "failed: leading_bp must be positive" | |
287 exit(5) | |
288 if options.trailing_bp: | |
289 if options.trailing_bp >= 0: | |
290 trailing_bp = options.trailing_bp | |
291 else: | |
292 print >> sys.stderr, "failed: trailing_bp must be positive" | |
293 exit(5) | |
294 # Scan bed file | |
295 try: | |
296 for i, line in enumerate( inputFile ): | |
297 if line.startswith('track'): | |
298 if outFile and options.bed_format: | |
299 outFile.write(line) | |
300 continue | |
301 entry = BedEntry(line) | |
302 strand = 1 if entry.strand == '+' else -1 | |
303 translations = entry.get_translations() | |
304 if options.debug: | |
305 exon_seqs = entry.get_exon_seqs() | |
306 exon_sizes = [len(seq) for seq in exon_seqs] | |
307 splice_sites = [sum(exon_sizes[:x]) / 3 for x in range(1,len(exon_sizes))] | |
308 print >> sys.stderr, entry.name | |
309 print >> sys.stderr, line.rstrip('\r\n') | |
310 print >> sys.stderr, "exons: %s" % exon_seqs | |
311 print >> sys.stderr, "%s" % splice_sites | |
312 for i,translation in enumerate(translations): | |
313 print >> sys.stderr, "frame %d: %s" % (i+1,translation) | |
314 print >> sys.stderr, "splice: %s" % (''.join(['^' if (((j*3)+i)/3) in splice_sites else '-' for j in range(len(translation))])) | |
315 print >> sys.stderr, "" | |
316 if options.bed_format: | |
317 tx_entry = "%s\t%s\n" % (line.rstrip('\r\n'),'\t'.join(translations)) | |
318 outFile.write(tx_entry) | |
319 else: | |
320 translations = entry.get_filterd_translations(untrimmed=options.untrimmed,filtering=options.filtering,ignore_left_bp=leading_bp,ignore_right_bp=trailing_bp,debug=options.debug) | |
321 for i,tx in enumerate(translations): | |
322 if tx: | |
323 (chromStart,chromEnd,translation,blockCount,blockSizes,blockStarts) = tx | |
324 if options.min_length != None and len(translation) < options.min_length: | |
325 continue | |
326 if options.max_stop_codons != None and translation.count('*') > options.max_stop_codons: | |
327 continue | |
328 frame_name = '_%s' % (i + 1) | |
329 pep_id = "%s%s%s" % (options.id_prefix,entry.name,frame_name) | |
330 if bed_fh: | |
331 bed_fh.write('%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s\t%s\n' % (str(entry.chrom),chromStart,chromEnd,pep_id,entry.score,entry.strand,chromStart,chromEnd,entry.itemRgb,blockCount,','.join([str(x) for x in blockSizes]),','.join([str(x) for x in blockStarts]),translation)) | |
332 location = "chromosome:%s:%s:%s:%s:%s" % (options.reference,entry.chrom,chromStart,chromEnd,strand) | |
333 score = " %s:%s" % (options.score_name,entry.score) if options.score_name else '' | |
334 seq_description = "%s %s%s" % (options.seqtype, location, score) | |
335 seq_id = "%s " % pep_id | |
336 if options.fa_db: | |
337 seq_id = "%s%s%s%s" % (options.fa_db,options.fa_sep,pep_id,options.fa_sep) | |
338 fa_id = "%s%s" % (seq_id,seq_description) | |
339 fa_entry = ">%s\n%s\n" % (fa_id,translation) | |
340 outFile.write(fa_entry) | |
341 if gff_fh: | |
342 if gff_fa: | |
343 gff_fa.write(fa_entry) | |
344 gff_fh.write("##sequence-region %s %d %d\n" % (entry.chrom,chromStart + 1,chromEnd - 1)) | |
345 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tID=%s\n" % (entry.chrom,'splice_junc','gene',chromStart + 1,chromEnd - 1,entry.score,entry.strand,0,pep_id)) | |
346 for x in range(blockCount): | |
347 start = chromStart+blockStarts[x] + 1 | |
348 end = start + blockSizes[x] - 1 | |
349 phase = (3 - sum(blockSizes[:x]) % 3) % 3 | |
350 gff_fh.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d\tParent=%s;ID=%s_%d\n" % (entry.chrom,'splice_junc','CDS',start,end,entry.score,entry.strand,phase,pep_id,pep_id,x)) | |
351 """ | |
352 ##gff-version 3 | |
353 ##sequence-region 19 1 287484 | |
354 19 MassSpec peptide 282299 287484 10.0 - 0 ID=TEARLSFYSGHSSFGMYCMVFLALYVQ | |
355 19 MassSpec CDS 287474 287484 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812 | |
356 19 MassSpec CDS 282752 282809 . - 1 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812 | |
357 19 MassSpec CDS 282299 282310 . - 0 Parent=TEARLSFYSGHSSFGMYCMVFLALYVQ;transcript_id=ENST00000269812 | |
358 """ | |
359 if bed_fh: | |
360 bed_fh.close() | |
361 if gff_fh: | |
362 if gff_fa: | |
363 gff_fa.close() | |
364 else: | |
365 outFile.close() | |
366 gff_fa = open(gff_fa_file,'r') | |
367 gff_fh.write("##FASTA\n") | |
368 for i, line in enumerate(gff_fa): | |
369 gff_fh.write(line) | |
370 gff_fh.close() | |
371 except Exception, e: | |
372 print >> sys.stderr, "failed: Error reading %s - %s" % (options.input if options.input else 'stdin',e) | |
373 | |
374 if __name__ == "__main__" : __main__() | |
375 |