comparison bedutil.py @ 0:da1b538b87e5 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/proteogenomics/retrieve_ensembl_bed commit 88cf1e923a8c9e5bc6953ad412d15a7c70f054d1
author galaxyp
date Mon, 22 Jan 2018 13:13:47 -0500 (2018-01-22)
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:da1b538b87e5
1 #!/usr/bin/env python
2 """
3 #
4 #------------------------------------------------------------------------------
5 # University of Minnesota
6 # Copyright 2016, Regents of the University of Minnesota
7 #------------------------------------------------------------------------------
8 # Author:
9 #
10 # James E Johnson
11 #
12 #------------------------------------------------------------------------------
13 """
14
15 from __future__ import print_function
16
17 import sys
18
19 from Bio.Seq import reverse_complement, translate
20
21
22 def bed_from_line(line, ensembl=False, seq_column=None):
23 fields = line.rstrip('\r\n').split('\t')
24 if len(fields) < 12:
25 return None
26 (chrom, chromStart, chromEnd, name, score, strand,
27 thickStart, thickEnd, itemRgb,
28 blockCount, blockSizes, blockStarts) = fields[0:12]
29 bed_entry = BedEntry(chrom=chrom, chromStart=chromStart, chromEnd=chromEnd,
30 name=name, score=score, strand=strand,
31 thickStart=thickStart, thickEnd=thickEnd,
32 itemRgb=itemRgb,
33 blockCount=blockCount,
34 blockSizes=blockSizes.rstrip(','),
35 blockStarts=blockStarts.rstrip(','))
36 if seq_column is not None and -len(fields) <= seq_column < len(fields):
37 bed_entry.seq = fields[seq_column]
38 if ensembl and len(fields) >= 20:
39 bed_entry.second_name = fields[12]
40 bed_entry.cds_start_status = fields[13]
41 bed_entry.cds_end_status = fields[14]
42 bed_entry.exon_frames = fields[15].rstrip(',')
43 bed_entry.biotype = fields[16]
44 bed_entry.gene_name = fields[17]
45 bed_entry.second_gene_name = fields[18]
46 bed_entry.gene_type = fields[19]
47 return bed_entry
48
49
50 def as_int_list(obj):
51 if obj is None:
52 return None
53 if isinstance(obj, list):
54 return [int(x) for x in obj]
55 elif isinstance(obj, str):
56 return [int(x) for x in obj.split(',')]
57 else: # python2 unicode?
58 return [int(x) for x in str(obj).split(',')]
59
60
61 class BedEntry(object):
62 def __init__(self, chrom=None, chromStart=None, chromEnd=None,
63 name=None, score=None, strand=None,
64 thickStart=None, thickEnd=None, itemRgb=None,
65 blockCount=None, blockSizes=None, blockStarts=None):
66 self.chrom = chrom
67 self.chromStart = int(chromStart)
68 self.chromEnd = int(chromEnd)
69 self.name = name
70 self.score = int(score) if score is not None else 0
71 self.strand = '-' if str(strand).startswith('-') else '+'
72 self.thickStart = int(thickStart) if thickStart else self.chromStart
73 self.thickEnd = int(thickEnd) if thickEnd else self.chromEnd
74 self.itemRgb = str(itemRgb) if itemRgb is not None else r'100,100,100'
75 self.blockCount = int(blockCount)
76 self.blockSizes = as_int_list(blockSizes)
77 self.blockStarts = as_int_list(blockStarts)
78 self.second_name = None
79 self.cds_start_status = None
80 self.cds_end_status = None
81 self.exon_frames = None
82 self.biotype = None
83 self.gene_name = None
84 self.second_gene_name = None
85 self.gene_type = None
86 self.seq = None
87 self.cdna = None
88 self.pep = None
89 # T26C
90 self.aa_change = []
91 # p.Trp26Cys g.<pos><ref>><alt> # g.1304573A>G
92 self.variants = []
93
94 def __str__(self):
95 return '%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%s\t%d\t%s\t%s' % (
96 self.chrom, self.chromStart, self.chromEnd,
97 self.name, self.score, self.strand,
98 self.thickStart, self.thickEnd, str(self.itemRgb), self.blockCount,
99 ','.join([str(x) for x in self.blockSizes]),
100 ','.join([str(x) for x in self.blockStarts]))
101
102 def get_splice_junctions(self):
103 splice_juncs = []
104 for i in range(self.blockCount - 1):
105 splice_junc = "%s:%d_%d"\
106 % (self.chrom,
107 self.chromStart + self.blockSizes[i],
108 self.chromStart + self.blockStarts[i+1])
109 splice_juncs.append(splice_junc)
110 return splice_juncs
111
112 def get_exon_seqs(self):
113 if not self.seq:
114 return None
115 exons = []
116 for i in range(self.blockCount):
117 exons.append(self.seq[self.blockStarts[i]:self.blockStarts[i]
118 + self.blockSizes[i]])
119 if self.strand == '-': # reverse complement
120 exons.reverse()
121 for i, s in enumerate(exons):
122 exons[i] = reverse_complement(s)
123 return exons
124
125 def get_spliced_seq(self, strand=None):
126 if not self.seq:
127 return None
128 seq = ''.join(self.get_exon_seqs())
129 if strand and self.strand != strand:
130 seq = reverse_complement(seq)
131 return seq
132
133 def get_cdna(self):
134 if not self.cdna:
135 self.cdna = self.get_spliced_seq()
136 return self.cdna
137
138 def get_cds(self):
139 cdna = self.get_cdna()
140 if cdna:
141 if self.chromStart == self.thickStart\
142 and self.chromEnd == self.thickEnd:
143 return cdna
144 pos = [self.cdna_offset_of_pos(self.thickStart),
145 self.cdna_offset_of_pos(self.thickEnd)]
146 if 0 <= min(pos) <= max(pos) <= len(cdna):
147 return cdna[min(pos):max(pos)]
148 return None
149
150 def set_cds(self, cdna_start, cdna_end):
151 cdna_len = sum(self.blockSizes)
152 if 0 <= cdna_start < cdna_end <= cdna_len:
153 cds_pos = [self.pos_of_cdna_offet(cdna_start),
154 self.pos_of_cdna_offet(cdna_end)]
155 if all(cds_pos):
156 self.thickStart = min(cds_pos)
157 self.thickEnd = max(cds_pos)
158 return self
159 return None
160
161 def trim_cds(self, basepairs):
162 if self.chromStart <= self.thickStart < self.thickEnd <= self.chromEnd:
163 cds_pos = [self.cdna_offset_of_pos(self.thickStart),
164 self.cdna_offset_of_pos(self.thickEnd)]
165 if basepairs > 0:
166 return self.set_cds(min(cds_pos) + basepairs, max(cds_pos))
167 else:
168 return self.set_cds(min(cds_pos), max(cds_pos) + basepairs)
169 return None
170
171 def get_cds_bed(self):
172 cds_pos = [self.cdna_offset_of_pos(self.thickStart),
173 self.cdna_offset_of_pos(self.thickEnd)]
174 return self.trim(min(cds_pos), max(cds_pos))
175
176 def get_cigar(self):
177 cigar = ''
178 r = range(self.blockCount)
179 xl = None
180 for x in r:
181 if xl is not None:
182 intronSize = abs(self.blockStarts[x] - self.blockSizes[xl]
183 - self.blockStarts[xl])
184 cigar += '%dN' % intronSize
185 cigar += '%dM' % self.blockSizes[x]
186 xl = x
187 return cigar
188
189 def get_cigar_md(self):
190 cigar = ''
191 md = ''
192 r = range(self.blockCount)
193 xl = None
194 for x in r:
195 if xl is not None:
196 intronSize = abs(self.blockStarts[x] - self.blockSizes[xl]
197 - self.blockStarts[xl])
198 cigar += '%dN' % intronSize
199 cigar += '%dM' % self.blockSizes[x]
200 xl = x
201 md = '%d' % sum(self.blockSizes)
202 return (cigar, md)
203
204 def get_translation(self, sequence=None):
205 translation = None
206 seq = sequence if sequence else self.get_spliced_seq()
207 if seq:
208 seqlen = len(seq) / 3 * 3
209 if seqlen >= 3:
210 translation = translate(seq[:seqlen])
211 return translation
212
213 def get_translations(self):
214 translations = []
215 seq = self.get_spliced_seq()
216 if seq:
217 for i in range(3):
218 translation = self.get_translation(sequence=seq[i:])
219 if translation:
220 translations.append(translation)
221 return translations
222
223 def pos_of_cdna_offet(self, offset):
224 if offset is not None and 0 <= offset < sum(self.blockSizes):
225 r = list(range(self.blockCount))
226 rev = self.strand == '-'
227 if rev:
228 r.reverse()
229 nlen = 0
230 for x in r:
231 if offset < nlen + self.blockSizes[x]:
232 if rev:
233 return self.chromStart + self.blockStarts[x]\
234 + self.blockSizes[x] - (offset - nlen)
235 else:
236 return self.chromStart + self.blockStarts[x]\
237 + (offset - nlen)
238 nlen += self.blockSizes[x]
239 return None
240
241 def cdna_offset_of_pos(self, pos):
242 if not self.chromStart <= pos < self.chromEnd:
243 return -1
244 r = list(range(self.blockCount))
245 rev = self.strand == '-'
246 if rev:
247 r.reverse()
248 nlen = 0
249 for x in r:
250 bStart = self.chromStart + self.blockStarts[x]
251 bEnd = bStart + self.blockSizes[x]
252 if bStart <= pos < bEnd:
253 return nlen + (bEnd - pos if rev else pos - bStart)
254 nlen += self.blockSizes[x]
255
256 def apply_variant(self, pos, ref, alt):
257 pos = int(pos)
258 if not ref or not alt:
259 print("variant requires ref and alt sequences", file=sys.stderr)
260 return
261 if not self.chromStart <= pos <= self.chromEnd:
262 print("variant not in entry %s: %s %d < %d < %d" %
263 (self.name, self.strand,
264 self.chromStart, pos, self.chromEnd),
265 file=sys.stderr)
266 print("%s" % str(self), file=sys.stderr)
267 return
268 if len(ref) != len(alt):
269 print("variant only works for snp: %s %s" % (ref, alt),
270 file=sys.stderr)
271 return
272 if not self.seq:
273 print("variant entry %s has no seq" % self.name, file=sys.stderr)
274 return
275 """
276 if self.strand == '-':
277 ref = reverse_complement(ref)
278 alt = reverse_complement(alt)
279 """
280 bases = list(self.seq)
281 offset = pos - self.chromStart
282 for i in range(len(ref)):
283 # offset = self.cdna_offset_of_pos(pos+i)
284 if offset is not None:
285 bases[offset+i] = alt[i]
286 else:
287 print("variant offset %s: %s %d < %d < %d" %
288 (self.name, self.strand, self.chromStart,
289 pos+1, self.chromEnd), file=sys.stderr)
290 print("%s" % str(self), file=sys.stderr)
291 self.seq = ''.join(bases)
292 self.variants.append("g.%d%s>%s" % (pos+1, ref, alt))
293
294 def get_variant_bed(self, pos, ref, alt):
295 pos = int(pos)
296 if not ref or not alt:
297 print("variant requires ref and alt sequences", file=sys.stderr)
298 return None
299 if not self.chromStart <= pos <= self.chromEnd:
300 print("variant not in entry %s: %s %d < %d < %d" %
301 (self.name, self.strand,
302 self.chromStart, pos, self.chromEnd),
303 file=sys.stderr)
304 print("%s" % str(self), file=sys.stderr)
305 return None
306 if not self.seq:
307 print("variant entry %s has no seq" % self.name, file=sys.stderr)
308 return None
309 tbed = BedEntry(chrom=self.chrom,
310 chromStart=self.chromStart, chromEnd=self.chromEnd,
311 name=self.name, score=self.score, strand=self.strand,
312 thickStart=self.chromStart, thickEnd=self.chromEnd,
313 itemRgb=self.itemRgb,
314 blockCount=self.blockCount,
315 blockSizes=self.blockSizes,
316 blockStarts=self.blockStarts)
317 bases = list(self.seq)
318 offset = pos - self.chromStart
319 tbed.seq = ''.join(bases[:offset] + list(alt)
320 + bases[offset+len(ref):])
321 if len(ref) != len(alt):
322 diff = len(alt) - len(ref)
323 rEnd = pos + len(ref)
324 # need to adjust blocks
325 # change spans blocks,
326 for x in range(tbed.blockCount):
327 bStart = tbed.chromStart + tbed.blockStarts[x]
328 bEnd = bStart + tbed.blockSizes[x]
329 # change within a block or extends (last block)
330 # adjust blocksize
331 # seq: GGGcatGGG
332 # ref c alt tag: GGGtagatGGG
333 # ref cat alt a: GGGaGGG
334 if bStart <= pos < rEnd < bEnd:
335 tbed.blockSizes[x] += diff
336 return tbed
337
338 # (start, end)
339 def get_subrange(self, tstart, tstop, debug=False):
340 chromStart = self.chromStart
341 chromEnd = self.chromEnd
342 if debug:
343 print("%s" % (str(self)), file=sys.stderr)
344 r = list(range(self.blockCount))
345 if self.strand == '-':
346 r.reverse()
347 bStart = 0
348 bEnd = 0
349 for x in r:
350 bEnd = bStart + self.blockSizes[x]
351 if bStart <= tstart < bEnd:
352 if self.strand == '+':
353 chromStart = self.chromStart + self.blockStarts[x] +\
354 (tstart - bStart)
355 else:
356 chromEnd = self.chromStart + self.blockStarts[x] +\
357 self.blockSizes[x] - (tstart - bStart)
358 if bStart <= tstop < bEnd:
359 if self.strand == '+':
360 chromEnd = self.chromStart + self.blockStarts[x] +\
361 (tstop - bStart)
362 else:
363 chromStart = self.chromStart + self.blockStarts[x] +\
364 self.blockSizes[x] - (tstop - bStart)
365 if debug:
366 print("%3d %s\t%d\t%d\t%d\t%d\t%d\t%d" %
367 (x, self.strand, bStart, bEnd,
368 tstart, tstop, chromStart, chromEnd), file=sys.stderr)
369 bStart += self.blockSizes[x]
370 return(chromStart, chromEnd)
371
372 # get the blocks for sub range
373 def get_blocks(self, chromStart, chromEnd):
374 tblockCount = 0
375 tblockSizes = []
376 tblockStarts = []
377 for x in range(self.blockCount):
378 bStart = self.chromStart + self.blockStarts[x]
379 bEnd = bStart + self.blockSizes[x]
380 if bStart > chromEnd:
381 break
382 if bEnd < chromStart:
383 continue
384 cStart = max(chromStart, bStart)
385 tblockStarts.append(cStart - chromStart)
386 tblockSizes.append(min(chromEnd, bEnd) - cStart)
387 tblockCount += 1
388 return (tblockCount, tblockSizes, tblockStarts)
389
390 def trim(self, tstart, tstop, debug=False):
391 (tchromStart, tchromEnd) =\
392 self.get_subrange(tstart, tstop, debug=debug)
393 (tblockCount, tblockSizes, tblockStarts) =\
394 self.get_blocks(tchromStart, tchromEnd)
395 tbed = BedEntry(
396 chrom=self.chrom, chromStart=tchromStart, chromEnd=tchromEnd,
397 name=self.name, score=self.score, strand=self.strand,
398 thickStart=tchromStart, thickEnd=tchromEnd, itemRgb=self.itemRgb,
399 blockCount=tblockCount,
400 blockSizes=tblockSizes, blockStarts=tblockStarts)
401 if self.seq:
402 ts = tchromStart-self.chromStart
403 te = tchromEnd - tchromStart + ts
404 tbed.seq = self.seq[ts:te]
405 return tbed
406
407 def get_filtered_translations(self, untrimmed=False, filtering=True,
408 ignore_left_bp=0, ignore_right_bp=0,
409 debug=False):
410 translations = [None, None, None]
411 seq = self.get_spliced_seq()
412 ignore = (ignore_left_bp if self.strand == '+'
413 else ignore_right_bp) / 3
414 block_sum = sum(self.blockSizes)
415 exon_sizes = [x for x in self.blockSizes]
416 if self.strand == '-':
417 exon_sizes.reverse()
418 splice_sites = [sum(exon_sizes[:x]) / 3
419 for x in range(1, len(exon_sizes))]
420 if debug:
421 print("splice_sites: %s" % splice_sites, file=sys.stderr)
422 junc = splice_sites[0] if len(splice_sites) > 0 else exon_sizes[0]
423 if seq:
424 for i in range(3):
425 translation = self.get_translation(sequence=seq[i:])
426 if translation:
427 tstart = 0
428 tstop = len(translation)
429 offset = (block_sum - i) % 3
430 if debug:
431 print("frame: %d\ttstart: %d tstop: %d " +
432 "offset: %d\t%s" %
433 (i, tstart, tstop, offset, translation),
434 file=sys.stderr)
435 if not untrimmed:
436 tstart = translation.rfind('*', 0, junc) + 1
437 stop = translation.find('*', junc)
438 tstop = stop if stop >= 0 else len(translation)
439 offset = (block_sum - i) % 3
440 trimmed = translation[tstart:tstop]
441 if debug:
442 print("frame: %d\ttstart: %d tstop: %d " +
443 "offset: %d\t%s" %
444 (i, tstart, tstop, offset, trimmed),
445 file=sys.stderr)
446 if filtering and tstart > ignore:
447 continue
448 # get genomic locations for start and end
449 if self.strand == '+':
450 chromStart = self.chromStart + i + (tstart * 3)
451 chromEnd = self.chromEnd - offset\
452 - (len(translation) - tstop) * 3
453 else:
454 chromStart = self.chromStart + offset\
455 + (len(translation) - tstop) * 3
456 chromEnd = self.chromEnd - i - (tstart * 3)
457 # get the blocks for this translation
458 (tblockCount, tblockSizes, tblockStarts) =\
459 self.get_blocks(chromStart, chromEnd)
460 translations[i] = (chromStart, chromEnd, trimmed,
461 tblockCount, tblockSizes, tblockStarts)
462 if debug:
463 print("tblockCount: %d tblockStarts: %s " +
464 "tblockSizes: %s" %
465 (tblockCount, tblockStarts, tblockSizes),
466 file=sys.stderr)
467 return translations
468
469 def get_seq_id(self, seqtype='unk:unk', reference='', frame=None):
470 # Ensembl fasta ID format
471 # >ID SEQTYPE:STATUS LOCATION GENE TRANSCRIPT
472 # >ENSP00000328693 pep:splice chromosome:NCBI35:1:904515:910768:1\
473 # gene:ENSG00000158815:transcript:ENST00000328693\
474 # gene_biotype:protein_coding transcript_biotype:protein_coding
475 frame_name = ''
476 chromStart = self.chromStart
477 chromEnd = self.chromEnd
478 strand = 1 if self.strand == '+' else -1
479 if frame is not None:
480 block_sum = sum(self.blockSizes)
481 offset = (block_sum - frame) % 3
482 frame_name = '_' + str(frame + 1)
483 if self.strand == '+':
484 chromStart += frame
485 chromEnd -= offset
486 else:
487 chromStart += offset
488 chromEnd -= frame
489 location = "chromosome:%s:%s:%s:%s:%s"\
490 % (reference, self.chrom, chromStart, chromEnd, strand)
491 seq_id = "%s%s %s %s" % (self.name, frame_name, seqtype, location)
492 return seq_id
493
494 def get_line(self, start_offset=0, end_offset=0):
495 if start_offset or end_offset:
496 s_offset = start_offset if start_offset else 0
497 e_offset = end_offset if end_offset else 0
498 if s_offset > self.chromStart:
499 s_offset = self.chromStart
500 chrStart = self.chromStart - s_offset
501 chrEnd = self.chromEnd + e_offset
502 blkSizes = self.blockSizes
503 blkSizes[0] += s_offset
504 blkSizes[-1] += e_offset
505 blkStarts = self.blockStarts
506 for i in range(1, self.blockCount):
507 blkStarts[i] += s_offset
508 items = [str(x) for x in [self.chrom, chrStart, chrEnd, self.name,
509 self.score, self.strand, self.thickStart,
510 self.thickEnd, self.itemRgb,
511 self.blockCount,
512 ','.join([str(x) for x in blkSizes]),
513 ','.join([str(x) for x in blkStarts])]]
514 return '\t'.join(items) + '\n'
515 return self.line