comparison ngsutils/bam/__init__.py @ 0:4e4e4093d65d draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author iuc
date Wed, 11 Nov 2015 13:04:07 -0500
parents
children 7a68005de299
comparison
equal deleted inserted replaced
-1:000000000000 0:4e4e4093d65d
1 import sys
2 import os
3 import re
4 import pysam
5 try:
6 from eta import ETA
7 except:
8 pass
9 import ngsutils.support
10
11
12 def bam_open(fname, mode='r', *args, **kwargs):
13 if fname.lower()[-4:] == '.bam':
14 return pysam.Samfile(fname, '%sb' % mode, *args, **kwargs)
15 return pysam.Samfile(fname, '%s' % mode, *args, **kwargs)
16
17
18 def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None):
19 if not quiet and bam.filename:
20 eta = ETA(os.stat(bam.filename).st_size)
21 else:
22 eta = None
23
24 for pileup in bam.pileup(mask=mask):
25 pos = bam.tell()
26 bgz_offset = pos >> 16
27
28 if not quiet:
29 if callback:
30 eta.print_status(bgz_offset, extra=callback(pileup))
31 else:
32 eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos))
33
34 yield pileup
35
36 if eta:
37 eta.done()
38
39
40 def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None):
41 '''
42 >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)]
43 ['A', 'B', 'E', 'C', 'D', 'F', 'Z']
44 '''
45
46 if os.path.exists('%s.bai' % bam.filename):
47 # This is an indexed file, so it is ref sorted...
48 # Meaning that we should show chrom:pos, instead of read names
49 show_ref_pos = True
50
51 eta = None
52
53 if not ref:
54 if not quiet and bam.filename:
55 eta = ETA(os.stat(bam.filename).st_size)
56
57 for read in bam:
58 pos = bam.tell()
59 bgz_offset = pos >> 16
60
61 if not quiet and eta:
62 if callback:
63 eta.print_status(bgz_offset, extra=callback(read))
64 elif (show_ref_pos):
65 if read.tid > -1:
66 eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
67 else:
68 eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname))
69 else:
70 eta.print_status(bgz_offset, extra='%s' % read.qname)
71
72 yield read
73
74 else:
75 working_chrom = None
76 if ref in bam.references:
77 working_chrom = ref
78 elif ref[0:3] == 'chr':
79 # compensate for Ensembl vs UCSC ref naming
80 if ref[3:] in bam.references:
81 working_chrom = ref[3:]
82
83 if not working_chrom:
84 raise ValueError('Missing reference: %s' % ref)
85
86 tid = bam.gettid(working_chrom)
87
88 if not start:
89 start = 0
90 if not end:
91 end = bam.lengths[tid]
92
93 if not quiet and bam.filename:
94 eta = ETA(end - start)
95
96 for read in bam.fetch(working_chrom, start, end):
97 if not quiet and eta:
98 if callback:
99 eta.print_status(read.pos - start, extra=callback(read))
100 else:
101 eta.print_status(read.pos - start, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname))
102
103 yield read
104
105 if eta:
106 eta.done()
107
108
109 def bam_batch_reads(bam, quiet=False):
110 '''
111 Batch mapping for the same reads (qname) together, this way
112 they can all be compared/converted together.
113 '''
114 reads = []
115 last = None
116 for read in bam_iter(bam, quiet=quiet):
117 if last and read.qname != last:
118 yield reads
119 reads = []
120 last = read.qname
121 reads.append(read)
122
123 if reads:
124 yield reads
125
126
127 bam_cigar = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X']
128 bam_cigar_op = {
129 'M': 0,
130 'I': 1,
131 'D': 2,
132 'N': 3,
133 'S': 4,
134 'H': 5,
135 'P': 6,
136 '=': 7,
137 'X': 8,
138 }
139
140
141 def cigar_fromstr(s):
142 '''
143 >>> cigar_fromstr('10M5I20M')
144 [(0, 10), (1, 5), (0, 20)]
145 >>> cigar_fromstr('1M1I1D1N1S1H1P')
146 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]
147 '''
148 ret = []
149 spl = re.split('([0-9]+)', s)[1:]
150 for op, size in zip(spl[1::2], spl[::2]):
151 ret.append((bam_cigar_op[op], int(size)))
152 return ret
153
154
155 def cigar_tostr(cigar):
156 '''
157 >>> cigar_tostr(((0, 10), (1, 5), (0, 20)))
158 '10M5I20M'
159 >>> cigar_tostr(((0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)))
160 '1M1I1D1N1S1H1P'
161 '''
162
163 s = ''
164
165 for op, size in cigar:
166 s += '%s%s' % (size, bam_cigar[op])
167
168 return s
169
170
171 def cigar_read_len(cigar):
172 '''
173 >>> cigar_read_len(cigar_fromstr('8M'))
174 8
175 >>> cigar_read_len(cigar_fromstr('8M100N8M'))
176 16
177 >>> cigar_read_len(cigar_fromstr('8M10I8M'))
178 26
179 >>> cigar_read_len(cigar_fromstr('8M10D8M'))
180 16
181 '''
182
183 read_pos = 0
184
185 for op, length in cigar:
186 if op == 0: # M
187 read_pos += length
188 elif op == 1: # I
189 read_pos += length
190 elif op == 2: # D
191 pass
192 elif op == 3: # N
193 pass
194 elif op == 4: # S
195 read_pos += length
196 elif op == 5: # H
197 pass
198 elif op == 7: # =
199 read_pos += length
200 elif op == 8: # X
201 read_pos += length
202 else:
203 raise ValueError("Unsupported CIGAR operation: %s" % op)
204
205 return read_pos
206
207
208 def read_calc_mismatches(read):
209 inserts = 0
210 deletions = 0
211 indels = 0
212 edits = int(read.opt('NM'))
213 #
214 # NM counts the length of indels
215 # We really just care about *if* there is an indel, not the size
216 #
217
218 for op, length in read.cigar:
219 if op == 1:
220 inserts += length
221 indels += 1
222 elif op == 2:
223 deletions += length
224 indels += 1
225
226 return edits - inserts - deletions + indels
227
228
229 def _extract_md_matches(md, maxlength):
230 md_pos = 0
231
232 while md and md_pos < maxlength:
233 tmp = '0' # preload a zero so that immediate mismatches will be caught
234 # the zero will have no affect otherwise...
235
236 # look for matches
237 while md and md[0] in '0123456789':
238 tmp += md[0]
239 md = md[1:]
240
241 pos = int(tmp)
242 if pos > maxlength:
243 return (maxlength, '%s%s' % (pos - maxlength, md))
244 return (pos, md)
245
246
247 def read_calc_variations(read):
248 'see _read_calc_variations'
249 for tup in _read_calc_variations(read.pos, read.cigar, read.opt('MD'), read.seq):
250 yield tup
251
252
253 def _read_calc_variations(start_pos, cigar, md, seq):
254 '''
255 For each variation, outputs a tuple: (op, pos, seq)
256
257 op - operation (0 = mismatch, 1 = insert, 2 = deletion) (like CIGAR)
258 pos - 0-based position of the variation (relative to reference)
259 seq - the base (or bases) involved in the variation
260 for mismatch or insert, this is the sequence inserted
261 for deletions, this is the reference sequence that was removed
262
263 MD is the mismatch string. Not all aligners include the tag. If your aligner
264 doesn't include this, then you'll need to add it, or use a different function
265 (see: read_calc_mismatches_gen).
266
267 Special care must be used to handle RNAseq reads that cross
268 an exon-exon junction.
269
270 Also: MD is a *really* dumb format that can't be read correctly with
271 a regex. It must be processed in concert with the CIGAR alignment
272 in order to catch all edge cases. Some implementations insert 0's
273 at the end of inserts / deltions / variations to make parsing easier
274 but not everyone follows this. Look at the complex examples: the
275 CIGAR alignment may show an insert, but the MD just shows all matches.
276
277 Examples: See: http://davetang.org/muse/2011/01/28/perl-and-sam/
278 Also from CCBB actual mappings and manual altered (shortened,
279 made more complex)
280 (doctests included)
281
282 Match/mismatch
283 CIGAR: 36M
284 MD:Z: 1A0C0C0C1T0C0T27
285 MD:Z: 1ACCC1TCT27 (alternative)
286 1 2
287 123456789012345678901234567890123456
288 ref: CGATACGGGGACATCCGGCCTGCTCCTTCTCACATG
289 XXXX XXX
290 read: CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG
291 MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
292 -ACCC-TCT---------------------------
293 >>> list(_read_calc_variations(1, [(0,36)], '1A0C0C0C1T0C0T27', 'CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG'))
294 [(0, 2, 'A'), (0, 3, 'C'), (0, 4, 'C'), (0, 5, 'C'), (0, 7, 'T'), (0, 8, 'C'), (0, 9, 'T')]
295
296 Insert
297 CIGAR: 6M1I29M
298 MD:Z: 0C1C0C1C0T0C27
299 C1CC1CTC27 (alt)
300 1 2
301 123456^789012345678901234567890123456
302 ref: CACCCC^TCTGACATCCGGCCTGCTCCTTCTCACAT
303 X XX X|XX
304 read: GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT
305 MMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
306 G-GA-GGGG---------------------------
307 >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], '0C1C0C1C0T0C27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT'))
308 [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')]
309 >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], 'C1CC1CTC27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT'))
310 [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')]
311
312
313 Deletion
314 CIGAR: 9M9D27M
315 MD:Z: 2G0A5^ATGATGTCA27
316 2GA5^ATGATGTCA27 (alt)
317 ref: AGGAATGGGATGATGTCAGGGGTTCCAGGTGGAGACGAGGACTCC
318 XX ^^^^^^^^^
319 read: AGTGATGGG^^^^^^^^^GGGGTTCCAGGTGGAGACGAGGACTCC
320 MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM
321 --TG-----ATGATGTCA---------------------------
322 >>> list(_read_calc_variations(1, [(0,9), (2,9), (0, 27)], '2G0A5^ATGATGTCA27', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC'))
323 [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA')]
324
325
326 Complex
327 CIGAR: 9M9D11M1I15M
328 MD:Z: 2G0A5^ATGATGTCAA26
329 MD:Z: 2G0A5^ATGATGTCA0G26 (alt)
330 1 2 3 4
331 pos: 123456789012345678901234567890123456789012345
332 ref: AGGAATGGGATGATGTCAGGGGTTCCAGG^GGAGACGAGGACTCC
333 XX ^^^^^^^^^X |
334 read: AGTGATGGG^^^^^^^^^AGGGTTCCAGGTGGAGACGAGGACTCC
335 MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM
336 --TG-----ATGATGTCAG----------T---------------
337 >>> list(_read_calc_variations(1, [(0,9), (2,9), (0,11), (1,1), (0,15)], '2G0A5^ATGATGTCAA26', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC'))
338 [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA'), (0, 19, 'G'), (1, 30, 'T')]
339
340
341 Complex example - inserts aren't separately handled by MD, only visible in CIGAR
342 CIGAR: 14M2D16M3I42M
343 MD:Z: 14^TC58
344 1 2 3 4 5 6 7
345 pos: 12345678901234567890123456789012^^^345678901234567890123456789012345678901234567
346 ref: caagtatcaccatgtcaggcatttttttcatt^^^tttgtagagagagaagacttgctatgttgcccaagctggcct
347 ^^ |||
348 read: CAAGTATCACCATG^^AGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT
349 MMMMMMMMMMMMMMDDMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
350 --------------tc----------------TGG------------------------------------------
351 >>> list(_read_calc_variations(1, [(0,14), (2,2), (0,16), (1,3), (0,42)], '14^TC58', 'CAAGTATCACCATGAGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT'))
352 [(2, 15, 'TC'), (1, 33, 'TGG')]
353
354
355 Complex example 2:
356 CIGAR: 41M3I10M1I5M1I2M2I10M
357 MD:Z: 44C2C6T6T6
358 1 2 3 4 5 6
359 pos: 12345678901234567890123456789012345678901^^^2345678901^23456^78^^9012345678
360 ref: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG
361 ||| X X | X | || X
362 read: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG
363 MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM
364 -----------------------------------------tta---A--T---c---A----gt---G------
365
366
367 13M 28M 3I 10M 1I 5M 1I 2M 2I 10M
368 >>> list(_read_calc_variations(1, [(0, 41), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG'))
369 [(1, 42, 'TTA'), (0, 45, 'A'), (0, 48, 'T'), (1, 52, 'C'), (0, 55, 'A'), (1, 57, 'G'), (1, 59, 'GT'), (0, 62, 'G')]
370
371
372 Splice junction example:
373 CIGAR: 62M100N13M
374 MD:Z: 2T27C44
375 1 1
376 1 2 3 4 5 6 6 7
377 pos: 12345678901234567890123456789012345678901234567890123456789012| [100] |3456789012345
378 ref: CCTCATGACCAGCTTGTTGAAGAGATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC
379 X X
380 read: CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC
381 --G---------------------------T-----------------------------------------------------
382
383 >>> list(_read_calc_variations(1, [(0,62), (4,100), (0,13)], '2T27C44', 'CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC'))
384 [(0, 3, 'G'), (0, 31, 'T')]
385
386
387 Splice junction example 2:
388 CIGAR: 13M100N28M3I10M1I5M1I2M2I10M
389 MD:Z: 44C2C6T6T6
390 1 1 1 1 1
391 1 2 3 4 5 6
392 pos: 1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678
393 ref: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG
394 ||| X X | X | || X
395 read: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG
396 MMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM
397 ------------- ----------------------------tta---A--T---c---A----gt---G------
398
399 13M 100N 28M 3I 10M 1I 5M 1I 2M 2I 10M
400 >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 28), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG'))
401 [(1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')]
402
403
404 Splice junction example 2A:
405 CIGAR: 13M100N7M2D19M3I10M1I5M1I2M2I10M
406 MD:Z: 9A10^GG22C2C6T6T6
407 1 1 1 1 1
408 1 2 3 4 5 6
409 pos: 1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678
410 ref: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG
411 ^^ ||| X X | X | || X
412 read: AGGGTGGCGCGAT|-------|CGATGAC^^CATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG
413 MMMMMMMMMMMMM MMMMMMMDDMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM
414 ---------C--- ----------------------------tta---A--T---c---A----gt---G------
415 .........A... .......GG................... ...C..C... ...T. .. ...T......
416 9 A 10 ^GG 22 C 2C 6 T 6 T 6
417
418 >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 7), (2, 2), (0, 19), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '9A10^GG22C2C6T6T6', 'AGGGTGGCGCGATCGATGACCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG'))
419 [(0, 10, 'C'), (2, 121, 'GG'), (1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')]
420
421 Real Example
422 242_1071_1799_B1
423 CIGAR: 42M10I3M1D9M1D11M
424 MD:Z: 27G16A0^T6C2^T1C9
425 1 2 3 4 5 6 7
426 pos: 123456789012345678901234567890123456789012 345678901234567890123456789012345
427 ref: ACTGAGAAACCCAACCCTCTGAGACCAGCACACCCCTTTCAA^^^^^^^^^^GCATGTTCCTCCCTCCCCTTCTTTG
428 X X^ X ^ X
429 read: ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCC^GTTCCTGCC^CGCCTTCTTTG
430 MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIMMMDMMMMMMMMMDMMMMMMMMMMM
431 ---------------------------A--------------^^^^^^^^^^--CT------G--T-G---------
432
433 >>> list(_read_calc_variations(1, [(0,42), (1,10), (0, 3), (2, 1), (0, 9), (2, 1), (0, 11)], '27G16A0^T6C2^T1C9', 'ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCCGTTCCTGCCCGCCTTCTTTG', ))
434 [(0, 28, 'A'), (1, 43, 'CACATTTTTG'), (0, 45, 'C'), (2, 46, 'T'), (0, 53, 'G'), (2, 56, 'T'), (0, 58, 'G')]
435
436
437 Real example 2
438 577_1692_891_A1
439 CIGAR: 34M100N39M2I
440 MD:Z: 3T69
441 1 1 1 1
442 1 2 3 4 5 6 7
443 pos: 1234567890123456789012345678901234| [100] |567890123456789012345678901234567890123
444 ref: GGATTCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTC^^
445
446 read: GGAATCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC
447 MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM NNNNN MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMII
448 ---A------------------------------ ---------------------------------------TC
449
450 >>> list(_read_calc_variations(1, [(0,34), (3,100), (0, 39), (1, 2)], '3T69', 'GGAATCTTCCCACTGGGTCGATGTTGTTTGTGATCTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC', ))
451 [(0, 4, 'A'), (1, 174, 'TC')]
452
453 '''
454
455 ref_pos = start_pos
456 read_pos = 0
457
458 for op, length in cigar:
459 if md and md[0] == '0':
460 md = md[1:]
461 # sys.stderr.write('%s, %s, %s\n' %(op, length, md))
462 if op == 0: # M
463 # how far in the chunk are we? (do *not* update ref_pos until end)
464 md_pos = 0
465 last = None
466 while md and md_pos < length:
467 if last == (op, length, md):
468 sys.stderr.write('\nInfinite loop in variant finding!\nPos: %s\nCIGAR: (%s, %s)\n' % (ref_pos, op, length))
469 sys.exit(1)
470 last = (op, length, md)
471 # sys.stderr.write('%s, %s, %s\n' %(op, length, md))
472 chunk_size, md = _extract_md_matches(md, length - md_pos)
473 # sys.stderr.write(' -> %s, %s\n' %(chunk_size, md))
474 md_pos += chunk_size
475
476 # look for mismatches
477 while md_pos < length and md and md[0] not in '0123456789^':
478 yield (op, ref_pos + md_pos, seq[read_pos + md_pos])
479 md = md[1:]
480
481 md_pos += 1
482
483 ref_pos += length
484 read_pos += length
485
486 elif op == 1: # I
487 # nothing in MD about inserts...
488 yield (op, ref_pos, seq[read_pos:read_pos + length])
489 read_pos += length
490
491 elif op == 2: # D
492 # prefixed with '^' and includes all of the removed bases
493 if md[0] == '^':
494 md = md[1:]
495 yield (op, ref_pos, md[:length])
496 md = md[length:]
497 ref_pos += length
498
499 elif op == 3: # N
500 ref_pos += length
501
502
503 def read_calc_mismatches_gen(ref, read, chrom):
504 start = read.pos
505 ref_pos = 0
506 read_pos = 0
507
508 for op, length in read.cigar:
509 if op == 1:
510 yield ref_pos, op, None
511 read_pos += length
512 elif op == 2:
513 yield ref_pos, op, None
514 ref_pos += length
515 elif op == 3:
516 ref_pos += length
517 elif op == 0:
518 refseq = ref.fetch(chrom, start + ref_pos, start + ref_pos + length)
519 if not refseq:
520 raise ValueError("Reference '%s' not found in FASTA file: %s" % (chrom, ref.filename))
521 cur_pos = start + ref_pos
522 for refbase, readbase in zip(refseq.upper(), read.seq[read_pos:read_pos + length].upper()):
523 if refbase != readbase:
524 yield op, cur_pos, readbase
525 cur_pos += 1
526 ref_pos += length
527 read_pos += length
528 else:
529 raise ValueError("Unsupported CIGAR operation: %s" % op)
530
531
532 def read_calc_mismatches_ref(ref, read, chrom):
533 edits = 0
534
535 for op, pos, base in read_calc_mismatches_gen(ref, read, chrom):
536 edits += 1
537
538 return edits
539
540
541 __region_cache = {}
542
543
544 def region_pos_to_genomic_pos(name, start, cigar):
545 '''
546 converts a junction position to a genomic location given a junction
547 ref name, the junction position, and the cigar alignment.
548
549 returns: (genomic ref, genomic pos, genomic cigar)
550
551 >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050,3000-4000', 25, [(0, 100)])
552 ('chr1', 1025, [(0, 25), (3, 950), (0, 50), (3, 950), (0, 25)])
553
554 >>> region_pos_to_genomic_pos('chr1:1000-1050,1050-1200', 25, [(0, 100)])
555 ('chr1', 1025, [(0, 25), (3, 0), (0, 75)])
556
557 >>> region_pos_to_genomic_pos('chr3R:17630851-17630897,17634338-17634384', 17, [(0, 39)])
558 ('chr3R', 17630868, [(0, 29), (3, 3441), (0, 10)])
559
560 >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(4, 25), (0, 50), (4, 25)])
561 ('chr1', 1025, [(4, 25), (0, 25), (3, 950), (0, 25), (4, 25)])
562
563 >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)])
564 ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)])
565
566 >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)])
567 ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)])
568
569 >>> region_pos_to_genomic_pos('chr7:16829153-16829246,16829246-16829339', 62, cigar_fromstr('83M18S'))
570 ('chr7', 16829215, [(0, 31), (3, 0), (0, 52), (4, 18)])
571
572
573 '''
574
575 if name in __region_cache:
576 chrom, fragments = __region_cache[name]
577 else:
578 c1 = name.split(':')
579 chrom = c1[0]
580
581 fragments = []
582 for fragment in c1[1].split(','):
583 s, e = fragment.split('-')
584 fragments.append((int(s), int(e)))
585
586 __region_cache[name] = (chrom, fragments)
587
588 chr_cigar = []
589 chr_start = fragments[0][0]
590
591 read_start = int(start)
592
593 frag_idx = 0
594 frag_start = 0
595 frag_end = 0
596
597 for i, (s, e) in enumerate(fragments):
598 if chr_start + read_start < e:
599 chr_start += read_start
600 frag_idx = i
601 frag_start = s
602 frag_end = e
603 break
604
605 else:
606 chr_start += (e - s)
607 read_start -= (e - s)
608
609 cur_pos = chr_start
610
611 for op, length in cigar:
612 if op in [1, 4, 5]:
613 chr_cigar.append((op, length))
614
615 elif op in [0, 2]:
616 if cur_pos + length <= frag_end:
617 cur_pos += length
618 chr_cigar.append((op, length))
619
620 else:
621 while cur_pos + length > frag_end:
622 if frag_end - cur_pos > 0:
623 chr_cigar.append((op, frag_end - cur_pos))
624 length -= (frag_end - cur_pos)
625 cur_pos = frag_end
626 frag_idx += 1
627 if len(fragments) <= frag_idx:
628 print 'ERROR converting: ', name, fragments
629 return (chrom, 0, chr_cigar)
630 frag_start, frag_end = fragments[frag_idx]
631 chr_cigar.append((3, frag_start - cur_pos))
632 cur_pos = frag_start
633
634 cur_pos = cur_pos + length
635 chr_cigar.append((op, length))
636 else:
637 print "Unsupported CIGAR operation (%s)" % bam_cigar[op]
638 sys.exit(1)
639
640 return (chrom, chr_start, chr_cigar)
641
642
643 def is_junction_valid(cigar, min_overlap=4):
644 '''
645 Does the genomic cigar alignment represent a 'good' alignment.
646 Used for checking junction->genome alignments
647
648 1) the alignment must not start at a splice junction
649 2) the alignment must not start or end with an overhang
650 3) the alignment must overhang the splice junction by min_overlap (4)
651
652 | Exon1 | Intron | Exon2 |
653 |-----------------|oooooooooooooooo|------------------|
654 XXXXXXXXXXXXXXXXXXXXXXXX (bad 1)
655 XXXXXXXXXXXXX (bad 2) XXXXXXXXXXXXXXXX (bad 2)
656 XX-----------------XXXXXXXXXXXXXXXXX (bad 3)
657
658 >>> is_junction_valid(cigar_fromstr('1000N40M'))
659 (False, 'Starts at gap (1000N40M)')
660
661 >>> is_junction_valid(cigar_fromstr('100M'))
662 (False, "Doesn't cover junction")
663
664 >>> is_junction_valid(cigar_fromstr('100M1000N3M'), 4)
665 (False, "Too short overlap at 3' (100M1000N3M)")
666
667 >>> is_junction_valid(cigar_fromstr('2M1000N100M'), 4)
668 (False, "Too short overlap at 5' (2M1000N100M)")
669
670 >>> is_junction_valid(cigar_fromstr('4M1000N100M'), 4)
671 (True, '')
672
673 >>> is_junction_valid(cigar_fromstr('100M1000N4M'), 4)
674 (True, '')
675
676 >>> is_junction_valid(cigar_fromstr('4M0N100M'), 4)
677 (True, '')
678
679 '''
680 first = True
681 pre_gap = True
682
683 pre_gap_count = 0
684 post_gap_count = 0
685
686 has_gap = False
687
688 for op, length in cigar:
689 # mapping can't start at a gap
690 if first and op == 3:
691 return (False, 'Starts at gap (%s)' % cigar_tostr(cigar))
692 first = False
693
694 if op == 3:
695 pre_gap = False
696 post_gap_count = 0
697 has_gap = True
698
699 elif pre_gap:
700 pre_gap_count += length
701 else:
702 post_gap_count += length
703
704 # mapping must start with more than min_overlap base match
705
706 if not has_gap:
707 return (False, "Doesn't cover junction")
708 elif pre_gap_count < min_overlap:
709 return (False, "Too short overlap at 5' (%s)" % cigar_tostr(cigar))
710 elif post_gap_count < min_overlap:
711 return (False, "Too short overlap at 3' (%s)" % cigar_tostr(cigar))
712
713 return True, ''
714
715
716 def read_alignment_fragments_gen(read):
717 '''
718 Takes a read and returns the start and end positions for each match/mismatch region.
719 This will let us know where each read alignment "touches" the genome.
720 '''
721
722 for start, end in _read_alignment_fragments_gen(read.pos, read.cigar):
723 yield (start, end)
724
725
726 def _read_alignment_fragments_gen(pos, cigar):
727 ''' Test-able version of read_alignment_fragments_gen
728 >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('50M')))
729 [(1, 51)]
730
731 >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('25M100N25M')))
732 [(1, 26), (126, 151)]
733
734 >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('20M1D4M100N10M5I10M')))
735 [(1, 21), (22, 26), (126, 136), (136, 146)]
736 '''
737 ref_pos = pos
738
739 for op, length in cigar:
740 if op == 0:
741 yield (ref_pos, ref_pos + length)
742 ref_pos += length
743 elif op == 1:
744 pass
745 elif op == 2:
746 ref_pos += length
747 elif op == 3:
748 ref_pos += length
749 else:
750 raise ValueError("Unsupported CIGAR operation: %s" % op)
751
752
753 def read_cigar_at_pos(cigar, qpos, is_del):
754 '''
755 Returns the CIGAR operation for a given read position
756
757 qpos is the 0-based index of the base in the read
758 '''
759 pos = 0
760 returnnext = False
761 for op, length in cigar:
762 if returnnext:
763 return op
764 if op == 0:
765 pos += length
766 elif op == 1:
767 pos += length
768 elif op == 2:
769 pass
770 elif op == 3:
771 pass
772 elif op == 4:
773 pos += length
774 elif op == 5:
775 pass
776 else:
777 raise ValueError("Unsupported CIGAR operation: %s" % op)
778
779 if pos > qpos:
780 return op
781 if is_del and pos == qpos:
782 returnnext = True
783
784 return None
785
786
787 def cleancigar(cigar):
788 '''
789 Cleans a CIGAR alignment to remove zero length operations
790
791 >>> cigar_tostr(cleancigar(cigar_fromstr('31M0N52M18S')))
792 '83M18S'
793
794 '''
795 newcigar = []
796
797 changed = False
798 zero = False
799
800 for op, size in cigar:
801 if size > 0:
802 if zero and newcigar:
803 if newcigar[-1][0] == op:
804 newsize = newcigar[-1][1] + size
805 newcigar[-1] = (op, newsize)
806 zero = 0
807 else:
808 newcigar.append((op, size))
809 else:
810 changed = True
811 zero = True
812
813 if changed:
814 return newcigar
815
816 return None
817
818
819 def read_cleancigar(read):
820 '''
821 Replaces the CIGAR string for a read to remove any operations that are zero length.
822
823 This happens to the read in-place
824 '''
825 if read.is_unmapped:
826 return False
827
828 newcigar = []
829
830 newcigar = cleancigar(read.cigar)
831
832 if newcigar:
833 read.cigar = newcigar
834 return True
835
836 return False
837
838
839 def read_to_unmapped(read, ref=None):
840 '''
841 Converts a read from mapped to unmapped.
842
843 Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is passed)
844 '''
845
846 newread = pysam.AlignedRead()
847
848 if ref:
849 tags = [('ZR', '%s:%s:%s' % (ref, read.pos, cigar_tostr(read.cigar)))]
850
851 newread.is_unmapped = True
852 newread.mapq = 0
853 newread.tlen = 0
854 newread.pos = -1
855 newread.pnext = -1
856 newread.rnext = -1
857 newread.tid = -1
858
859 newread.qname = read.qname
860
861 if read.is_paired:
862 newread.is_paired = True
863
864 if not read.is_unmapped and read.is_reverse:
865 newread.seq = ngsutils.support.revcomp(read.seq)
866 newread.qual = read.qual[::-1]
867 else:
868 newread.seq = read.seq
869 newread.qual = read.qual
870
871 newread.tags = tags
872
873 return newread
874
875
876
877 if __name__ == '__main__':
878 import doctest
879 doctest.testmod()