Mercurial > repos > iuc > ngsutils_bam_filter
comparison ngsutils/bam/__init__.py @ 0:4e4e4093d65d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 09194687c74a424732f8b0c017cbb942aad89068
author | iuc |
---|---|
date | Wed, 11 Nov 2015 13:04:07 -0500 |
parents | |
children | 7a68005de299 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4e4e4093d65d |
---|---|
1 import sys | |
2 import os | |
3 import re | |
4 import pysam | |
5 try: | |
6 from eta import ETA | |
7 except: | |
8 pass | |
9 import ngsutils.support | |
10 | |
11 | |
12 def bam_open(fname, mode='r', *args, **kwargs): | |
13 if fname.lower()[-4:] == '.bam': | |
14 return pysam.Samfile(fname, '%sb' % mode, *args, **kwargs) | |
15 return pysam.Samfile(fname, '%s' % mode, *args, **kwargs) | |
16 | |
17 | |
18 def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None): | |
19 if not quiet and bam.filename: | |
20 eta = ETA(os.stat(bam.filename).st_size) | |
21 else: | |
22 eta = None | |
23 | |
24 for pileup in bam.pileup(mask=mask): | |
25 pos = bam.tell() | |
26 bgz_offset = pos >> 16 | |
27 | |
28 if not quiet: | |
29 if callback: | |
30 eta.print_status(bgz_offset, extra=callback(pileup)) | |
31 else: | |
32 eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos)) | |
33 | |
34 yield pileup | |
35 | |
36 if eta: | |
37 eta.done() | |
38 | |
39 | |
40 def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None): | |
41 ''' | |
42 >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)] | |
43 ['A', 'B', 'E', 'C', 'D', 'F', 'Z'] | |
44 ''' | |
45 | |
46 if os.path.exists('%s.bai' % bam.filename): | |
47 # This is an indexed file, so it is ref sorted... | |
48 # Meaning that we should show chrom:pos, instead of read names | |
49 show_ref_pos = True | |
50 | |
51 eta = None | |
52 | |
53 if not ref: | |
54 if not quiet and bam.filename: | |
55 eta = ETA(os.stat(bam.filename).st_size) | |
56 | |
57 for read in bam: | |
58 pos = bam.tell() | |
59 bgz_offset = pos >> 16 | |
60 | |
61 if not quiet and eta: | |
62 if callback: | |
63 eta.print_status(bgz_offset, extra=callback(read)) | |
64 elif (show_ref_pos): | |
65 if read.tid > -1: | |
66 eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname)) | |
67 else: | |
68 eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname)) | |
69 else: | |
70 eta.print_status(bgz_offset, extra='%s' % read.qname) | |
71 | |
72 yield read | |
73 | |
74 else: | |
75 working_chrom = None | |
76 if ref in bam.references: | |
77 working_chrom = ref | |
78 elif ref[0:3] == 'chr': | |
79 # compensate for Ensembl vs UCSC ref naming | |
80 if ref[3:] in bam.references: | |
81 working_chrom = ref[3:] | |
82 | |
83 if not working_chrom: | |
84 raise ValueError('Missing reference: %s' % ref) | |
85 | |
86 tid = bam.gettid(working_chrom) | |
87 | |
88 if not start: | |
89 start = 0 | |
90 if not end: | |
91 end = bam.lengths[tid] | |
92 | |
93 if not quiet and bam.filename: | |
94 eta = ETA(end - start) | |
95 | |
96 for read in bam.fetch(working_chrom, start, end): | |
97 if not quiet and eta: | |
98 if callback: | |
99 eta.print_status(read.pos - start, extra=callback(read)) | |
100 else: | |
101 eta.print_status(read.pos - start, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname)) | |
102 | |
103 yield read | |
104 | |
105 if eta: | |
106 eta.done() | |
107 | |
108 | |
109 def bam_batch_reads(bam, quiet=False): | |
110 ''' | |
111 Batch mapping for the same reads (qname) together, this way | |
112 they can all be compared/converted together. | |
113 ''' | |
114 reads = [] | |
115 last = None | |
116 for read in bam_iter(bam, quiet=quiet): | |
117 if last and read.qname != last: | |
118 yield reads | |
119 reads = [] | |
120 last = read.qname | |
121 reads.append(read) | |
122 | |
123 if reads: | |
124 yield reads | |
125 | |
126 | |
127 bam_cigar = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X'] | |
128 bam_cigar_op = { | |
129 'M': 0, | |
130 'I': 1, | |
131 'D': 2, | |
132 'N': 3, | |
133 'S': 4, | |
134 'H': 5, | |
135 'P': 6, | |
136 '=': 7, | |
137 'X': 8, | |
138 } | |
139 | |
140 | |
141 def cigar_fromstr(s): | |
142 ''' | |
143 >>> cigar_fromstr('10M5I20M') | |
144 [(0, 10), (1, 5), (0, 20)] | |
145 >>> cigar_fromstr('1M1I1D1N1S1H1P') | |
146 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)] | |
147 ''' | |
148 ret = [] | |
149 spl = re.split('([0-9]+)', s)[1:] | |
150 for op, size in zip(spl[1::2], spl[::2]): | |
151 ret.append((bam_cigar_op[op], int(size))) | |
152 return ret | |
153 | |
154 | |
155 def cigar_tostr(cigar): | |
156 ''' | |
157 >>> cigar_tostr(((0, 10), (1, 5), (0, 20))) | |
158 '10M5I20M' | |
159 >>> cigar_tostr(((0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1))) | |
160 '1M1I1D1N1S1H1P' | |
161 ''' | |
162 | |
163 s = '' | |
164 | |
165 for op, size in cigar: | |
166 s += '%s%s' % (size, bam_cigar[op]) | |
167 | |
168 return s | |
169 | |
170 | |
171 def cigar_read_len(cigar): | |
172 ''' | |
173 >>> cigar_read_len(cigar_fromstr('8M')) | |
174 8 | |
175 >>> cigar_read_len(cigar_fromstr('8M100N8M')) | |
176 16 | |
177 >>> cigar_read_len(cigar_fromstr('8M10I8M')) | |
178 26 | |
179 >>> cigar_read_len(cigar_fromstr('8M10D8M')) | |
180 16 | |
181 ''' | |
182 | |
183 read_pos = 0 | |
184 | |
185 for op, length in cigar: | |
186 if op == 0: # M | |
187 read_pos += length | |
188 elif op == 1: # I | |
189 read_pos += length | |
190 elif op == 2: # D | |
191 pass | |
192 elif op == 3: # N | |
193 pass | |
194 elif op == 4: # S | |
195 read_pos += length | |
196 elif op == 5: # H | |
197 pass | |
198 elif op == 7: # = | |
199 read_pos += length | |
200 elif op == 8: # X | |
201 read_pos += length | |
202 else: | |
203 raise ValueError("Unsupported CIGAR operation: %s" % op) | |
204 | |
205 return read_pos | |
206 | |
207 | |
208 def read_calc_mismatches(read): | |
209 inserts = 0 | |
210 deletions = 0 | |
211 indels = 0 | |
212 edits = int(read.opt('NM')) | |
213 # | |
214 # NM counts the length of indels | |
215 # We really just care about *if* there is an indel, not the size | |
216 # | |
217 | |
218 for op, length in read.cigar: | |
219 if op == 1: | |
220 inserts += length | |
221 indels += 1 | |
222 elif op == 2: | |
223 deletions += length | |
224 indels += 1 | |
225 | |
226 return edits - inserts - deletions + indels | |
227 | |
228 | |
229 def _extract_md_matches(md, maxlength): | |
230 md_pos = 0 | |
231 | |
232 while md and md_pos < maxlength: | |
233 tmp = '0' # preload a zero so that immediate mismatches will be caught | |
234 # the zero will have no affect otherwise... | |
235 | |
236 # look for matches | |
237 while md and md[0] in '0123456789': | |
238 tmp += md[0] | |
239 md = md[1:] | |
240 | |
241 pos = int(tmp) | |
242 if pos > maxlength: | |
243 return (maxlength, '%s%s' % (pos - maxlength, md)) | |
244 return (pos, md) | |
245 | |
246 | |
247 def read_calc_variations(read): | |
248 'see _read_calc_variations' | |
249 for tup in _read_calc_variations(read.pos, read.cigar, read.opt('MD'), read.seq): | |
250 yield tup | |
251 | |
252 | |
253 def _read_calc_variations(start_pos, cigar, md, seq): | |
254 ''' | |
255 For each variation, outputs a tuple: (op, pos, seq) | |
256 | |
257 op - operation (0 = mismatch, 1 = insert, 2 = deletion) (like CIGAR) | |
258 pos - 0-based position of the variation (relative to reference) | |
259 seq - the base (or bases) involved in the variation | |
260 for mismatch or insert, this is the sequence inserted | |
261 for deletions, this is the reference sequence that was removed | |
262 | |
263 MD is the mismatch string. Not all aligners include the tag. If your aligner | |
264 doesn't include this, then you'll need to add it, or use a different function | |
265 (see: read_calc_mismatches_gen). | |
266 | |
267 Special care must be used to handle RNAseq reads that cross | |
268 an exon-exon junction. | |
269 | |
270 Also: MD is a *really* dumb format that can't be read correctly with | |
271 a regex. It must be processed in concert with the CIGAR alignment | |
272 in order to catch all edge cases. Some implementations insert 0's | |
273 at the end of inserts / deltions / variations to make parsing easier | |
274 but not everyone follows this. Look at the complex examples: the | |
275 CIGAR alignment may show an insert, but the MD just shows all matches. | |
276 | |
277 Examples: See: http://davetang.org/muse/2011/01/28/perl-and-sam/ | |
278 Also from CCBB actual mappings and manual altered (shortened, | |
279 made more complex) | |
280 (doctests included) | |
281 | |
282 Match/mismatch | |
283 CIGAR: 36M | |
284 MD:Z: 1A0C0C0C1T0C0T27 | |
285 MD:Z: 1ACCC1TCT27 (alternative) | |
286 1 2 | |
287 123456789012345678901234567890123456 | |
288 ref: CGATACGGGGACATCCGGCCTGCTCCTTCTCACATG | |
289 XXXX XXX | |
290 read: CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG | |
291 MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM | |
292 -ACCC-TCT--------------------------- | |
293 >>> list(_read_calc_variations(1, [(0,36)], '1A0C0C0C1T0C0T27', 'CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG')) | |
294 [(0, 2, 'A'), (0, 3, 'C'), (0, 4, 'C'), (0, 5, 'C'), (0, 7, 'T'), (0, 8, 'C'), (0, 9, 'T')] | |
295 | |
296 Insert | |
297 CIGAR: 6M1I29M | |
298 MD:Z: 0C1C0C1C0T0C27 | |
299 C1CC1CTC27 (alt) | |
300 1 2 | |
301 123456^789012345678901234567890123456 | |
302 ref: CACCCC^TCTGACATCCGGCCTGCTCCTTCTCACAT | |
303 X XX X|XX | |
304 read: GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT | |
305 MMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMM | |
306 G-GA-GGGG--------------------------- | |
307 >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], '0C1C0C1C0T0C27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT')) | |
308 [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')] | |
309 >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], 'C1CC1CTC27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT')) | |
310 [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')] | |
311 | |
312 | |
313 Deletion | |
314 CIGAR: 9M9D27M | |
315 MD:Z: 2G0A5^ATGATGTCA27 | |
316 2GA5^ATGATGTCA27 (alt) | |
317 ref: AGGAATGGGATGATGTCAGGGGTTCCAGGTGGAGACGAGGACTCC | |
318 XX ^^^^^^^^^ | |
319 read: AGTGATGGG^^^^^^^^^GGGGTTCCAGGTGGAGACGAGGACTCC | |
320 MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM | |
321 --TG-----ATGATGTCA--------------------------- | |
322 >>> list(_read_calc_variations(1, [(0,9), (2,9), (0, 27)], '2G0A5^ATGATGTCA27', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC')) | |
323 [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA')] | |
324 | |
325 | |
326 Complex | |
327 CIGAR: 9M9D11M1I15M | |
328 MD:Z: 2G0A5^ATGATGTCAA26 | |
329 MD:Z: 2G0A5^ATGATGTCA0G26 (alt) | |
330 1 2 3 4 | |
331 pos: 123456789012345678901234567890123456789012345 | |
332 ref: AGGAATGGGATGATGTCAGGGGTTCCAGG^GGAGACGAGGACTCC | |
333 XX ^^^^^^^^^X | | |
334 read: AGTGATGGG^^^^^^^^^AGGGTTCCAGGTGGAGACGAGGACTCC | |
335 MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM | |
336 --TG-----ATGATGTCAG----------T--------------- | |
337 >>> list(_read_calc_variations(1, [(0,9), (2,9), (0,11), (1,1), (0,15)], '2G0A5^ATGATGTCAA26', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC')) | |
338 [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA'), (0, 19, 'G'), (1, 30, 'T')] | |
339 | |
340 | |
341 Complex example - inserts aren't separately handled by MD, only visible in CIGAR | |
342 CIGAR: 14M2D16M3I42M | |
343 MD:Z: 14^TC58 | |
344 1 2 3 4 5 6 7 | |
345 pos: 12345678901234567890123456789012^^^345678901234567890123456789012345678901234567 | |
346 ref: caagtatcaccatgtcaggcatttttttcatt^^^tttgtagagagagaagacttgctatgttgcccaagctggcct | |
347 ^^ ||| | |
348 read: CAAGTATCACCATG^^AGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT | |
349 MMMMMMMMMMMMMMDDMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM | |
350 --------------tc----------------TGG------------------------------------------ | |
351 >>> list(_read_calc_variations(1, [(0,14), (2,2), (0,16), (1,3), (0,42)], '14^TC58', 'CAAGTATCACCATGAGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT')) | |
352 [(2, 15, 'TC'), (1, 33, 'TGG')] | |
353 | |
354 | |
355 Complex example 2: | |
356 CIGAR: 41M3I10M1I5M1I2M2I10M | |
357 MD:Z: 44C2C6T6T6 | |
358 1 2 3 4 5 6 | |
359 pos: 12345678901234567890123456789012345678901^^^2345678901^23456^78^^9012345678 | |
360 ref: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG | |
361 ||| X X | X | || X | |
362 read: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG | |
363 MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM | |
364 -----------------------------------------tta---A--T---c---A----gt---G------ | |
365 | |
366 | |
367 13M 28M 3I 10M 1I 5M 1I 2M 2I 10M | |
368 >>> list(_read_calc_variations(1, [(0, 41), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG')) | |
369 [(1, 42, 'TTA'), (0, 45, 'A'), (0, 48, 'T'), (1, 52, 'C'), (0, 55, 'A'), (1, 57, 'G'), (1, 59, 'GT'), (0, 62, 'G')] | |
370 | |
371 | |
372 Splice junction example: | |
373 CIGAR: 62M100N13M | |
374 MD:Z: 2T27C44 | |
375 1 1 | |
376 1 2 3 4 5 6 6 7 | |
377 pos: 12345678901234567890123456789012345678901234567890123456789012| [100] |3456789012345 | |
378 ref: CCTCATGACCAGCTTGTTGAAGAGATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC | |
379 X X | |
380 read: CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC | |
381 --G---------------------------T----------------------------------------------------- | |
382 | |
383 >>> list(_read_calc_variations(1, [(0,62), (4,100), (0,13)], '2T27C44', 'CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC')) | |
384 [(0, 3, 'G'), (0, 31, 'T')] | |
385 | |
386 | |
387 Splice junction example 2: | |
388 CIGAR: 13M100N28M3I10M1I5M1I2M2I10M | |
389 MD:Z: 44C2C6T6T6 | |
390 1 1 1 1 1 | |
391 1 2 3 4 5 6 | |
392 pos: 1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678 | |
393 ref: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG | |
394 ||| X X | X | || X | |
395 read: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG | |
396 MMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM | |
397 ------------- ----------------------------tta---A--T---c---A----gt---G------ | |
398 | |
399 13M 100N 28M 3I 10M 1I 5M 1I 2M 2I 10M | |
400 >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 28), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG')) | |
401 [(1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')] | |
402 | |
403 | |
404 Splice junction example 2A: | |
405 CIGAR: 13M100N7M2D19M3I10M1I5M1I2M2I10M | |
406 MD:Z: 9A10^GG22C2C6T6T6 | |
407 1 1 1 1 1 | |
408 1 2 3 4 5 6 | |
409 pos: 1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678 | |
410 ref: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG | |
411 ^^ ||| X X | X | || X | |
412 read: AGGGTGGCGCGAT|-------|CGATGAC^^CATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG | |
413 MMMMMMMMMMMMM MMMMMMMDDMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM | |
414 ---------C--- ----------------------------tta---A--T---c---A----gt---G------ | |
415 .........A... .......GG................... ...C..C... ...T. .. ...T...... | |
416 9 A 10 ^GG 22 C 2C 6 T 6 T 6 | |
417 | |
418 >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 7), (2, 2), (0, 19), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '9A10^GG22C2C6T6T6', 'AGGGTGGCGCGATCGATGACCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG')) | |
419 [(0, 10, 'C'), (2, 121, 'GG'), (1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')] | |
420 | |
421 Real Example | |
422 242_1071_1799_B1 | |
423 CIGAR: 42M10I3M1D9M1D11M | |
424 MD:Z: 27G16A0^T6C2^T1C9 | |
425 1 2 3 4 5 6 7 | |
426 pos: 123456789012345678901234567890123456789012 345678901234567890123456789012345 | |
427 ref: ACTGAGAAACCCAACCCTCTGAGACCAGCACACCCCTTTCAA^^^^^^^^^^GCATGTTCCTCCCTCCCCTTCTTTG | |
428 X X^ X ^ X | |
429 read: ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCC^GTTCCTGCC^CGCCTTCTTTG | |
430 MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIMMMDMMMMMMMMMDMMMMMMMMMMM | |
431 ---------------------------A--------------^^^^^^^^^^--CT------G--T-G--------- | |
432 | |
433 >>> list(_read_calc_variations(1, [(0,42), (1,10), (0, 3), (2, 1), (0, 9), (2, 1), (0, 11)], '27G16A0^T6C2^T1C9', 'ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCCGTTCCTGCCCGCCTTCTTTG', )) | |
434 [(0, 28, 'A'), (1, 43, 'CACATTTTTG'), (0, 45, 'C'), (2, 46, 'T'), (0, 53, 'G'), (2, 56, 'T'), (0, 58, 'G')] | |
435 | |
436 | |
437 Real example 2 | |
438 577_1692_891_A1 | |
439 CIGAR: 34M100N39M2I | |
440 MD:Z: 3T69 | |
441 1 1 1 1 | |
442 1 2 3 4 5 6 7 | |
443 pos: 1234567890123456789012345678901234| [100] |567890123456789012345678901234567890123 | |
444 ref: GGATTCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTC^^ | |
445 | |
446 read: GGAATCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC | |
447 MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM NNNNN MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMII | |
448 ---A------------------------------ ---------------------------------------TC | |
449 | |
450 >>> list(_read_calc_variations(1, [(0,34), (3,100), (0, 39), (1, 2)], '3T69', 'GGAATCTTCCCACTGGGTCGATGTTGTTTGTGATCTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC', )) | |
451 [(0, 4, 'A'), (1, 174, 'TC')] | |
452 | |
453 ''' | |
454 | |
455 ref_pos = start_pos | |
456 read_pos = 0 | |
457 | |
458 for op, length in cigar: | |
459 if md and md[0] == '0': | |
460 md = md[1:] | |
461 # sys.stderr.write('%s, %s, %s\n' %(op, length, md)) | |
462 if op == 0: # M | |
463 # how far in the chunk are we? (do *not* update ref_pos until end) | |
464 md_pos = 0 | |
465 last = None | |
466 while md and md_pos < length: | |
467 if last == (op, length, md): | |
468 sys.stderr.write('\nInfinite loop in variant finding!\nPos: %s\nCIGAR: (%s, %s)\n' % (ref_pos, op, length)) | |
469 sys.exit(1) | |
470 last = (op, length, md) | |
471 # sys.stderr.write('%s, %s, %s\n' %(op, length, md)) | |
472 chunk_size, md = _extract_md_matches(md, length - md_pos) | |
473 # sys.stderr.write(' -> %s, %s\n' %(chunk_size, md)) | |
474 md_pos += chunk_size | |
475 | |
476 # look for mismatches | |
477 while md_pos < length and md and md[0] not in '0123456789^': | |
478 yield (op, ref_pos + md_pos, seq[read_pos + md_pos]) | |
479 md = md[1:] | |
480 | |
481 md_pos += 1 | |
482 | |
483 ref_pos += length | |
484 read_pos += length | |
485 | |
486 elif op == 1: # I | |
487 # nothing in MD about inserts... | |
488 yield (op, ref_pos, seq[read_pos:read_pos + length]) | |
489 read_pos += length | |
490 | |
491 elif op == 2: # D | |
492 # prefixed with '^' and includes all of the removed bases | |
493 if md[0] == '^': | |
494 md = md[1:] | |
495 yield (op, ref_pos, md[:length]) | |
496 md = md[length:] | |
497 ref_pos += length | |
498 | |
499 elif op == 3: # N | |
500 ref_pos += length | |
501 | |
502 | |
503 def read_calc_mismatches_gen(ref, read, chrom): | |
504 start = read.pos | |
505 ref_pos = 0 | |
506 read_pos = 0 | |
507 | |
508 for op, length in read.cigar: | |
509 if op == 1: | |
510 yield ref_pos, op, None | |
511 read_pos += length | |
512 elif op == 2: | |
513 yield ref_pos, op, None | |
514 ref_pos += length | |
515 elif op == 3: | |
516 ref_pos += length | |
517 elif op == 0: | |
518 refseq = ref.fetch(chrom, start + ref_pos, start + ref_pos + length) | |
519 if not refseq: | |
520 raise ValueError("Reference '%s' not found in FASTA file: %s" % (chrom, ref.filename)) | |
521 cur_pos = start + ref_pos | |
522 for refbase, readbase in zip(refseq.upper(), read.seq[read_pos:read_pos + length].upper()): | |
523 if refbase != readbase: | |
524 yield op, cur_pos, readbase | |
525 cur_pos += 1 | |
526 ref_pos += length | |
527 read_pos += length | |
528 else: | |
529 raise ValueError("Unsupported CIGAR operation: %s" % op) | |
530 | |
531 | |
532 def read_calc_mismatches_ref(ref, read, chrom): | |
533 edits = 0 | |
534 | |
535 for op, pos, base in read_calc_mismatches_gen(ref, read, chrom): | |
536 edits += 1 | |
537 | |
538 return edits | |
539 | |
540 | |
541 __region_cache = {} | |
542 | |
543 | |
544 def region_pos_to_genomic_pos(name, start, cigar): | |
545 ''' | |
546 converts a junction position to a genomic location given a junction | |
547 ref name, the junction position, and the cigar alignment. | |
548 | |
549 returns: (genomic ref, genomic pos, genomic cigar) | |
550 | |
551 >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050,3000-4000', 25, [(0, 100)]) | |
552 ('chr1', 1025, [(0, 25), (3, 950), (0, 50), (3, 950), (0, 25)]) | |
553 | |
554 >>> region_pos_to_genomic_pos('chr1:1000-1050,1050-1200', 25, [(0, 100)]) | |
555 ('chr1', 1025, [(0, 25), (3, 0), (0, 75)]) | |
556 | |
557 >>> region_pos_to_genomic_pos('chr3R:17630851-17630897,17634338-17634384', 17, [(0, 39)]) | |
558 ('chr3R', 17630868, [(0, 29), (3, 3441), (0, 10)]) | |
559 | |
560 >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(4, 25), (0, 50), (4, 25)]) | |
561 ('chr1', 1025, [(4, 25), (0, 25), (3, 950), (0, 25), (4, 25)]) | |
562 | |
563 >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)]) | |
564 ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)]) | |
565 | |
566 >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)]) | |
567 ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)]) | |
568 | |
569 >>> region_pos_to_genomic_pos('chr7:16829153-16829246,16829246-16829339', 62, cigar_fromstr('83M18S')) | |
570 ('chr7', 16829215, [(0, 31), (3, 0), (0, 52), (4, 18)]) | |
571 | |
572 | |
573 ''' | |
574 | |
575 if name in __region_cache: | |
576 chrom, fragments = __region_cache[name] | |
577 else: | |
578 c1 = name.split(':') | |
579 chrom = c1[0] | |
580 | |
581 fragments = [] | |
582 for fragment in c1[1].split(','): | |
583 s, e = fragment.split('-') | |
584 fragments.append((int(s), int(e))) | |
585 | |
586 __region_cache[name] = (chrom, fragments) | |
587 | |
588 chr_cigar = [] | |
589 chr_start = fragments[0][0] | |
590 | |
591 read_start = int(start) | |
592 | |
593 frag_idx = 0 | |
594 frag_start = 0 | |
595 frag_end = 0 | |
596 | |
597 for i, (s, e) in enumerate(fragments): | |
598 if chr_start + read_start < e: | |
599 chr_start += read_start | |
600 frag_idx = i | |
601 frag_start = s | |
602 frag_end = e | |
603 break | |
604 | |
605 else: | |
606 chr_start += (e - s) | |
607 read_start -= (e - s) | |
608 | |
609 cur_pos = chr_start | |
610 | |
611 for op, length in cigar: | |
612 if op in [1, 4, 5]: | |
613 chr_cigar.append((op, length)) | |
614 | |
615 elif op in [0, 2]: | |
616 if cur_pos + length <= frag_end: | |
617 cur_pos += length | |
618 chr_cigar.append((op, length)) | |
619 | |
620 else: | |
621 while cur_pos + length > frag_end: | |
622 if frag_end - cur_pos > 0: | |
623 chr_cigar.append((op, frag_end - cur_pos)) | |
624 length -= (frag_end - cur_pos) | |
625 cur_pos = frag_end | |
626 frag_idx += 1 | |
627 if len(fragments) <= frag_idx: | |
628 print 'ERROR converting: ', name, fragments | |
629 return (chrom, 0, chr_cigar) | |
630 frag_start, frag_end = fragments[frag_idx] | |
631 chr_cigar.append((3, frag_start - cur_pos)) | |
632 cur_pos = frag_start | |
633 | |
634 cur_pos = cur_pos + length | |
635 chr_cigar.append((op, length)) | |
636 else: | |
637 print "Unsupported CIGAR operation (%s)" % bam_cigar[op] | |
638 sys.exit(1) | |
639 | |
640 return (chrom, chr_start, chr_cigar) | |
641 | |
642 | |
643 def is_junction_valid(cigar, min_overlap=4): | |
644 ''' | |
645 Does the genomic cigar alignment represent a 'good' alignment. | |
646 Used for checking junction->genome alignments | |
647 | |
648 1) the alignment must not start at a splice junction | |
649 2) the alignment must not start or end with an overhang | |
650 3) the alignment must overhang the splice junction by min_overlap (4) | |
651 | |
652 | Exon1 | Intron | Exon2 | | |
653 |-----------------|oooooooooooooooo|------------------| | |
654 XXXXXXXXXXXXXXXXXXXXXXXX (bad 1) | |
655 XXXXXXXXXXXXX (bad 2) XXXXXXXXXXXXXXXX (bad 2) | |
656 XX-----------------XXXXXXXXXXXXXXXXX (bad 3) | |
657 | |
658 >>> is_junction_valid(cigar_fromstr('1000N40M')) | |
659 (False, 'Starts at gap (1000N40M)') | |
660 | |
661 >>> is_junction_valid(cigar_fromstr('100M')) | |
662 (False, "Doesn't cover junction") | |
663 | |
664 >>> is_junction_valid(cigar_fromstr('100M1000N3M'), 4) | |
665 (False, "Too short overlap at 3' (100M1000N3M)") | |
666 | |
667 >>> is_junction_valid(cigar_fromstr('2M1000N100M'), 4) | |
668 (False, "Too short overlap at 5' (2M1000N100M)") | |
669 | |
670 >>> is_junction_valid(cigar_fromstr('4M1000N100M'), 4) | |
671 (True, '') | |
672 | |
673 >>> is_junction_valid(cigar_fromstr('100M1000N4M'), 4) | |
674 (True, '') | |
675 | |
676 >>> is_junction_valid(cigar_fromstr('4M0N100M'), 4) | |
677 (True, '') | |
678 | |
679 ''' | |
680 first = True | |
681 pre_gap = True | |
682 | |
683 pre_gap_count = 0 | |
684 post_gap_count = 0 | |
685 | |
686 has_gap = False | |
687 | |
688 for op, length in cigar: | |
689 # mapping can't start at a gap | |
690 if first and op == 3: | |
691 return (False, 'Starts at gap (%s)' % cigar_tostr(cigar)) | |
692 first = False | |
693 | |
694 if op == 3: | |
695 pre_gap = False | |
696 post_gap_count = 0 | |
697 has_gap = True | |
698 | |
699 elif pre_gap: | |
700 pre_gap_count += length | |
701 else: | |
702 post_gap_count += length | |
703 | |
704 # mapping must start with more than min_overlap base match | |
705 | |
706 if not has_gap: | |
707 return (False, "Doesn't cover junction") | |
708 elif pre_gap_count < min_overlap: | |
709 return (False, "Too short overlap at 5' (%s)" % cigar_tostr(cigar)) | |
710 elif post_gap_count < min_overlap: | |
711 return (False, "Too short overlap at 3' (%s)" % cigar_tostr(cigar)) | |
712 | |
713 return True, '' | |
714 | |
715 | |
716 def read_alignment_fragments_gen(read): | |
717 ''' | |
718 Takes a read and returns the start and end positions for each match/mismatch region. | |
719 This will let us know where each read alignment "touches" the genome. | |
720 ''' | |
721 | |
722 for start, end in _read_alignment_fragments_gen(read.pos, read.cigar): | |
723 yield (start, end) | |
724 | |
725 | |
726 def _read_alignment_fragments_gen(pos, cigar): | |
727 ''' Test-able version of read_alignment_fragments_gen | |
728 >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('50M'))) | |
729 [(1, 51)] | |
730 | |
731 >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('25M100N25M'))) | |
732 [(1, 26), (126, 151)] | |
733 | |
734 >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('20M1D4M100N10M5I10M'))) | |
735 [(1, 21), (22, 26), (126, 136), (136, 146)] | |
736 ''' | |
737 ref_pos = pos | |
738 | |
739 for op, length in cigar: | |
740 if op == 0: | |
741 yield (ref_pos, ref_pos + length) | |
742 ref_pos += length | |
743 elif op == 1: | |
744 pass | |
745 elif op == 2: | |
746 ref_pos += length | |
747 elif op == 3: | |
748 ref_pos += length | |
749 else: | |
750 raise ValueError("Unsupported CIGAR operation: %s" % op) | |
751 | |
752 | |
753 def read_cigar_at_pos(cigar, qpos, is_del): | |
754 ''' | |
755 Returns the CIGAR operation for a given read position | |
756 | |
757 qpos is the 0-based index of the base in the read | |
758 ''' | |
759 pos = 0 | |
760 returnnext = False | |
761 for op, length in cigar: | |
762 if returnnext: | |
763 return op | |
764 if op == 0: | |
765 pos += length | |
766 elif op == 1: | |
767 pos += length | |
768 elif op == 2: | |
769 pass | |
770 elif op == 3: | |
771 pass | |
772 elif op == 4: | |
773 pos += length | |
774 elif op == 5: | |
775 pass | |
776 else: | |
777 raise ValueError("Unsupported CIGAR operation: %s" % op) | |
778 | |
779 if pos > qpos: | |
780 return op | |
781 if is_del and pos == qpos: | |
782 returnnext = True | |
783 | |
784 return None | |
785 | |
786 | |
787 def cleancigar(cigar): | |
788 ''' | |
789 Cleans a CIGAR alignment to remove zero length operations | |
790 | |
791 >>> cigar_tostr(cleancigar(cigar_fromstr('31M0N52M18S'))) | |
792 '83M18S' | |
793 | |
794 ''' | |
795 newcigar = [] | |
796 | |
797 changed = False | |
798 zero = False | |
799 | |
800 for op, size in cigar: | |
801 if size > 0: | |
802 if zero and newcigar: | |
803 if newcigar[-1][0] == op: | |
804 newsize = newcigar[-1][1] + size | |
805 newcigar[-1] = (op, newsize) | |
806 zero = 0 | |
807 else: | |
808 newcigar.append((op, size)) | |
809 else: | |
810 changed = True | |
811 zero = True | |
812 | |
813 if changed: | |
814 return newcigar | |
815 | |
816 return None | |
817 | |
818 | |
819 def read_cleancigar(read): | |
820 ''' | |
821 Replaces the CIGAR string for a read to remove any operations that are zero length. | |
822 | |
823 This happens to the read in-place | |
824 ''' | |
825 if read.is_unmapped: | |
826 return False | |
827 | |
828 newcigar = [] | |
829 | |
830 newcigar = cleancigar(read.cigar) | |
831 | |
832 if newcigar: | |
833 read.cigar = newcigar | |
834 return True | |
835 | |
836 return False | |
837 | |
838 | |
839 def read_to_unmapped(read, ref=None): | |
840 ''' | |
841 Converts a read from mapped to unmapped. | |
842 | |
843 Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is passed) | |
844 ''' | |
845 | |
846 newread = pysam.AlignedRead() | |
847 | |
848 if ref: | |
849 tags = [('ZR', '%s:%s:%s' % (ref, read.pos, cigar_tostr(read.cigar)))] | |
850 | |
851 newread.is_unmapped = True | |
852 newread.mapq = 0 | |
853 newread.tlen = 0 | |
854 newread.pos = -1 | |
855 newread.pnext = -1 | |
856 newread.rnext = -1 | |
857 newread.tid = -1 | |
858 | |
859 newread.qname = read.qname | |
860 | |
861 if read.is_paired: | |
862 newread.is_paired = True | |
863 | |
864 if not read.is_unmapped and read.is_reverse: | |
865 newread.seq = ngsutils.support.revcomp(read.seq) | |
866 newread.qual = read.qual[::-1] | |
867 else: | |
868 newread.seq = read.seq | |
869 newread.qual = read.qual | |
870 | |
871 newread.tags = tags | |
872 | |
873 return newread | |
874 | |
875 | |
876 | |
877 if __name__ == '__main__': | |
878 import doctest | |
879 doctest.testmod() |