Mercurial > repos > iuc > ngsutils_bam_filter
view ngsutils/bam/__init__.py @ 1:8187a729d9f4 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit bbabad202be09c91e66c94bcbfd057e49f57c841
author | iuc |
---|---|
date | Sun, 06 Dec 2015 05:03:12 -0500 |
parents | 4e4e4093d65d |
children | 7a68005de299 |
line wrap: on
line source
import sys import os import re import pysam try: from eta import ETA except: pass import ngsutils.support def bam_open(fname, mode='r', *args, **kwargs): if fname.lower()[-4:] == '.bam': return pysam.Samfile(fname, '%sb' % mode, *args, **kwargs) return pysam.Samfile(fname, '%s' % mode, *args, **kwargs) def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None): if not quiet and bam.filename: eta = ETA(os.stat(bam.filename).st_size) else: eta = None for pileup in bam.pileup(mask=mask): pos = bam.tell() bgz_offset = pos >> 16 if not quiet: if callback: eta.print_status(bgz_offset, extra=callback(pileup)) else: eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos)) yield pileup if eta: eta.done() def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None): ''' >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)] ['A', 'B', 'E', 'C', 'D', 'F', 'Z'] ''' if os.path.exists('%s.bai' % bam.filename): # This is an indexed file, so it is ref sorted... # Meaning that we should show chrom:pos, instead of read names show_ref_pos = True eta = None if not ref: if not quiet and bam.filename: eta = ETA(os.stat(bam.filename).st_size) for read in bam: pos = bam.tell() bgz_offset = pos >> 16 if not quiet and eta: if callback: eta.print_status(bgz_offset, extra=callback(read)) elif (show_ref_pos): if read.tid > -1: eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname)) else: eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname)) else: eta.print_status(bgz_offset, extra='%s' % read.qname) yield read else: working_chrom = None if ref in bam.references: working_chrom = ref elif ref[0:3] == 'chr': # compensate for Ensembl vs UCSC ref naming if ref[3:] in bam.references: working_chrom = ref[3:] if not working_chrom: raise ValueError('Missing reference: %s' % ref) tid = bam.gettid(working_chrom) if not start: start = 0 if not end: end = bam.lengths[tid] if not quiet and bam.filename: eta = ETA(end - start) for read in bam.fetch(working_chrom, start, end): if not quiet and eta: if callback: eta.print_status(read.pos - start, extra=callback(read)) else: eta.print_status(read.pos - start, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname)) yield read if eta: eta.done() def bam_batch_reads(bam, quiet=False): ''' Batch mapping for the same reads (qname) together, this way they can all be compared/converted together. ''' reads = [] last = None for read in bam_iter(bam, quiet=quiet): if last and read.qname != last: yield reads reads = [] last = read.qname reads.append(read) if reads: yield reads bam_cigar = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X'] bam_cigar_op = { 'M': 0, 'I': 1, 'D': 2, 'N': 3, 'S': 4, 'H': 5, 'P': 6, '=': 7, 'X': 8, } def cigar_fromstr(s): ''' >>> cigar_fromstr('10M5I20M') [(0, 10), (1, 5), (0, 20)] >>> cigar_fromstr('1M1I1D1N1S1H1P') [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)] ''' ret = [] spl = re.split('([0-9]+)', s)[1:] for op, size in zip(spl[1::2], spl[::2]): ret.append((bam_cigar_op[op], int(size))) return ret def cigar_tostr(cigar): ''' >>> cigar_tostr(((0, 10), (1, 5), (0, 20))) '10M5I20M' >>> cigar_tostr(((0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1))) '1M1I1D1N1S1H1P' ''' s = '' for op, size in cigar: s += '%s%s' % (size, bam_cigar[op]) return s def cigar_read_len(cigar): ''' >>> cigar_read_len(cigar_fromstr('8M')) 8 >>> cigar_read_len(cigar_fromstr('8M100N8M')) 16 >>> cigar_read_len(cigar_fromstr('8M10I8M')) 26 >>> cigar_read_len(cigar_fromstr('8M10D8M')) 16 ''' read_pos = 0 for op, length in cigar: if op == 0: # M read_pos += length elif op == 1: # I read_pos += length elif op == 2: # D pass elif op == 3: # N pass elif op == 4: # S read_pos += length elif op == 5: # H pass elif op == 7: # = read_pos += length elif op == 8: # X read_pos += length else: raise ValueError("Unsupported CIGAR operation: %s" % op) return read_pos def read_calc_mismatches(read): inserts = 0 deletions = 0 indels = 0 edits = int(read.opt('NM')) # # NM counts the length of indels # We really just care about *if* there is an indel, not the size # for op, length in read.cigar: if op == 1: inserts += length indels += 1 elif op == 2: deletions += length indels += 1 return edits - inserts - deletions + indels def _extract_md_matches(md, maxlength): md_pos = 0 while md and md_pos < maxlength: tmp = '0' # preload a zero so that immediate mismatches will be caught # the zero will have no affect otherwise... # look for matches while md and md[0] in '0123456789': tmp += md[0] md = md[1:] pos = int(tmp) if pos > maxlength: return (maxlength, '%s%s' % (pos - maxlength, md)) return (pos, md) def read_calc_variations(read): 'see _read_calc_variations' for tup in _read_calc_variations(read.pos, read.cigar, read.opt('MD'), read.seq): yield tup def _read_calc_variations(start_pos, cigar, md, seq): ''' For each variation, outputs a tuple: (op, pos, seq) op - operation (0 = mismatch, 1 = insert, 2 = deletion) (like CIGAR) pos - 0-based position of the variation (relative to reference) seq - the base (or bases) involved in the variation for mismatch or insert, this is the sequence inserted for deletions, this is the reference sequence that was removed MD is the mismatch string. Not all aligners include the tag. If your aligner doesn't include this, then you'll need to add it, or use a different function (see: read_calc_mismatches_gen). Special care must be used to handle RNAseq reads that cross an exon-exon junction. Also: MD is a *really* dumb format that can't be read correctly with a regex. It must be processed in concert with the CIGAR alignment in order to catch all edge cases. Some implementations insert 0's at the end of inserts / deltions / variations to make parsing easier but not everyone follows this. Look at the complex examples: the CIGAR alignment may show an insert, but the MD just shows all matches. Examples: See: http://davetang.org/muse/2011/01/28/perl-and-sam/ Also from CCBB actual mappings and manual altered (shortened, made more complex) (doctests included) Match/mismatch CIGAR: 36M MD:Z: 1A0C0C0C1T0C0T27 MD:Z: 1ACCC1TCT27 (alternative) 1 2 123456789012345678901234567890123456 ref: CGATACGGGGACATCCGGCCTGCTCCTTCTCACATG XXXX XXX read: CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM -ACCC-TCT--------------------------- >>> list(_read_calc_variations(1, [(0,36)], '1A0C0C0C1T0C0T27', 'CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG')) [(0, 2, 'A'), (0, 3, 'C'), (0, 4, 'C'), (0, 5, 'C'), (0, 7, 'T'), (0, 8, 'C'), (0, 9, 'T')] Insert CIGAR: 6M1I29M MD:Z: 0C1C0C1C0T0C27 C1CC1CTC27 (alt) 1 2 123456^789012345678901234567890123456 ref: CACCCC^TCTGACATCCGGCCTGCTCCTTCTCACAT X XX X|XX read: GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT MMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMM G-GA-GGGG--------------------------- >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], '0C1C0C1C0T0C27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT')) [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')] >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], 'C1CC1CTC27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT')) [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')] Deletion CIGAR: 9M9D27M MD:Z: 2G0A5^ATGATGTCA27 2GA5^ATGATGTCA27 (alt) ref: AGGAATGGGATGATGTCAGGGGTTCCAGGTGGAGACGAGGACTCC XX ^^^^^^^^^ read: AGTGATGGG^^^^^^^^^GGGGTTCCAGGTGGAGACGAGGACTCC MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM --TG-----ATGATGTCA--------------------------- >>> list(_read_calc_variations(1, [(0,9), (2,9), (0, 27)], '2G0A5^ATGATGTCA27', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC')) [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA')] Complex CIGAR: 9M9D11M1I15M MD:Z: 2G0A5^ATGATGTCAA26 MD:Z: 2G0A5^ATGATGTCA0G26 (alt) 1 2 3 4 pos: 123456789012345678901234567890123456789012345 ref: AGGAATGGGATGATGTCAGGGGTTCCAGG^GGAGACGAGGACTCC XX ^^^^^^^^^X | read: AGTGATGGG^^^^^^^^^AGGGTTCCAGGTGGAGACGAGGACTCC MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM --TG-----ATGATGTCAG----------T--------------- >>> list(_read_calc_variations(1, [(0,9), (2,9), (0,11), (1,1), (0,15)], '2G0A5^ATGATGTCAA26', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC')) [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA'), (0, 19, 'G'), (1, 30, 'T')] Complex example - inserts aren't separately handled by MD, only visible in CIGAR CIGAR: 14M2D16M3I42M MD:Z: 14^TC58 1 2 3 4 5 6 7 pos: 12345678901234567890123456789012^^^345678901234567890123456789012345678901234567 ref: caagtatcaccatgtcaggcatttttttcatt^^^tttgtagagagagaagacttgctatgttgcccaagctggcct ^^ ||| read: CAAGTATCACCATG^^AGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT MMMMMMMMMMMMMMDDMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM --------------tc----------------TGG------------------------------------------ >>> list(_read_calc_variations(1, [(0,14), (2,2), (0,16), (1,3), (0,42)], '14^TC58', 'CAAGTATCACCATGAGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT')) [(2, 15, 'TC'), (1, 33, 'TGG')] Complex example 2: CIGAR: 41M3I10M1I5M1I2M2I10M MD:Z: 44C2C6T6T6 1 2 3 4 5 6 pos: 12345678901234567890123456789012345678901^^^2345678901^23456^78^^9012345678 ref: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG ||| X X | X | || X read: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM -----------------------------------------tta---A--T---c---A----gt---G------ 13M 28M 3I 10M 1I 5M 1I 2M 2I 10M >>> list(_read_calc_variations(1, [(0, 41), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG')) [(1, 42, 'TTA'), (0, 45, 'A'), (0, 48, 'T'), (1, 52, 'C'), (0, 55, 'A'), (1, 57, 'G'), (1, 59, 'GT'), (0, 62, 'G')] Splice junction example: CIGAR: 62M100N13M MD:Z: 2T27C44 1 1 1 2 3 4 5 6 6 7 pos: 12345678901234567890123456789012345678901234567890123456789012| [100] |3456789012345 ref: CCTCATGACCAGCTTGTTGAAGAGATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC X X read: CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC --G---------------------------T----------------------------------------------------- >>> list(_read_calc_variations(1, [(0,62), (4,100), (0,13)], '2T27C44', 'CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC')) [(0, 3, 'G'), (0, 31, 'T')] Splice junction example 2: CIGAR: 13M100N28M3I10M1I5M1I2M2I10M MD:Z: 44C2C6T6T6 1 1 1 1 1 1 2 3 4 5 6 pos: 1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678 ref: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG ||| X X | X | || X read: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG MMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM ------------- ----------------------------tta---A--T---c---A----gt---G------ 13M 100N 28M 3I 10M 1I 5M 1I 2M 2I 10M >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 28), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG')) [(1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')] Splice junction example 2A: CIGAR: 13M100N7M2D19M3I10M1I5M1I2M2I10M MD:Z: 9A10^GG22C2C6T6T6 1 1 1 1 1 1 2 3 4 5 6 pos: 1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678 ref: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG ^^ ||| X X | X | || X read: AGGGTGGCGCGAT|-------|CGATGAC^^CATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG MMMMMMMMMMMMM MMMMMMMDDMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM ---------C--- ----------------------------tta---A--T---c---A----gt---G------ .........A... .......GG................... ...C..C... ...T. .. ...T...... 9 A 10 ^GG 22 C 2C 6 T 6 T 6 >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 7), (2, 2), (0, 19), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '9A10^GG22C2C6T6T6', 'AGGGTGGCGCGATCGATGACCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG')) [(0, 10, 'C'), (2, 121, 'GG'), (1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')] Real Example 242_1071_1799_B1 CIGAR: 42M10I3M1D9M1D11M MD:Z: 27G16A0^T6C2^T1C9 1 2 3 4 5 6 7 pos: 123456789012345678901234567890123456789012 345678901234567890123456789012345 ref: ACTGAGAAACCCAACCCTCTGAGACCAGCACACCCCTTTCAA^^^^^^^^^^GCATGTTCCTCCCTCCCCTTCTTTG X X^ X ^ X read: ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCC^GTTCCTGCC^CGCCTTCTTTG MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIMMMDMMMMMMMMMDMMMMMMMMMMM ---------------------------A--------------^^^^^^^^^^--CT------G--T-G--------- >>> list(_read_calc_variations(1, [(0,42), (1,10), (0, 3), (2, 1), (0, 9), (2, 1), (0, 11)], '27G16A0^T6C2^T1C9', 'ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCCGTTCCTGCCCGCCTTCTTTG', )) [(0, 28, 'A'), (1, 43, 'CACATTTTTG'), (0, 45, 'C'), (2, 46, 'T'), (0, 53, 'G'), (2, 56, 'T'), (0, 58, 'G')] Real example 2 577_1692_891_A1 CIGAR: 34M100N39M2I MD:Z: 3T69 1 1 1 1 1 2 3 4 5 6 7 pos: 1234567890123456789012345678901234| [100] |567890123456789012345678901234567890123 ref: GGATTCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTC^^ read: GGAATCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM NNNNN MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMII ---A------------------------------ ---------------------------------------TC >>> list(_read_calc_variations(1, [(0,34), (3,100), (0, 39), (1, 2)], '3T69', 'GGAATCTTCCCACTGGGTCGATGTTGTTTGTGATCTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC', )) [(0, 4, 'A'), (1, 174, 'TC')] ''' ref_pos = start_pos read_pos = 0 for op, length in cigar: if md and md[0] == '0': md = md[1:] # sys.stderr.write('%s, %s, %s\n' %(op, length, md)) if op == 0: # M # how far in the chunk are we? (do *not* update ref_pos until end) md_pos = 0 last = None while md and md_pos < length: if last == (op, length, md): sys.stderr.write('\nInfinite loop in variant finding!\nPos: %s\nCIGAR: (%s, %s)\n' % (ref_pos, op, length)) sys.exit(1) last = (op, length, md) # sys.stderr.write('%s, %s, %s\n' %(op, length, md)) chunk_size, md = _extract_md_matches(md, length - md_pos) # sys.stderr.write(' -> %s, %s\n' %(chunk_size, md)) md_pos += chunk_size # look for mismatches while md_pos < length and md and md[0] not in '0123456789^': yield (op, ref_pos + md_pos, seq[read_pos + md_pos]) md = md[1:] md_pos += 1 ref_pos += length read_pos += length elif op == 1: # I # nothing in MD about inserts... yield (op, ref_pos, seq[read_pos:read_pos + length]) read_pos += length elif op == 2: # D # prefixed with '^' and includes all of the removed bases if md[0] == '^': md = md[1:] yield (op, ref_pos, md[:length]) md = md[length:] ref_pos += length elif op == 3: # N ref_pos += length def read_calc_mismatches_gen(ref, read, chrom): start = read.pos ref_pos = 0 read_pos = 0 for op, length in read.cigar: if op == 1: yield ref_pos, op, None read_pos += length elif op == 2: yield ref_pos, op, None ref_pos += length elif op == 3: ref_pos += length elif op == 0: refseq = ref.fetch(chrom, start + ref_pos, start + ref_pos + length) if not refseq: raise ValueError("Reference '%s' not found in FASTA file: %s" % (chrom, ref.filename)) cur_pos = start + ref_pos for refbase, readbase in zip(refseq.upper(), read.seq[read_pos:read_pos + length].upper()): if refbase != readbase: yield op, cur_pos, readbase cur_pos += 1 ref_pos += length read_pos += length else: raise ValueError("Unsupported CIGAR operation: %s" % op) def read_calc_mismatches_ref(ref, read, chrom): edits = 0 for op, pos, base in read_calc_mismatches_gen(ref, read, chrom): edits += 1 return edits __region_cache = {} def region_pos_to_genomic_pos(name, start, cigar): ''' converts a junction position to a genomic location given a junction ref name, the junction position, and the cigar alignment. returns: (genomic ref, genomic pos, genomic cigar) >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050,3000-4000', 25, [(0, 100)]) ('chr1', 1025, [(0, 25), (3, 950), (0, 50), (3, 950), (0, 25)]) >>> region_pos_to_genomic_pos('chr1:1000-1050,1050-1200', 25, [(0, 100)]) ('chr1', 1025, [(0, 25), (3, 0), (0, 75)]) >>> region_pos_to_genomic_pos('chr3R:17630851-17630897,17634338-17634384', 17, [(0, 39)]) ('chr3R', 17630868, [(0, 29), (3, 3441), (0, 10)]) >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(4, 25), (0, 50), (4, 25)]) ('chr1', 1025, [(4, 25), (0, 25), (3, 950), (0, 25), (4, 25)]) >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)]) ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)]) >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)]) ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)]) >>> region_pos_to_genomic_pos('chr7:16829153-16829246,16829246-16829339', 62, cigar_fromstr('83M18S')) ('chr7', 16829215, [(0, 31), (3, 0), (0, 52), (4, 18)]) ''' if name in __region_cache: chrom, fragments = __region_cache[name] else: c1 = name.split(':') chrom = c1[0] fragments = [] for fragment in c1[1].split(','): s, e = fragment.split('-') fragments.append((int(s), int(e))) __region_cache[name] = (chrom, fragments) chr_cigar = [] chr_start = fragments[0][0] read_start = int(start) frag_idx = 0 frag_start = 0 frag_end = 0 for i, (s, e) in enumerate(fragments): if chr_start + read_start < e: chr_start += read_start frag_idx = i frag_start = s frag_end = e break else: chr_start += (e - s) read_start -= (e - s) cur_pos = chr_start for op, length in cigar: if op in [1, 4, 5]: chr_cigar.append((op, length)) elif op in [0, 2]: if cur_pos + length <= frag_end: cur_pos += length chr_cigar.append((op, length)) else: while cur_pos + length > frag_end: if frag_end - cur_pos > 0: chr_cigar.append((op, frag_end - cur_pos)) length -= (frag_end - cur_pos) cur_pos = frag_end frag_idx += 1 if len(fragments) <= frag_idx: print 'ERROR converting: ', name, fragments return (chrom, 0, chr_cigar) frag_start, frag_end = fragments[frag_idx] chr_cigar.append((3, frag_start - cur_pos)) cur_pos = frag_start cur_pos = cur_pos + length chr_cigar.append((op, length)) else: print "Unsupported CIGAR operation (%s)" % bam_cigar[op] sys.exit(1) return (chrom, chr_start, chr_cigar) def is_junction_valid(cigar, min_overlap=4): ''' Does the genomic cigar alignment represent a 'good' alignment. Used for checking junction->genome alignments 1) the alignment must not start at a splice junction 2) the alignment must not start or end with an overhang 3) the alignment must overhang the splice junction by min_overlap (4) | Exon1 | Intron | Exon2 | |-----------------|oooooooooooooooo|------------------| XXXXXXXXXXXXXXXXXXXXXXXX (bad 1) XXXXXXXXXXXXX (bad 2) XXXXXXXXXXXXXXXX (bad 2) XX-----------------XXXXXXXXXXXXXXXXX (bad 3) >>> is_junction_valid(cigar_fromstr('1000N40M')) (False, 'Starts at gap (1000N40M)') >>> is_junction_valid(cigar_fromstr('100M')) (False, "Doesn't cover junction") >>> is_junction_valid(cigar_fromstr('100M1000N3M'), 4) (False, "Too short overlap at 3' (100M1000N3M)") >>> is_junction_valid(cigar_fromstr('2M1000N100M'), 4) (False, "Too short overlap at 5' (2M1000N100M)") >>> is_junction_valid(cigar_fromstr('4M1000N100M'), 4) (True, '') >>> is_junction_valid(cigar_fromstr('100M1000N4M'), 4) (True, '') >>> is_junction_valid(cigar_fromstr('4M0N100M'), 4) (True, '') ''' first = True pre_gap = True pre_gap_count = 0 post_gap_count = 0 has_gap = False for op, length in cigar: # mapping can't start at a gap if first and op == 3: return (False, 'Starts at gap (%s)' % cigar_tostr(cigar)) first = False if op == 3: pre_gap = False post_gap_count = 0 has_gap = True elif pre_gap: pre_gap_count += length else: post_gap_count += length # mapping must start with more than min_overlap base match if not has_gap: return (False, "Doesn't cover junction") elif pre_gap_count < min_overlap: return (False, "Too short overlap at 5' (%s)" % cigar_tostr(cigar)) elif post_gap_count < min_overlap: return (False, "Too short overlap at 3' (%s)" % cigar_tostr(cigar)) return True, '' def read_alignment_fragments_gen(read): ''' Takes a read and returns the start and end positions for each match/mismatch region. This will let us know where each read alignment "touches" the genome. ''' for start, end in _read_alignment_fragments_gen(read.pos, read.cigar): yield (start, end) def _read_alignment_fragments_gen(pos, cigar): ''' Test-able version of read_alignment_fragments_gen >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('50M'))) [(1, 51)] >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('25M100N25M'))) [(1, 26), (126, 151)] >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('20M1D4M100N10M5I10M'))) [(1, 21), (22, 26), (126, 136), (136, 146)] ''' ref_pos = pos for op, length in cigar: if op == 0: yield (ref_pos, ref_pos + length) ref_pos += length elif op == 1: pass elif op == 2: ref_pos += length elif op == 3: ref_pos += length else: raise ValueError("Unsupported CIGAR operation: %s" % op) def read_cigar_at_pos(cigar, qpos, is_del): ''' Returns the CIGAR operation for a given read position qpos is the 0-based index of the base in the read ''' pos = 0 returnnext = False for op, length in cigar: if returnnext: return op if op == 0: pos += length elif op == 1: pos += length elif op == 2: pass elif op == 3: pass elif op == 4: pos += length elif op == 5: pass else: raise ValueError("Unsupported CIGAR operation: %s" % op) if pos > qpos: return op if is_del and pos == qpos: returnnext = True return None def cleancigar(cigar): ''' Cleans a CIGAR alignment to remove zero length operations >>> cigar_tostr(cleancigar(cigar_fromstr('31M0N52M18S'))) '83M18S' ''' newcigar = [] changed = False zero = False for op, size in cigar: if size > 0: if zero and newcigar: if newcigar[-1][0] == op: newsize = newcigar[-1][1] + size newcigar[-1] = (op, newsize) zero = 0 else: newcigar.append((op, size)) else: changed = True zero = True if changed: return newcigar return None def read_cleancigar(read): ''' Replaces the CIGAR string for a read to remove any operations that are zero length. This happens to the read in-place ''' if read.is_unmapped: return False newcigar = [] newcigar = cleancigar(read.cigar) if newcigar: read.cigar = newcigar return True return False def read_to_unmapped(read, ref=None): ''' Converts a read from mapped to unmapped. Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is passed) ''' newread = pysam.AlignedRead() if ref: tags = [('ZR', '%s:%s:%s' % (ref, read.pos, cigar_tostr(read.cigar)))] newread.is_unmapped = True newread.mapq = 0 newread.tlen = 0 newread.pos = -1 newread.pnext = -1 newread.rnext = -1 newread.tid = -1 newread.qname = read.qname if read.is_paired: newread.is_paired = True if not read.is_unmapped and read.is_reverse: newread.seq = ngsutils.support.revcomp(read.seq) newread.qual = read.qual[::-1] else: newread.seq = read.seq newread.qual = read.qual newread.tags = tags return newread if __name__ == '__main__': import doctest doctest.testmod()