# HG changeset patch # User iuc # Date 1508779622 14400 # Node ID 9b9ae5963d3c4a439188bea1a07c86104a355688 # Parent 7a68005de299659a1d125e1c5237ed98bf51da2c planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ngsutils commit 4a3c9f195ba5d899b1a1ce5e80281cdf230f456a diff -r 7a68005de299 -r 9b9ae5963d3c bam_filter.xml --- a/bam_filter.xml Sun Nov 27 15:01:21 2016 -0500 +++ b/bam_filter.xml Mon Oct 23 13:27:02 2017 -0400 @@ -1,4 +1,4 @@ - + Removes reads from a BAM file based on criteria macros.xml @@ -7,67 +7,61 @@ - +## If the tool is executed with no filtering option, +## the default parameters simply copy over the input file +if grep -q "\w" '${parameters}'; then + '$__tool_directory__/filter.py' + '$infile' + '$outfile' + `cat ${parameters}`; +else + cp '$infile' '$outfile'; +fi + ]]> - - -1: - -uniq - #if int($uniq) > 0: - $uniq - #end if - #end if - $uniq_start - #if $mismatch: - -mismatch $mismatch - #end if - $nosecondary - $noqcfail - $nopcrdup - #if $excludebed: - -excludebed ${excludebed} $ignore_strand - #end if - #if $includebed: - -includebed ${includebed} $ignore_strand - #end if - #if $includeref: - -includeref ${includeref} - #end if - #if $excluderef: - -excluderef ${excluderef} - #end if - #if $maximum_mismatch_ratio - -maximum_mismatch_ratio $maximum_mismatch_ratio - #end if - ]]> - + -1: + -uniq + #if int($uniq) > 0: + $uniq + #end if +#end if +$uniq_start +#if $mismatch: + -mismatch $mismatch +#end if +$nosecondary +$noqcfail +$nopcrdup +#if $excludebed: + -excludebed ${excludebed} $ignore_strand +#end if +#if $includebed: + -includebed ${includebed} $ignore_strand +#end if +#if $includeref: + -includeref ${includeref} +#end if +#if $excluderef: + -excluderef ${excluderef} +#end if +#if $maximum_mismatch_ratio + -maximum_mismatch_ratio $maximum_mismatch_ratio +#end if + ]]> @@ -129,7 +123,6 @@ - @@ -234,6 +227,5 @@ | -excluderef refname | Exclude reads mapped to a particular reference | | | (e.g. chrM, or _dup chromosomes) | +--------------------------------+-------------------------------------------------+ -]]> - + ]]> diff -r 7a68005de299 -r 9b9ae5963d3c filter.py --- a/filter.py Sun Nov 27 15:01:21 2016 -0500 +++ b/filter.py Mon Oct 23 13:27:02 2017 -0400 @@ -107,6 +107,7 @@ The tag type (:i, :f, :Z) is optional. """ +from __future__ import print_function import os import sys @@ -118,8 +119,8 @@ def usage(): - print __doc__ - print """ + print(__doc__) + print(""" Usage: bamutils filter in.bam out.bam {-failed out.txt} criteria... Options: @@ -131,7 +132,7 @@ This will remove all unmapped reads, as well as any reads that have an AS:i value less than 1000. -""" +""") sys.exit(1) @@ -760,10 +761,10 @@ # guess at type... try: self.value = int(value) - except: + except ValueError: try: self.value = float(value) - except: + except ValueError: self.value = value def get_value(self, read): @@ -960,7 +961,7 @@ outfile = arg elif arg[0] == '-': if not arg[1:] in _criteria: - print "Unknown criterion: %s" % arg + print("Unknown criterion: %s" % arg) fail = True if crit_args: criteria.append(_criteria[crit_args[0][1:]](*crit_args[1:])) @@ -968,7 +969,7 @@ elif crit_args: crit_args.append(arg) else: - print "Unknown argument: %s" % arg + print("Unknown argument: %s" % arg) fail = True if not fail and crit_args: @@ -979,11 +980,11 @@ usage() if not infile: - print "Missing: input bamfile" + print("Missing: input bamfile") if not outfile: - print "Missing: output bamfile" + print("Missing: output bamfile") if not criteria: - print "Missing: filtering criteria" + print("Missing: filtering criteria") usage() else: bam_filter(infile, outfile, criteria, failed, verbose) diff -r 7a68005de299 -r 9b9ae5963d3c macros.xml --- a/macros.xml Sun Nov 27 15:01:21 2016 -0500 +++ b/macros.xml Mon Oct 23 13:27:02 2017 -0400 @@ -1,8 +1,8 @@ - 0.5.8 + 0.5.9 - pysam + ngsutils diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/__init__.py diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/__init__.pyc Binary file ngsutils/__init__.pyc has changed diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bam/__init__.py --- a/ngsutils/bam/__init__.py Sun Nov 27 15:01:21 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,880 +0,0 @@ -import os -import re -import sys - -import ngsutils.support -import pysam -try: - from eta import ETA -except: - pass - - -def bam_open(fname, mode='r', *args, **kwargs): - if fname.lower()[-4:] == '.bam': - return pysam.Samfile(fname, '%sb' % mode, *args, **kwargs) - return pysam.Samfile(fname, '%s' % mode, *args, **kwargs) - - -def bam_pileup_iter(bam, mask=1796, quiet=False, callback=None): - if not quiet and bam.filename: - eta = ETA(os.stat(bam.filename).st_size) - else: - eta = None - - for pileup in bam.pileup(mask=mask): - pos = bam.tell() - bgz_offset = pos >> 16 - - if not quiet: - if callback: - eta.print_status(bgz_offset, extra=callback(pileup)) - else: - eta.print_status(bgz_offset, extra='%s:%s' % (bam.getrname(pileup.tid), pileup.pos)) - - yield pileup - - if eta: - eta.done() - - -def bam_iter(bam, quiet=False, show_ref_pos=False, ref=None, start=None, end=None, callback=None): - ''' - >>> [x.qname for x in bam_iter(bam_open(os.path.join(os.path.dirname(__file__), 't', 'test.bam')), quiet=True)] - ['A', 'B', 'E', 'C', 'D', 'F', 'Z'] - ''' - - if os.path.exists('%s.bai' % bam.filename): - # This is an indexed file, so it is ref sorted... - # Meaning that we should show chrom:pos, instead of read names - show_ref_pos = True - - eta = None - - if not ref: - if not quiet and bam.filename: - eta = ETA(os.stat(bam.filename).st_size) - - for read in bam: - pos = bam.tell() - bgz_offset = pos >> 16 - - if not quiet and eta: - if callback: - eta.print_status(bgz_offset, extra=callback(read)) - elif (show_ref_pos): - if read.tid > -1: - eta.print_status(bgz_offset, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname)) - else: - eta.print_status(bgz_offset, extra='unmapped %s' % (read.qname)) - else: - eta.print_status(bgz_offset, extra='%s' % read.qname) - - yield read - - else: - working_chrom = None - if ref in bam.references: - working_chrom = ref - elif ref[0:3] == 'chr': - # compensate for Ensembl vs UCSC ref naming - if ref[3:] in bam.references: - working_chrom = ref[3:] - - if not working_chrom: - raise ValueError('Missing reference: %s' % ref) - - tid = bam.gettid(working_chrom) - - if not start: - start = 0 - if not end: - end = bam.lengths[tid] - - if not quiet and bam.filename: - eta = ETA(end - start) - - for read in bam.fetch(working_chrom, start, end): - if not quiet and eta: - if callback: - eta.print_status(read.pos - start, extra=callback(read)) - else: - eta.print_status(read.pos - start, extra='%s:%s %s' % (bam.getrname(read.tid), read.pos, read.qname)) - - yield read - - if eta: - eta.done() - - -def bam_batch_reads(bam, quiet=False): - ''' - Batch mapping for the same reads (qname) together, this way - they can all be compared/converted together. - ''' - reads = [] - last = None - for read in bam_iter(bam, quiet=quiet): - if last and read.qname != last: - yield reads - reads = [] - last = read.qname - reads.append(read) - - if reads: - yield reads - - -bam_cigar = ['M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X'] -bam_cigar_op = { - 'M': 0, - 'I': 1, - 'D': 2, - 'N': 3, - 'S': 4, - 'H': 5, - 'P': 6, - '=': 7, - 'X': 8, -} - - -def cigar_fromstr(s): - ''' - >>> cigar_fromstr('10M5I20M') - [(0, 10), (1, 5), (0, 20)] - >>> cigar_fromstr('1M1I1D1N1S1H1P') - [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)] - ''' - ret = [] - spl = re.split('([0-9]+)', s)[1:] - for op, size in zip(spl[1::2], spl[::2]): - ret.append((bam_cigar_op[op], int(size))) - return ret - - -def cigar_tostr(cigar): - ''' - >>> cigar_tostr(((0, 10), (1, 5), (0, 20))) - '10M5I20M' - >>> cigar_tostr(((0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1))) - '1M1I1D1N1S1H1P' - ''' - - s = '' - - for op, size in cigar: - s += '%s%s' % (size, bam_cigar[op]) - - return s - - -def cigar_read_len(cigar): - ''' - >>> cigar_read_len(cigar_fromstr('8M')) - 8 - >>> cigar_read_len(cigar_fromstr('8M100N8M')) - 16 - >>> cigar_read_len(cigar_fromstr('8M10I8M')) - 26 - >>> cigar_read_len(cigar_fromstr('8M10D8M')) - 16 - ''' - - read_pos = 0 - - for op, length in cigar: - if op == 0: # M - read_pos += length - elif op == 1: # I - read_pos += length - elif op == 2: # D - pass - elif op == 3: # N - pass - elif op == 4: # S - read_pos += length - elif op == 5: # H - pass - elif op == 7: # = - read_pos += length - elif op == 8: # X - read_pos += length - else: - raise ValueError("Unsupported CIGAR operation: %s" % op) - - return read_pos - - -def read_calc_mismatches(read): - inserts = 0 - deletions = 0 - indels = 0 - edits = int(read.opt('NM')) - # - # NM counts the length of indels - # We really just care about *if* there is an indel, not the size - # - - for op, length in read.cigar: - if op == 1: - inserts += length - indels += 1 - elif op == 2: - deletions += length - indels += 1 - - return edits - inserts - deletions + indels - - -def _extract_md_matches(md, maxlength): - md_pos = 0 - - while md and md_pos < maxlength: - # preload a zero so that immediate mismatches will be caught - # the zero will have no affect otherwise... - tmp = '0' - - # look for matches - while md and md[0] in '0123456789': - tmp += md[0] - md = md[1:] - - pos = int(tmp) - if pos > maxlength: - return (maxlength, '%s%s' % (pos - maxlength, md)) - return (pos, md) - - -def read_calc_variations(read): - 'see _read_calc_variations' - for tup in _read_calc_variations(read.pos, read.cigar, read.opt('MD'), read.seq): - yield tup - - -def _read_calc_variations(start_pos, cigar, md, seq): - ''' - For each variation, outputs a tuple: (op, pos, seq) - - op - operation (0 = mismatch, 1 = insert, 2 = deletion) (like CIGAR) - pos - 0-based position of the variation (relative to reference) - seq - the base (or bases) involved in the variation - for mismatch or insert, this is the sequence inserted - for deletions, this is the reference sequence that was removed - - MD is the mismatch string. Not all aligners include the tag. If your aligner - doesn't include this, then you'll need to add it, or use a different function - (see: read_calc_mismatches_gen). - - Special care must be used to handle RNAseq reads that cross - an exon-exon junction. - - Also: MD is a *really* dumb format that can't be read correctly with - a regex. It must be processed in concert with the CIGAR alignment - in order to catch all edge cases. Some implementations insert 0's - at the end of inserts / deltions / variations to make parsing easier - but not everyone follows this. Look at the complex examples: the - CIGAR alignment may show an insert, but the MD just shows all matches. - - Examples: See: http://davetang.org/muse/2011/01/28/perl-and-sam/ - Also from CCBB actual mappings and manual altered (shortened, - made more complex) - (doctests included) - - Match/mismatch - CIGAR: 36M - MD:Z: 1A0C0C0C1T0C0T27 - MD:Z: 1ACCC1TCT27 (alternative) - 1 2 - 123456789012345678901234567890123456 - ref: CGATACGGGGACATCCGGCCTGCTCCTTCTCACATG - XXXX XXX - read: CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG - MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM - -ACCC-TCT--------------------------- - >>> list(_read_calc_variations(1, [(0,36)], '1A0C0C0C1T0C0T27', 'CACCCCTCTGACATCCGGCCTGCTCCTTCTCACATG')) - [(0, 2, 'A'), (0, 3, 'C'), (0, 4, 'C'), (0, 5, 'C'), (0, 7, 'T'), (0, 8, 'C'), (0, 9, 'T')] - - Insert - CIGAR: 6M1I29M - MD:Z: 0C1C0C1C0T0C27 - C1CC1CTC27 (alt) - 1 2 - 123456^789012345678901234567890123456 - ref: CACCCC^TCTGACATCCGGCCTGCTCCTTCTCACAT - X XX X|XX - read: GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT - MMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMM - G-GA-GGGG--------------------------- - >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], '0C1C0C1C0T0C27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT')) - [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')] - >>> list(_read_calc_variations(1, [(0,6), (1,1), (0, 29)], 'C1CC1CTC27', 'GAGACGGGGTGACATCCGGCCTGCTCCTTCTCACAT')) - [(0, 1, 'G'), (0, 3, 'G'), (0, 4, 'A'), (0, 6, 'G'), (1, 7, 'G'), (0, 7, 'G'), (0, 8, 'G')] - - - Deletion - CIGAR: 9M9D27M - MD:Z: 2G0A5^ATGATGTCA27 - 2GA5^ATGATGTCA27 (alt) - ref: AGGAATGGGATGATGTCAGGGGTTCCAGGTGGAGACGAGGACTCC - XX ^^^^^^^^^ - read: AGTGATGGG^^^^^^^^^GGGGTTCCAGGTGGAGACGAGGACTCC - MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM - --TG-----ATGATGTCA--------------------------- - >>> list(_read_calc_variations(1, [(0,9), (2,9), (0, 27)], '2G0A5^ATGATGTCA27', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC')) - [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA')] - - - Complex - CIGAR: 9M9D11M1I15M - MD:Z: 2G0A5^ATGATGTCAA26 - MD:Z: 2G0A5^ATGATGTCA0G26 (alt) - 1 2 3 4 - pos: 123456789012345678901234567890123456789012345 - ref: AGGAATGGGATGATGTCAGGGGTTCCAGG^GGAGACGAGGACTCC - XX ^^^^^^^^^X | - read: AGTGATGGG^^^^^^^^^AGGGTTCCAGGTGGAGACGAGGACTCC - MMMMMMMMMDDDDDDDDDMMMMMMMMMMMMMMMMMMMMMMMMMMM - --TG-----ATGATGTCAG----------T--------------- - >>> list(_read_calc_variations(1, [(0,9), (2,9), (0,11), (1,1), (0,15)], '2G0A5^ATGATGTCAA26', 'AGTGATGGGGGGGTTCCAGGTGGAGACGAGGACTCC')) - [(0, 3, 'T'), (0, 4, 'G'), (2, 10, 'ATGATGTCA'), (0, 19, 'G'), (1, 30, 'T')] - - - Complex example - inserts aren't separately handled by MD, only visible in CIGAR - CIGAR: 14M2D16M3I42M - MD:Z: 14^TC58 - 1 2 3 4 5 6 7 - pos: 12345678901234567890123456789012^^^345678901234567890123456789012345678901234567 - ref: caagtatcaccatgtcaggcatttttttcatt^^^tttgtagagagagaagacttgctatgttgcccaagctggcct - ^^ ||| - read: CAAGTATCACCATG^^AGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT - MMMMMMMMMMMMMMDDMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM - --------------tc----------------TGG------------------------------------------ - >>> list(_read_calc_variations(1, [(0,14), (2,2), (0,16), (1,3), (0,42)], '14^TC58', 'CAAGTATCACCATGAGGCATTTTTTTCATTTGGTTTGTAGAGAGAGAAGACTTGCTATGTTGCCCAAGCTGGCCT')) - [(2, 15, 'TC'), (1, 33, 'TGG')] - - - Complex example 2: - CIGAR: 41M3I10M1I5M1I2M2I10M - MD:Z: 44C2C6T6T6 - 1 2 3 4 5 6 - pos: 12345678901234567890123456789012345678901^^^2345678901^23456^78^^9012345678 - ref: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG - ||| X X | X | || X - read: AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG - MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM - -----------------------------------------tta---A--T---c---A----gt---G------ - - - 13M 28M 3I 10M 1I 5M 1I 2M 2I 10M - >>> list(_read_calc_variations(1, [(0, 41), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG')) - [(1, 42, 'TTA'), (0, 45, 'A'), (0, 48, 'T'), (1, 52, 'C'), (0, 55, 'A'), (1, 57, 'G'), (1, 59, 'GT'), (0, 62, 'G')] - - - Splice junction example: - CIGAR: 62M100N13M - MD:Z: 2T27C44 - 1 1 - 1 2 3 4 5 6 6 7 - pos: 12345678901234567890123456789012345678901234567890123456789012| [100] |3456789012345 - ref: CCTCATGACCAGCTTGTTGAAGAGATCCGACATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC - X X - read: CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCA|-------|CTTGCTCCTGCTC - --G---------------------------T----------------------------------------------------- - - >>> list(_read_calc_variations(1, [(0,62), (4,100), (0,13)], '2T27C44', 'CCGCATGACCAGCTTGTTGAAGAGATCCGATATCAAGTGCCCACCTTGGCTCGTGGCTCTCACTTGCTCCTGCTC')) - [(0, 3, 'G'), (0, 31, 'T')] - - - Splice junction example 2: - CIGAR: 13M100N28M3I10M1I5M1I2M2I10M - MD:Z: 44C2C6T6T6 - 1 1 1 1 1 - 1 2 3 4 5 6 - pos: 1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678 - ref: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG - ||| X X | X | || X - read: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG - MMMMMMMMMMMMM MMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM - ------------- ----------------------------tta---A--T---c---A----gt---G------ - - 13M 100N 28M 3I 10M 1I 5M 1I 2M 2I 10M - >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 28), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '44C2C6T6T6', 'AGGGTGGCGAGATCGATGACGGCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG')) - [(1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')] - - - Splice junction example 2A: - CIGAR: 13M100N7M2D19M3I10M1I5M1I2M2I10M - MD:Z: 9A10^GG22C2C6T6T6 - 1 1 1 1 1 - 1 2 3 4 5 6 - pos: 1234567890123| [100] |4567890123456789012345678901^^^2345678901^23456^78^^9012345678 - ref: AGGGTGGCGAGAT|-------|CGATGACGGCATTGGCGATGGTGATCTT^^^GAGCCACATG^CGGTC^GC^^GGATCTCCAG - ^^ ||| X X | X | || X - read: AGGGTGGCGCGAT|-------|CGATGAC^^CATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG - MMMMMMMMMMMMM MMMMMMMDDMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMIMMMMMIMMIIMMMMMMMMMM - ---------C--- ----------------------------tta---A--T---c---A----gt---G------ - .........A... .......GG................... ...C..C... ...T. .. ...T...... - 9 A 10 ^GG 22 C 2C 6 T 6 T 6 - - >>> list(_read_calc_variations(1, [(0, 13), (3, 100), (0, 7), (2, 2), (0, 19), (1, 3), (0, 10), (1, 1), (0, 5), (1, 1), (0, 2), (1, 2), (0, 10)], '9A10^GG22C2C6T6T6', 'AGGGTGGCGCGATCGATGACCATTGGCGATGGTGATCTTTTAGAGACATATGCCGGACGGCGTGGAGCTCCAG')) - [(0, 10, 'C'), (2, 121, 'GG'), (1, 142, 'TTA'), (0, 145, 'A'), (0, 148, 'T'), (1, 152, 'C'), (0, 155, 'A'), (1, 157, 'G'), (1, 159, 'GT'), (0, 162, 'G')] - - Real Example - 242_1071_1799_B1 - CIGAR: 42M10I3M1D9M1D11M - MD:Z: 27G16A0^T6C2^T1C9 - 1 2 3 4 5 6 7 - pos: 123456789012345678901234567890123456789012 345678901234567890123456789012345 - ref: ACTGAGAAACCCAACCCTCTGAGACCAGCACACCCCTTTCAA^^^^^^^^^^GCATGTTCCTCCCTCCCCTTCTTTG - X X^ X ^ X - read: ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCC^GTTCCTGCC^CGCCTTCTTTG - MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIMMMDMMMMMMMMMDMMMMMMMMMMM - ---------------------------A--------------^^^^^^^^^^--CT------G--T-G--------- - - >>> list(_read_calc_variations(1, [(0,42), (1,10), (0, 3), (2, 1), (0, 9), (2, 1), (0, 11)], '27G16A0^T6C2^T1C9', 'ACTGAGAAACCCAACCCTCTGAGACCAACACACCCCTTTCAACACATTTTTGGCCGTTCCTGCCCGCCTTCTTTG', )) - [(0, 28, 'A'), (1, 43, 'CACATTTTTG'), (0, 45, 'C'), (2, 46, 'T'), (0, 53, 'G'), (2, 56, 'T'), (0, 58, 'G')] - - - Real example 2 - 577_1692_891_A1 - CIGAR: 34M100N39M2I - MD:Z: 3T69 - 1 1 1 1 - 1 2 3 4 5 6 7 - pos: 1234567890123456789012345678901234| [100] |567890123456789012345678901234567890123 - ref: GGATTCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTC^^ - - read: GGAATCTTCCCACTGGGTCGATGTTGTTTGTGAT|-------|CTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC - MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM NNNNN MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMII - ---A------------------------------ ---------------------------------------TC - - >>> list(_read_calc_variations(1, [(0,34), (3,100), (0, 39), (1, 2)], '3T69', 'GGAATCTTCCCACTGGGTCGATGTTGTTTGTGATCTGAGAGAGAGTTGCATCTGCACATGCTTTCCTGGCGTCTC', )) - [(0, 4, 'A'), (1, 174, 'TC')] - - ''' - - ref_pos = start_pos - read_pos = 0 - - for op, length in cigar: - if md and md[0] == '0': - md = md[1:] - # sys.stderr.write('%s, %s, %s\n' %(op, length, md)) - if op == 0: # M - # how far in the chunk are we? (do *not* update ref_pos until end) - md_pos = 0 - last = None - while md and md_pos < length: - if last == (op, length, md): - sys.stderr.write('\nInfinite loop in variant finding!\nPos: %s\nCIGAR: (%s, %s)\n' % (ref_pos, op, length)) - sys.exit(1) - last = (op, length, md) - # sys.stderr.write('%s, %s, %s\n' %(op, length, md)) - chunk_size, md = _extract_md_matches(md, length - md_pos) - # sys.stderr.write(' -> %s, %s\n' %(chunk_size, md)) - md_pos += chunk_size - - # look for mismatches - while md_pos < length and md and md[0] not in '0123456789^': - yield (op, ref_pos + md_pos, seq[read_pos + md_pos]) - md = md[1:] - - md_pos += 1 - - ref_pos += length - read_pos += length - - elif op == 1: # I - # nothing in MD about inserts... - yield (op, ref_pos, seq[read_pos:read_pos + length]) - read_pos += length - - elif op == 2: # D - # prefixed with '^' and includes all of the removed bases - if md[0] == '^': - md = md[1:] - yield (op, ref_pos, md[:length]) - md = md[length:] - ref_pos += length - - elif op == 3: # N - ref_pos += length - - -def read_calc_mismatches_gen(ref, read, chrom): - start = read.pos - ref_pos = 0 - read_pos = 0 - - for op, length in read.cigar: - if op == 1: - yield ref_pos, op, None - read_pos += length - elif op == 2: - yield ref_pos, op, None - ref_pos += length - elif op == 3: - ref_pos += length - elif op == 0: - refseq = ref.fetch(chrom, start + ref_pos, start + ref_pos + length) - if not refseq: - raise ValueError("Reference '%s' not found in FASTA file: %s" % (chrom, ref.filename)) - cur_pos = start + ref_pos - for refbase, readbase in zip(refseq.upper(), read.seq[read_pos:read_pos + length].upper()): - if refbase != readbase: - yield op, cur_pos, readbase - cur_pos += 1 - ref_pos += length - read_pos += length - else: - raise ValueError("Unsupported CIGAR operation: %s" % op) - - -def read_calc_mismatches_ref(ref, read, chrom): - edits = 0 - - for op, pos, base in read_calc_mismatches_gen(ref, read, chrom): - edits += 1 - - return edits - - -__region_cache = {} - - -def region_pos_to_genomic_pos(name, start, cigar): - ''' - converts a junction position to a genomic location given a junction - ref name, the junction position, and the cigar alignment. - - returns: (genomic ref, genomic pos, genomic cigar) - - >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050,3000-4000', 25, [(0, 100)]) - ('chr1', 1025, [(0, 25), (3, 950), (0, 50), (3, 950), (0, 25)]) - - >>> region_pos_to_genomic_pos('chr1:1000-1050,1050-1200', 25, [(0, 100)]) - ('chr1', 1025, [(0, 25), (3, 0), (0, 75)]) - - >>> region_pos_to_genomic_pos('chr3R:17630851-17630897,17634338-17634384', 17, [(0, 39)]) - ('chr3R', 17630868, [(0, 29), (3, 3441), (0, 10)]) - - >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(4, 25), (0, 50), (4, 25)]) - ('chr1', 1025, [(4, 25), (0, 25), (3, 950), (0, 25), (4, 25)]) - - >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)]) - ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)]) - - >>> region_pos_to_genomic_pos('chr1:1000-1050,2000-2050', 25, [(5, 25), (0, 75)]) - ('chr1', 1025, [(5, 25), (0, 25), (3, 950), (0, 50)]) - - >>> region_pos_to_genomic_pos('chr7:16829153-16829246,16829246-16829339', 62, cigar_fromstr('83M18S')) - ('chr7', 16829215, [(0, 31), (3, 0), (0, 52), (4, 18)]) - - - ''' - - if name in __region_cache: - chrom, fragments = __region_cache[name] - else: - c1 = name.split(':') - chrom = c1[0] - - fragments = [] - for fragment in c1[1].split(','): - s, e = fragment.split('-') - fragments.append((int(s), int(e))) - - __region_cache[name] = (chrom, fragments) - - chr_cigar = [] - chr_start = fragments[0][0] - - read_start = int(start) - - frag_idx = 0 - frag_start = 0 - frag_end = 0 - - for i, (s, e) in enumerate(fragments): - if chr_start + read_start < e: - chr_start += read_start - frag_idx = i - frag_start = s - frag_end = e - break - - else: - chr_start += (e - s) - read_start -= (e - s) - - cur_pos = chr_start - - for op, length in cigar: - if op in [1, 4, 5]: - chr_cigar.append((op, length)) - - elif op in [0, 2]: - if cur_pos + length <= frag_end: - cur_pos += length - chr_cigar.append((op, length)) - - else: - while cur_pos + length > frag_end: - if frag_end - cur_pos > 0: - chr_cigar.append((op, frag_end - cur_pos)) - length -= (frag_end - cur_pos) - cur_pos = frag_end - frag_idx += 1 - if len(fragments) <= frag_idx: - print 'ERROR converting: ', name, fragments - return (chrom, 0, chr_cigar) - frag_start, frag_end = fragments[frag_idx] - chr_cigar.append((3, frag_start - cur_pos)) - cur_pos = frag_start - - cur_pos = cur_pos + length - chr_cigar.append((op, length)) - else: - print "Unsupported CIGAR operation (%s)" % bam_cigar[op] - sys.exit(1) - - return (chrom, chr_start, chr_cigar) - - -def is_junction_valid(cigar, min_overlap=4): - ''' - Does the genomic cigar alignment represent a 'good' alignment. - Used for checking junction->genome alignments - - 1) the alignment must not start at a splice junction - 2) the alignment must not start or end with an overhang - 3) the alignment must overhang the splice junction by min_overlap (4) - - | Exon1 | Intron | Exon2 | - |-----------------|oooooooooooooooo|------------------| - XXXXXXXXXXXXXXXXXXXXXXXX (bad 1) - XXXXXXXXXXXXX (bad 2) XXXXXXXXXXXXXXXX (bad 2) - XX-----------------XXXXXXXXXXXXXXXXX (bad 3) - - >>> is_junction_valid(cigar_fromstr('1000N40M')) - (False, 'Starts at gap (1000N40M)') - - >>> is_junction_valid(cigar_fromstr('100M')) - (False, "Doesn't cover junction") - - >>> is_junction_valid(cigar_fromstr('100M1000N3M'), 4) - (False, "Too short overlap at 3' (100M1000N3M)") - - >>> is_junction_valid(cigar_fromstr('2M1000N100M'), 4) - (False, "Too short overlap at 5' (2M1000N100M)") - - >>> is_junction_valid(cigar_fromstr('4M1000N100M'), 4) - (True, '') - - >>> is_junction_valid(cigar_fromstr('100M1000N4M'), 4) - (True, '') - - >>> is_junction_valid(cigar_fromstr('4M0N100M'), 4) - (True, '') - - ''' - first = True - pre_gap = True - - pre_gap_count = 0 - post_gap_count = 0 - - has_gap = False - - for op, length in cigar: - # mapping can't start at a gap - if first and op == 3: - return (False, 'Starts at gap (%s)' % cigar_tostr(cigar)) - first = False - - if op == 3: - pre_gap = False - post_gap_count = 0 - has_gap = True - - elif pre_gap: - pre_gap_count += length - else: - post_gap_count += length - - # mapping must start with more than min_overlap base match - - if not has_gap: - return (False, "Doesn't cover junction") - elif pre_gap_count < min_overlap: - return (False, "Too short overlap at 5' (%s)" % cigar_tostr(cigar)) - elif post_gap_count < min_overlap: - return (False, "Too short overlap at 3' (%s)" % cigar_tostr(cigar)) - - return True, '' - - -def read_alignment_fragments_gen(read): - ''' - Takes a read and returns the start and end positions for each match/mismatch region. - This will let us know where each read alignment "touches" the genome. - ''' - - for start, end in _read_alignment_fragments_gen(read.pos, read.cigar): - yield (start, end) - - -def _read_alignment_fragments_gen(pos, cigar): - ''' Test-able version of read_alignment_fragments_gen - >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('50M'))) - [(1, 51)] - - >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('25M100N25M'))) - [(1, 26), (126, 151)] - - >>> list(_read_alignment_fragments_gen(1, cigar_fromstr('20M1D4M100N10M5I10M'))) - [(1, 21), (22, 26), (126, 136), (136, 146)] - ''' - ref_pos = pos - - for op, length in cigar: - if op == 0: - yield (ref_pos, ref_pos + length) - ref_pos += length - elif op == 1: - pass - elif op == 2: - ref_pos += length - elif op == 3: - ref_pos += length - else: - raise ValueError("Unsupported CIGAR operation: %s" % op) - - -def read_cigar_at_pos(cigar, qpos, is_del): - ''' - Returns the CIGAR operation for a given read position - - qpos is the 0-based index of the base in the read - ''' - pos = 0 - returnnext = False - for op, length in cigar: - if returnnext: - return op - if op == 0: - pos += length - elif op == 1: - pos += length - elif op == 2: - pass - elif op == 3: - pass - elif op == 4: - pos += length - elif op == 5: - pass - else: - raise ValueError("Unsupported CIGAR operation: %s" % op) - - if pos > qpos: - return op - if is_del and pos == qpos: - returnnext = True - - return None - - -def cleancigar(cigar): - ''' - Cleans a CIGAR alignment to remove zero length operations - - >>> cigar_tostr(cleancigar(cigar_fromstr('31M0N52M18S'))) - '83M18S' - - ''' - newcigar = [] - - changed = False - zero = False - - for op, size in cigar: - if size > 0: - if zero and newcigar: - if newcigar[-1][0] == op: - newsize = newcigar[-1][1] + size - newcigar[-1] = (op, newsize) - zero = 0 - else: - newcigar.append((op, size)) - else: - changed = True - zero = True - - if changed: - return newcigar - - return None - - -def read_cleancigar(read): - ''' - Replaces the CIGAR string for a read to remove any operations that are zero length. - - This happens to the read in-place - ''' - if read.is_unmapped: - return False - - newcigar = [] - - newcigar = cleancigar(read.cigar) - - if newcigar: - read.cigar = newcigar - return True - - return False - - -def read_to_unmapped(read, ref=None): - ''' - Converts a read from mapped to unmapped. - - Sets the 'ZR' tag to indicate the original ref/pos/cigar (if ref is passed) - ''' - - newread = pysam.AlignedRead() - - if ref: - tags = [('ZR', '%s:%s:%s' % (ref, read.pos, cigar_tostr(read.cigar)))] - - newread.is_unmapped = True - newread.mapq = 0 - newread.tlen = 0 - newread.pos = -1 - newread.pnext = -1 - newread.rnext = -1 - newread.tid = -1 - - newread.qname = read.qname - - if read.is_paired: - newread.is_paired = True - - if not read.is_unmapped and read.is_reverse: - newread.seq = ngsutils.support.revcomp(read.seq) - newread.qual = read.qual[::-1] - else: - newread.seq = read.seq - newread.qual = read.qual - - newread.tags = tags - - return newread - - -if __name__ == '__main__': - import doctest - doctest.testmod() diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bam/__init__.pyc Binary file ngsutils/bam/__init__.pyc has changed diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bed/__init__.py --- a/ngsutils/bed/__init__.py Sun Nov 27 15:01:21 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,282 +0,0 @@ -import os - -import ngsutils.support.ngs_utils -import pysam - - -class BedStreamer(object): - ''' - Streams BedRegions from a BED file - - Note - this can only be used once! There is no mechanism to seek the stream. - ''' - - def __init__(self, fname=None, fileobj=None, quiet=False): - if not fname and not fileobj: - raise ValueError("You must specify either fname or fileobj!") - - self.reader = ngsutils.support.gzip_reader(fname=fname, quiet=quiet, fileobj=fileobj) - - def __iter__(self): - return self - - def next(self): - try: - while True: - line = self.reader.next().strip() - if line and line[0] != '#': - cols = line.split('\t') - while len(cols) < 6: - cols.append('') - - return BedRegion(*cols) - except: - raise StopIteration - - -class BedFile(object): - ''' - BED files are read in their entirety memory, in a series of bins. Each bin - is ~100kb in size. Each bin can then be iterated over. - - This is less efficient than using a proper index, but in reality, this - usually isn't an issue. However, if the BED file has been Tabix indexed, - that index will be used for random access. - - NOTE: This isn't very efficient, so perhaps this can be remodeled into a BedFile - and a BedFileIndex where the file is indexed only if random access is requested. - ''' - - _bin_const = 100000 - - def __init__(self, fname=None, fileobj=None, region=None): - self._bins = {} - self._bin_list = [] - self._cur_bin_idx = 0 - self._cur_bin_pos = 0 - self._tellpos = 0 - self._total = 0 - self._length = 0 - self.__tabix = None - - self.filename = fname - - if os.path.exists('%s.tbi' % fname): - self.__tabix = pysam.Tabixfile(fname) - - if fileobj: - self.__readfile(fileobj) - elif fname: - with ngsutils.support.ngs_utils.gzip_opener(fname) as fobj: - self.__readfile(fobj) - elif region: - chrom, startend = region.split(':') - if '-' in startend: - start, end = [int(x) for x in startend.split('-')] - else: - start = int(startend) - end = start - start -= 1 - - self.__add_region(BedRegion(chrom, start, end)) - else: - raise ValueError("Must specify either filename, fileobj, or region") - - def __readfile(self, fobj): - for line in fobj: - line = line.strip() - if line and line[0] != '#': - cols = line.split('\t') - while len(cols) < 6: - cols.append('') - - region = BedRegion(*cols) - self.__add_region(region) - - self._bin_list.sort() - for bin in self._bins: - self._bins[bin].sort() - - def __add_region(self, region): - self._total += region.end - region.start - self._length += 1 - - startbin = region.start / BedFile._bin_const - endbin = region.end / BedFile._bin_const - - for bin in xrange(startbin, endbin + 1): - if not (region.chrom, bin) in self._bins: - self._bin_list.append((region.chrom, bin)) - self._bins[(region.chrom, bin)] = [] - self._bins[(region.chrom, bin)].append(region) - - def fetch(self, chrom, start, end, strand=None): - ''' - For TABIX indexed BED files, find all regions w/in a range - - For non-TABIX index BED files, use the calculated bins, and - output matching regions - ''' - - if self.__tabix: - for match in self.__tabix.fetch(chrom, start, end): - region = BedRegion(*match.split('\t')) - if not strand or (strand and region.strand == strand): - yield region - else: - startbin = start / BedFile._bin_const - endbin = end / BedFile._bin_const - - buf = set() - - for bin in xrange(startbin, endbin + 1): - if (chrom, bin) in self._bins: - for region in self._bins[(chrom, bin)]: - if strand and strand != region.strand: - continue - if start <= region.start <= end or start <= region.end <= end: - if region not in buf: - yield region - buf.add(region) - elif region.start < start and region.end > end: - if region not in buf: - yield region - buf.add(region) - - def tell(self): - return self._tellpos - - def close(self): - pass - - @property - def length(self): - return self._length - - @property - def total(self): - return self._total - - def __iter__(self): - self._cur_bin_idx = 0 - self._cur_bin_pos = 0 - self._tellpos = 0 - return self - - def next(self): - if self._cur_bin_idx >= len(self._bin_list): - raise StopIteration - - binvals = self._bins[self._bin_list[self._cur_bin_idx]] - while self._cur_bin_pos < len(binvals): - val = binvals[self._cur_bin_pos] - self._cur_bin_pos += 1 - - startbin = (val.chrom, val.start / BedFile._bin_const) - if startbin == self._bin_list[self._cur_bin_idx]: - self._tellpos += 1 - return val - - self._cur_bin_idx += 1 - self._cur_bin_pos = 0 - return self.next() - - -class BedRegion(object): - def __init__(self, chrom, start, end, name='', score='', strand='', thickStart='', thickEnd='', rgb='', *args): - self.chrom = chrom - self.start = int(start) - self.end = int(end) - self.name = name - - if score == '': - self.score = 0 - else: - self.score = float(score) - - if strand == '': - self.strand = None - else: - self.strand = strand - - if thickStart == '': - self.thickStart = None - else: - self.thickStart = thickStart - - if thickEnd == '': - self.thickEnd = None - else: - self.thickEnd = thickEnd - - if rgb == '': - self.rgb = None - else: - self.rgb = rgb - - self.extras = args - - def clone(self, chrom=None, start=None, end=None, name=None, score=None, strand=None, thickStart=None, thickEnd=None, rgb=None, *args): - cols = [] - cols.append(self.chrom if chrom is None else chrom) - cols.append(self.start if start is None else start) - cols.append(self.end if end is None else end) - cols.append(self.name if name is None else name) - cols.append(self.score if score is None else score) - cols.append(self.strand if strand is None else strand) - cols.append(self.thickStart if thickStart is None else thickStart) - cols.append(self.thickEnd if thickEnd is None else thickEnd) - cols.append(self.rgb if rgb is None else rgb) - - for i, val in enumerate(self.extras): - if len(args) > i: - cols.append(args[i]) - else: - cols.append(val) - - return BedRegion(*cols) - - @property - def score_int(self): - score = str(self.score) - if score[-2:] == '.0': - score = score[:-2] - - return score - - def __key(self): - return (self.chrom, self.start, self.end, self.strand, self.name) - - def __lt__(self, other): - return self.__key() < other.__key() - - def __gt__(self, other): - return self.__key() > other.__key() - - def __eq__(self, other): - return self.__key() == other.__key() - - def write(self, out): - out.write('%s\n' % self) - - def __repr__(self): - outcols = [] - - if self.rgb: - outcols.append(self.rgb) - if self.thickEnd or outcols: - outcols.append(self.thickEnd if self.thickEnd else self.end) - if self.thickStart or outcols: - outcols.append(self.thickStart if self.thickStart else self.start) - if self.strand or outcols: - outcols.append(self.strand) - if self.score_int != '' or outcols: - outcols.append(self.score_int) - if self.name or outcols: - outcols.append(self.name) - - outcols.append(self.end) - outcols.append(self.start) - outcols.append(self.chrom) - - return '\t'. join([str(x) for x in outcols[::-1]]) diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/bed/__init__.pyc Binary file ngsutils/bed/__init__.pyc has changed diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/__init__.py --- a/ngsutils/support/__init__.py Sun Nov 27 15:01:21 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,254 +0,0 @@ -import collections -import gzip -import os -import re -import sys -try: - from eta import ETA -except: - pass - - -class FASTARead(collections.namedtuple('FASTARecord', 'name comment seq')): - def __repr__(self): - if self.comment: - return '>%s %s\n%s\n' % (self.name, self.comment, self.seq) - return '>%s\n%s\n' % (self.name, self.seq) - - def subseq(self, start, end, comment=None): - if self.comment: - comment = '%s %s' % (self.comment, comment) - - return FASTARead(self.name, comment, self.seq[start:end]) - - def clone(self, name=None, comment=None, seq=None): - n = name if name else self.name - c = comment if comment else self.comment - s = seq if seq else self.seq - - return FASTARead(n, c, s) - - def write(self, out): - out.write(repr(self)) - - -class FASTA(object): - def __init__(self, fname=None, fileobj=None, qual=False): - self.fname = fname - self.qual = qual - if fileobj: - self.fileobj = fileobj - else: - if self.fname == '-': - self.fileobj = sys.stdin - elif self.fname[-3:] == '.gz' or self.fname[-4:] == '.bgz': - self.fileobj = gzip.open(os.path.expanduser(self.fname)) - else: - self.fileobj = open(os.path.expanduser(self.fname)) - - if not self.fileobj: - raise ValueError("Missing valid filename or fileobj") - - def close(self): - if self.fileobj != sys.stdout: - self.fileobj.close() - - def tell(self): - # always relative to uncompressed... - return self.fileobj.tell() - - def seek(self, pos, whence=0): - self.fileobj.seek(pos, whence) - - def fetch(self, quiet=False): - name = '' - comment = '' - seq = '' - - if not quiet and self.fname and self.fname != '-': - eta = ETA(os.stat(self.fname).st_size, fileobj=self.fileobj) - else: - eta = None - - for line in self.fileobj: - line = line.strip() - if not line: - continue - if line[0] == '#': - continue - - if line[0] == '>': - if name and seq: - if eta: - eta.print_status(extra=name) - yield FASTARead(name, comment, seq) - - spl = re.split(r'[ \t]', line[1:], maxsplit=1) - name = spl[0] - if len(spl) > 1: - comment = spl[1] - else: - comment = '' - seq = '' - - else: - if self.qual: - seq = seq + ' ' + line - else: - seq += line - - if name and seq: - if eta: - eta.print_status(extra=name) - yield FASTARead(name, comment, seq) - - if eta: - eta.done() - - -def gzip_reader(fname, quiet=False, callback=None, done_callback=None, fileobj=None): - if fileobj: - f = fileobj - elif fname == '-': - f = sys.stdin - elif fname[-3:] == '.gz' or fname[-4:] == '.bgz': - f = gzip.open(os.path.expanduser(fname)) - else: - f = open(os.path.expanduser(fname)) - - if quiet or fname == '-': - eta = None - else: - eta = ETA(os.stat(fname).st_size, fileobj=f) - - for line in f: - if eta: - if callback: - extra = callback() - else: - extra = '' - - eta.print_status(extra=extra) - yield line - - if done_callback and done_callback(): - break - - if f != sys.stdin: - f.close() - - if eta: - eta.done() - - -class Symbolize(object): - 'Converts strings to symbols - basically a cache of strings' - - def __init__(self): - self.__cache = {} - - def __getitem__(self, k): - if k not in self.__cache: - self.__cache[k] = k - - return self.__cache[k] - - -symbols = Symbolize() - -_compliments = { - 'a': 't', - 'A': 'T', - 'c': 'g', - 'C': 'G', - 'g': 'c', - 'G': 'C', - 't': 'a', - 'T': 'A', - 'n': 'n', - 'N': 'N' -} - - -def revcomp(seq): - ''' - >>> revcomp('ATCGatcg') - 'cgatCGAT' - ''' - ret = [] - - for s in seq: - ret.append(_compliments[s]) - - ret.reverse() - return ''.join(ret) - - -class Counts(object): - ''' - Setup simple binning. Bins are continuous 0->max. Values are added to - bins and then means / distributions can be calculated. - ''' - - def __init__(self): - self.bins = [] - - def add(self, val): - while len(self.bins) <= val: - self.bins.append(0) - self.bins[val] += 1 - - def mean(self): - acc = 0 - count = 0 - - for i, val in enumerate(self.bins): - acc += (i * val) - count += val - - if count > 0: - return float(acc) / count - - def max(self): - return len(self.bins) - 1 - - -def memoize(func): - if 'TESTING' in os.environ or 'DEBUG' in os.environ: - return func - - __cache = {} - - def inner(*args, **kwargs): - k = (args, tuple(kwargs.iteritems())) - if k not in __cache: - __cache[k] = func(*args, **kwargs) - return __cache[k] - - inner.__doc__ = '(@memoized %s)\n%s' % (func.__name__, func.__doc__) - return inner - - -def quoted_split(s, delim, quote_char='"'): - tokens = [] - - buf = "" - inquote = False - - for c in s: - if inquote: - buf += c - if c == quote_char: - inquote = False - elif c == delim: - tokens.append(buf) - buf = "" - else: - buf += c - if c == quote_char: - inquote = True - - if buf: - tokens.append(buf) - - return tokens diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/__init__.pyc Binary file ngsutils/support/__init__.pyc has changed diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/bgzip.py --- a/ngsutils/support/bgzip.py Sun Nov 27 15:01:21 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,138 +0,0 @@ -#!/usr/bin/env python -''' -Extract the blocks from a BGZip file. - -BAM files are stored as blocks in a bgzip archive. This class -will load the bgzip archive and output the block information. -''' - -import os -import struct -import sys - - -class BGZip(object): - def __init__(self, fname): - self.fname = fname - self.pos = 0 - self.fileobj = open(self.fname) - self.fsize = os.stat(self.fname).st_size - self.cur_chunk = 0 - - self.cpos = 0 - self.cdata = 0 - - def close(self): - self.fileobj.close() - - def seek(self, offset): - bgz_offset = offset >> 16 - chunk_offset = offset & 0xFFFF - - self.fileobj.seek(bgz_offset) - self.read_chunk() - self.chunk_pos = chunk_offset - - def read(self, amount, whence=0): - if whence not in [0, 1]: - print "Bad Whence value!: %s" % whence - return - - if whence == 0: - self.seek(0, 0) - - # read into chunk, if not enough data in chunk, read next chunk - ret = '' - while amount and self.pos < self.fsize: - if len(self.cdata) - self.cpos < amount: - ret += self.cdata[self.cpos:self.cpos + amount] - self.cpos += amount - return ret - else: - ret += self.cdata[self.cpos:] - amount = amount - len(ret) - self.read_chunk() - return ret - - def read_chunk(self): - self.fileobj.seek(10) - id1, id2, cm, flg, mtime, xfl, os, xlen = self._read_fields('', op, chrom, pos, base - print snp - print snp.alleles - if exit: - sys.exit(1) - - def is_valid_variation(self, chrom, op, pos, seq, verbose=False): - for snp in self.fetch(chrom, pos): - if '/' not in snp.observed or snp.clazz not in ['single', 'mixed', 'in-del', 'insertion', 'deletion']: - # these are odd variations that we can't deal with... (microsatellites, tooLongToDisplay members, etc) - continue - - if op == 0: - if snp.clazz in ['single', 'mixed'] and seq in snp.alleles: - return True - # elif verbose: - # for alt in snp.alleles: - # if len(alt) > 1: - # self.dump(chrom, op, pos, seq, snp, False) - - elif op == 1: - if snp.clazz in ['insertion', 'mixed', 'in-del']: - if seq in snp.alleles: - if len(seq) > 1 and verbose: - self.dump(chrom, op, pos, seq, snp, False) - return True - - if verbose: - if len(seq) > 1: - self.dump(chrom, op, pos, seq, snp, False) - else: - for alt in snp.alleles: - if len(alt) > 1: - self.dump(chrom, op, pos, seq, snp, False) - - elif op == 2: - if snp.clazz in ['deletion', 'mixed', 'in-del']: - if '-' in snp.alleles and seq in snp.alleles: - if len(seq) > 1 and verbose: - self.dump(chrom, op, pos, seq, snp, False) - return True - - return False diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/dbsnp.pyc Binary file ngsutils/support/dbsnp.pyc has changed diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/llh.py --- a/ngsutils/support/llh.py Sun Nov 27 15:01:21 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,56 +0,0 @@ -''' -Methods for calculating log-likelihoods for nucleotide frequencies -''' -import collections -import math - -from ngsutils.support import memoize - -_default_background = {'A': 0.3, 'T': 0.3, 'C': 0.2, 'G': 0.2} - -NucleotideLogLikelihood = collections.namedtuple('NucleotideLogLikelihood', 'A C G T pseudo') - - -@memoize -def pseudo_count(N, bg): - ''' - >>> pseudo_count(100, _default_background['A']) - 3 - >>> pseudo_count(100, _default_background['C']) - 2 - ''' - - return bg * math.sqrt(N) - - -def calc_llh(A, C, G, T, bg=_default_background, pseudo='auto'): - if pseudo == 'auto': - N = A + C + G + T - Ap = float(A) + pseudo_count(N, bg['A']) - Cp = float(C) + pseudo_count(N, bg['C']) - Gp = float(G) + pseudo_count(N, bg['G']) - Tp = float(T) + pseudo_count(N, bg['T']) - elif pseudo: - Ap = float(A) + pseudo - Cp = float(C) + pseudo - Gp = float(G) + pseudo - Tp = float(T) + pseudo - else: - Ap = float(A) - Cp = float(C) - Gp = float(G) - Tp = float(T) - - Np = Ap + Cp + Gp + Tp - - freqA = Ap / Np - freqC = Cp / Np - freqG = Gp / Np - freqT = Tp / Np - - return NucleotideLogLikelihood(math.log(freqA / bg['A']), math.log(freqC / bg['C']), math.log(freqG / bg['G']), math.log(freqT / bg['T']), pseudo) - - -if __name__ == '__main__': - import doctest - doctest.testmod() diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/ngs_utils.py --- a/ngsutils/support/ngs_utils.py Sun Nov 27 15:01:21 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,225 +0,0 @@ -#!/usr/bin/env python -""" -Common util classes / functions for the NGS project -""" -import collections -import gzip -import os -import re -import sys - - -def format_number(n): - ''' - >>> format_number(1000) - '1,000' - >>> format_number(1234567) - '1,234,567' - ''' - ar = list(str(n)) - for i in range(len(ar))[::-3][1:]: - ar.insert(i + 1, ',') - return ''.join(ar) - - -def natural_sort(ar): - ''' - >>> natural_sort('1 3 4 2 5'.split()) - ['1', '2', '3', '4', '5'] - >>> natural_sort('1 10 20 2 3 4'.split()) - ['1', '2', '3', '4', '10', '20'] - ''' - to_sort = [] - for item in ar: - spl = re.split('(\d+)', item) - l2 = [] - for el in spl: - try: - n = int(el) - except: - n = el - l2.append(n) - to_sort.append((l2, item)) - - to_sort.sort() - return [x[1] for x in to_sort] - - -def dictify(values, colnames): - """ - Convert a list of values into a dictionary based upon given column names. - - If the column name starts with an '@', the value is assumed to be a comma - separated list. - - If the name starts with a '#', the value is assumed to be an int. - - If the name starts with '@#', the value is assumed to a comma separated - list of ints. - - """ - d = {} - for i in xrange(len(colnames)): - key = colnames[i] - split = False - num = False - - if key[0] == '@': - key = key[1:] - split = True - if key[0] == '#': - key = key[1:] - num = True - - if i < len(values): - if num and split: - val = [int(x) for x in values[i].rstrip(',').split(',')] - elif num: - val = int(values[i]) - elif split: - val = values[i].rstrip(',').split(',') - else: - val = values[i] - - d[key] = val - - else: - d[key] = None - - return d - - -def gzip_aware_open(fname): - if fname == '-': - f = sys.stdin - elif fname[-3:] == '.gz' or fname[-4:] == '.bgz': - f = gzip.open(os.path.expanduser(fname)) - else: - f = open(os.path.expanduser(fname)) - return f - - -class gzip_opener: - ''' - A Python 2.6 class to handle 'with' opening of text files that may - or may not be gzip compressed. - ''' - - def __init__(self, fname): - self.fname = fname - - def __enter__(self): - self.f = gzip_aware_open(self.fname) - return self.f - - def __exit__(self, type, value, traceback): - if self.f != sys.stdin: - self.f.close() - return False - - -def filenames_to_uniq(names, new_delim='.'): - ''' - Given a set of file names, produce a list of names consisting of the - uniq parts of the names. This works from the end of the name. Chunks of - the name are split on '.' and '-'. - - For example: - A.foo.bar.txt - B.foo.bar.txt - returns: ['A','B'] - - AA.BB.foo.txt - CC.foo.txt - returns: ['AA.BB','CC'] - - >>> filenames_to_uniq('a.foo.bar.txt b.foo.bar.txt'.split()) - ['a', 'b'] - >>> filenames_to_uniq('a.b.foo.txt c.foo.txt'.split()) - ['a.b', 'c'] - - ''' - name_words = [] - maxlen = 0 - for name in names: - name_words.append(name.replace('.', ' ').replace('-', ' ').strip().split()) - name_words[-1].reverse() - if len(name_words[-1]) > maxlen: - maxlen = len(name_words[-1]) - - common = [False, ] * maxlen - for i in xrange(maxlen): - last = None - same = True - for nameword in name_words: - if i >= len(nameword): - same = False - break - if not last: - last = nameword[i] - elif nameword[i] != last: - same = False - break - common[i] = same - - newnames = [] - for nameword in name_words: - nn = [] - for (i, val) in enumerate(common): - if not val and i < len(nameword): - nn.append(nameword[i]) - nn.reverse() - newnames.append(new_delim.join(nn)) - return newnames - - -def parse_args(argv, defaults=None, expected_argc=0): - opts = {} - if defaults: - opts.update(defaults) - - args = [] - - i = 0 - while i < len(argv): - if argv[i][0] == '-': - arg = argv[i].lstrip('-') - if '=' in arg: - k, v = arg.split('=', 2) - if k in defaults: - if type(defaults[k]) == float: - opts[k] = float(v) - elif type(defaults[k]) == int: - opts[k] = int(v) - else: - opts[k] = v - else: - opts[arg] = True - else: - args.append(argv[i]) - i += 1 - - while len(args) < expected_argc: - args.append(None) - return opts, args - - -class memoize(object): - 'Simple memoizing decorator to cache results' - - def __init__(self, func): - self.func = func - self.cache = {} - - def __call__(self, *args): - if not isinstance(args, collections.Hashable): - # uncacheable. a list, for instance. - # better to not cache than blow up. - return self.func(*args) - - if args in self.cache: - return self.cache[args] - else: - value = self.func(*args) - self.cache[args] = value - return value diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/ngs_utils.pyc Binary file ngsutils/support/ngs_utils.pyc has changed diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/regions.py --- a/ngsutils/support/regions.py Sun Nov 27 15:01:21 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,166 +0,0 @@ -class RangeMatch(object): - ''' - Simple genomic ranges. You can define chrom:start-end ranges, then ask if a - particular genomic coordinate maps to any of those ranges. This is less- - efficient than an R-Tree, but easier to code. - ''' - - def __init__(self, name): - self.ranges = {} - self.name = name - - def add_range(self, chrom, strand, start, end): - if chrom not in self.ranges: - self.ranges[chrom] = {} - - bin = start / 100000 - if bin not in self.ranges[chrom]: - self.ranges[chrom][bin] = [] - self.ranges[chrom][bin].insert(0, (start, end, strand)) - - if (end / 100000) != bin: - for bin in xrange(bin + 1, (end / 100000) + 1): - if bin not in self.ranges[chrom]: - self.ranges[chrom][bin] = [] - self.ranges[chrom][bin].insert(0, (start, end, strand)) - - def get_tag(self, chrom, strand, pos, ignore_strand=False): - ''' - returns (region, is_reverse_orientation) - ''' - if chrom not in self.ranges: - return None, False - bin = pos / 100000 - if bin not in self.ranges[chrom]: - return None, False - for start, end, r_strand in self.ranges[chrom][bin]: - if pos >= start and pos <= end: - if ignore_strand or strand == r_strand: - return self.name, False - return self.name, True - return None, False - - -class RegionTagger(object): - def __init__(self, gtf, valid_chroms=None, only_first_fragment=True): - self.regions = [] - self.counts = {} - self.only_first_fragment = only_first_fragment - - coding = RangeMatch('coding') - exons = RangeMatch('other-exon') - utr_5 = RangeMatch('utr-5') - utr_3 = RangeMatch('utr-3') - introns = RangeMatch('intron') - promoters = RangeMatch('promoter') - - for gene in gtf.genes: - if valid_chroms and gene.chrom not in valid_chroms: - continue - if gene.strand == '+': - promoters.add_range(gene.chrom, gene.strand, gene.start - 2000, gene.start) - else: - promoters.add_range(gene.chrom, gene.strand, gene.end, gene.end + 2000) - - for transcript in gene.transcripts: - if transcript.has_cds: - for start, end in transcript.cds: - coding.add_range(gene.chrom, gene.strand, start, end) - - # TODO: Fix this so that it iterates over exons in the 5'/3' UTRS - for s, e in transcript.utr_5: - utr_5.add_range(gene.chrom, gene.strand, s, e) - for s, e in transcript.utr_3: - utr_3.add_range(gene.chrom, gene.strand, s, e) - - last_end = None - for start, end in transcript.exons: - if last_end: - introns.add_range(gene.chrom, gene.strand, last_end, start) - exons.add_range(gene.chrom, gene.strand, start, end) - last_end = end - - self.regions.append(coding) - self.regions.append(utr_5) - self.regions.append(utr_3) - self.regions.append(exons) - self.regions.append(introns) - self.regions.append(promoters) - - self.counts['coding'] = 0 - self.counts['coding-rev'] = 0 - self.counts['other-exon'] = 0 - self.counts['utr-5'] = 0 - self.counts['utr-3'] = 0 - self.counts['utr-5-rev'] = 0 - self.counts['utr-3-rev'] = 0 - self.counts['intron'] = 0 - self.counts['promoter'] = 0 - self.counts['other-exon-rev'] = 0 - self.counts['intron-rev'] = 0 - self.counts['promoter-rev'] = 0 - self.counts['junction'] = 0 - self.counts['intergenic'] = 0 - self.counts['mitochondrial'] = 0 - - def add_read(self, read, chrom): - if read.is_unmapped: - return - - if self.only_first_fragment and read.is_paired and not read.is_read1: - return - - tag = None - is_rev = False - - strand = '-' if read.is_reverse else '+' - - if chrom == 'chrM': - tag = 'mitochondrial' - - if not tag: - for op, length in read.cigar: - if op == 3: - tag = 'junction' - break - - if not tag: - for region in self.regions: - tag, is_rev = region.get_tag(chrom, strand, read.pos) - if tag: - break - - if not tag: - tag = 'intergenic' - - if tag: - if is_rev: - self.counts['%s-rev' % tag] += 1 - else: - self.counts[tag] += 1 - - return tag - - def tag_region(self, chrom, start, end, strand): - tag = None - is_rev = False - - if chrom == 'chrM' or chrom == 'M': - tag = 'mitochondrial' - - if not tag: - for region in self.regions: - tag, is_rev = region.get_tag(chrom, strand, start) - if is_rev: - tag = '%s-rev' % tag - - if start != end: - endtag, is_rev = region.get_tag(chrom, strand, end) - if is_rev: - endtag = '%s-rev' % endtag - - if tag and endtag and endtag != tag: - tag = '%s/%s' % (tag, endtag) - - if not tag: - tag = 'intergenic' diff -r 7a68005de299 -r 9b9ae5963d3c ngsutils/support/stats.py --- a/ngsutils/support/stats.py Sun Nov 27 15:01:21 2016 -0500 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,166 +0,0 @@ -''' -various statistical tests and methods... -''' -import math - -from ngsutils.support import memoize - - -def median(vals): - ''' - >>> median([1,2,3]) - 2 - >>> median([1,2,3,4]) - 2.5 - ''' - vals.sort() - - if len(vals) % 2 == 1: - return vals[len(vals) / 2] - else: - a = vals[(len(vals) / 2) - 1] - b = vals[(len(vals) / 2)] - return float(a + b) / 2 - - -def mean_stdev(l): - ''' - >>> mean_stdev([1,2,2,2]) - (1.75, 0.5) - >>> mean_stdev([2,2,2,2]) - (2.0, 0.0) - ''' - - acc = 0 - for el in l: - acc += el - - mean = float(acc) / len(l) - acc = 0 - for el in l: - acc += (el - mean) ** 2 - - if len(l) > 2: - stdev = math.sqrt(float(acc) / (len(l) - 1)) - else: - stdev = 0.0 - - return (mean, stdev) - - -def counts_median(d): - ''' - Calculate the median from counts stored in a dictionary - >>> counts_median({ 1: 4, 2: 1, 3: 4 }) - 2 - >>> counts_median({ 1: 4, 3: 4 }) - 2 - - ''' - count = 0 - for k in d: - count += d[k] - - if count == 0: - return 0 - - acc = 0.0 - last = 0 - for k in sorted(d): - if last: - return (last + k) / 2 - acc += d[k] - if acc / count == 0.5: - last = k - elif acc / count > 0.5: - return k - - -def counts_mean_stdev(d): - ''' - - calc mean / stdev when data is stored as counts in a dictionary - - Ex: - { 1: 4, 2: 1, 3: 4 } = [1, 1, 1, 1, 2, 3, 3, 3, 3] - - >>> counts_mean_stdev({ 1: 4, 2: 1, 3: 4 }) - (2.0, 1.0) - - ''' - - acc = 0 - count = 0 - for k in d: - acc += k * d[k] - count += d[k] - - mean = float(acc) / count - - acc = 0 - for k in d: - acc += (((k - mean) ** 2) * d[k]) - - if count > 2: - stdev = math.sqrt(float(acc) / (count - 1)) - else: - stdev = 0.0 - - return (mean, stdev) - - -@memoize -def poisson_prob(x, mean): - ''' - Return the probability that you could get x counts in - a Poisson test with a mean value. - - prob(x) = sum(i=1..x){poisson(i)} - - >>> poisson_prob(6,10) - 0.1300960209527205 - >>> poisson_prob(8,10) - 0.33277427882095645 - ''' - acc = 0.0 - for i in xrange(1, x + 1): - acc += poisson_func(i, mean) - return acc - - -@memoize -def poisson_func(mu, lambd): - ''' - This is the Poisson distribution function - - p(mu) = (lambda^mu * e^(-lambda)) / (mu!) - - mu is a count - lambd is the mean - - >>> poisson_func(1,10) - 0.00045399929762484856 - >>> poisson_func(2,10) - 0.0022699964881242427 - >>> poisson_func(3,10) - 0.007566654960414142 - ''' - return (lambd ** mu) * (math.exp(-1 * lambd)) / _factorial(mu) - - -@memoize -def _factorial(x): - ''' - >>> _factorial(1) - 1 - >>> _factorial(2) - 2 - >>> _factorial(3) - 6 - ''' - return math.factorial(x) - - -if __name__ == '__main__': - import doctest - doctest.testmod()