Mercurial > repos > peterjc > count_roi_variants
changeset 2:167765a633c0 draft default tip
"Update all the pico_galaxy tools on main Tool Shed"
author | peterjc |
---|---|
date | Fri, 16 Apr 2021 22:34:00 +0000 |
parents | 3ee6f4d0ac80 |
children | |
files | tools/count_roi_variants/README.rst tools/count_roi_variants/count_roi_variants.py tools/count_roi_variants/count_roi_variants.xml tools/count_roi_variants/tool_dependencies.xml |
diffstat | 4 files changed, 152 insertions(+), 48 deletions(-) [+] |
line wrap: on
line diff
--- a/tools/count_roi_variants/README.rst Tue May 16 09:14:18 2017 -0400 +++ b/tools/count_roi_variants/README.rst Fri Apr 16 22:34:00 2021 +0000 @@ -83,6 +83,7 @@ v0.0.5 - Fix samtools dependency version inconsistency, using v1.2 now. - Use ``<command detect_errors="aggressive">`` (internal change only). - Single quote command line arguments (internal change only). +v0.0.6 - Python 3 compatibility fix. ======= ======================================================================
--- a/tools/count_roi_variants/count_roi_variants.py Tue May 16 09:14:18 2017 -0400 +++ b/tools/count_roi_variants/count_roi_variants.py Fri Apr 16 22:34:00 2021 +0000 @@ -20,7 +20,7 @@ if "-v" in sys.argv or "--version" in sys.argv: # Galaxy seems to invert the order of the two lines - print("BAM coverage statistics v0.0.4 (using samtools)") + print("BAM coverage statistics v0.0.6 (using samtools)") cmd = "samtools 2>&1 | grep -i ^Version" sys.exit(os.system(cmd)) @@ -37,7 +37,8 @@ AGCCCATGAGATGGGAAGCAATGGGCTACA 14 87.50 AGCCCATGAGATGGGAAGCAATGGGCTACG 1 6.25 AGCGCATGAGATGGGAAGCAATGGGCTACG 1 6.25 -""" +""" # noqa: E501 + if len(sys.argv) == 5: bam_filename, bai_filename, tabular_filename, region = sys.argv[1:] else: @@ -84,7 +85,7 @@ def decode_cigar(cigar): - """Returns a list of 2-tuples, integer count and operator char.""" + """Return a list of 2-tuples, integer count and operator char.""" count = "" answer = [] for letter in cigar: @@ -98,30 +99,38 @@ return answer -assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [(14, 'S'), (15, 'M'), (1, 'P'), - (1, 'D'), (3, 'P'), (54, 'M'), - (1, 'D'), (34, 'M'), (5, 'S')] +assert decode_cigar("14S15M1P1D3P54M1D34M5S") == [ + (14, "S"), + (15, "M"), + (1, "P"), + (1, "D"), + (3, "P"), + (54, "M"), + (1, "D"), + (34, "M"), + (5, "S"), +] def align_len(cigar_ops): - """Sums the CIGAR M/=/X/D/N operators.""" + """Sum the CIGAR M/=/X/D/N operators.""" return sum(count for count, op in cigar_ops if op in "M=XDN") def expand_cigar(seq, cigar_ops): - """Yields (ref_offset, seq_base) pairs.""" + """Yield (ref_offset, seq_base) pairs.""" ref_offset = 0 seq_offset = 0 for count, op in cigar_ops: if op in "MX=": - for (i, base) in enumerate(seq[seq_offset:seq_offset + count]): + for (i, base) in enumerate(seq[seq_offset : seq_offset + count]): yield ref_offset + i, base ref_offset += count seq_offset += count elif op == "I": # Give them all an in-between reference position # (Python lets us mix integers and floats, wouldn't work in C) - for (i, base) in enumerate(seq[seq_offset:seq_offset + count]): + for (i, base) in enumerate(seq[seq_offset : seq_offset + count]): yield ref_offset - 0.5, base # Does not change ref_offset seq_offset += count @@ -142,31 +151,105 @@ raise NotImplementedError("Unexpected CIGAR operator %s" % op) -assert list(expand_cigar("ACGT", decode_cigar("4M"))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")] -assert list(expand_cigar("ACGT", decode_cigar("2=1X1="))) == [(0, "A"), (1, "C"), (2, "G"), (3, "T")] -assert list(expand_cigar("ACGT", decode_cigar("2M1D2M"))) == [(0, "A"), (1, "C"), (3, "G"), (4, "T")] -assert list(expand_cigar("ACtGT", decode_cigar("2M1I2M"))) == [(0, "A"), (1, "C"), (1.5, "t"), (2, "G"), (3, "T")] -assert list(expand_cigar("tACGT", decode_cigar("1I4M"))) == [(-0.5, 't'), (0, 'A'), (1, 'C'), (2, 'G'), (3, 'T')] -assert list(expand_cigar("ACGTt", decode_cigar("4M1I"))) == [(0, 'A'), (1, 'C'), (2, 'G'), (3, 'T'), (3.5, 't')] -assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), - (3, 'A'), (4, 'G'), (5, 'G'), - (6, 'G'), (7, 'G'), (8, 'T'), - (9, 'T'), (10, 'T'), (11, 'T')] -assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), - (3, 'A'), (3.5, 'c'), (4, 'G'), - (5, 'G'), (6, 'G'), (7, 'G'), - (8, 'T'), (9, 'T'), (10, 'T'), - (11, 'T')] -assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), - (3, 'A'), (4, 'G'), (5, 'G'), - (6, 'G'), (7, 'G'), (7.5, "c"), - (8, 'T'), (9, 'T'), (10, 'T'), - (11, 'T')] -assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [(0, 'A'), (1, 'A'), (2, 'A'), - (3, 'A'), (3.5, 'c'), (4, 'G'), - (5, 'G'), (6, 'G'), (7, 'G'), - (7.5, 'c'), (8, 'T'), (9, 'T'), - (10, 'T'), (11, 'T')] +assert list(expand_cigar("ACGT", decode_cigar("4M"))) == [ + (0, "A"), + (1, "C"), + (2, "G"), + (3, "T"), +] +assert list(expand_cigar("ACGT", decode_cigar("2=1X1="))) == [ + (0, "A"), + (1, "C"), + (2, "G"), + (3, "T"), +] +assert list(expand_cigar("ACGT", decode_cigar("2M1D2M"))) == [ + (0, "A"), + (1, "C"), + (3, "G"), + (4, "T"), +] +assert list(expand_cigar("ACtGT", decode_cigar("2M1I2M"))) == [ + (0, "A"), + (1, "C"), + (1.5, "t"), + (2, "G"), + (3, "T"), +] +assert list(expand_cigar("tACGT", decode_cigar("1I4M"))) == [ + (-0.5, "t"), + (0, "A"), + (1, "C"), + (2, "G"), + (3, "T"), +] +assert list(expand_cigar("ACGTt", decode_cigar("4M1I"))) == [ + (0, "A"), + (1, "C"), + (2, "G"), + (3, "T"), + (3.5, "t"), +] +assert list(expand_cigar("AAAAGGGGTTTT", decode_cigar("12M"))) == [ + (0, "A"), + (1, "A"), + (2, "A"), + (3, "A"), + (4, "G"), + (5, "G"), + (6, "G"), + (7, "G"), + (8, "T"), + (9, "T"), + (10, "T"), + (11, "T"), +] +assert list(expand_cigar("AAAAcGGGGTTTT", decode_cigar("4M1I8M"))) == [ + (0, "A"), + (1, "A"), + (2, "A"), + (3, "A"), + (3.5, "c"), + (4, "G"), + (5, "G"), + (6, "G"), + (7, "G"), + (8, "T"), + (9, "T"), + (10, "T"), + (11, "T"), +] +assert list(expand_cigar("AAAAGGGGcTTTT", decode_cigar("8M1I4M"))) == [ + (0, "A"), + (1, "A"), + (2, "A"), + (3, "A"), + (4, "G"), + (5, "G"), + (6, "G"), + (7, "G"), + (7.5, "c"), + (8, "T"), + (9, "T"), + (10, "T"), + (11, "T"), +] +assert list(expand_cigar("AAAAcGGGGcTTTT", decode_cigar("4M1I4M1I4M"))) == [ + (0, "A"), + (1, "A"), + (2, "A"), + (3, "A"), + (3.5, "c"), + (4, "G"), + (5, "G"), + (6, "G"), + (7, "G"), + (7.5, "c"), + (8, "T"), + (9, "T"), + (10, "T"), + (11, "T"), +] def get_roi(seq, cigar_ops, start, end): @@ -184,7 +267,9 @@ return seq[start:end] # Would use "start <= i < end" if they were all integers, but # want to exclude e.g. 3.5 and 7.5 when given start 4 and end 8. - return "".join(base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1) + return "".join( + base for i, base in expand_cigar(seq, cigar_ops) if start <= i <= end - 1 + ) assert "GGGG" == get_roi("AAAAGGGGTTTT", decode_cigar("12M"), 4, 8) @@ -203,15 +288,31 @@ # Could recreate the region string (with no commas in start/end)? # region = "%s:%i-%i" % (ref, start, end) - tally = dict() + tally = {} # Call samtools view, don't need header so no -h added. # Only want mapped reads, thus flag filter -F 4. - child = subprocess.Popen(["samtools", "view", "-F", "4", bam_file, region], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) + child = subprocess.Popen( + ["samtools", "view", "-F", "4", bam_file, region], + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) for line in child.stdout: assert line[0] != "@", "Got unexpected SAM header line: %s" % line - qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, rest = line.split("\t", 10) + ( + qname, + flag, + rname, + pos, + mapq, + cigar, + rnext, + pnext, + tlen, + seq, + rest, + ) = line.split("\t", 10) pos = int(pos) # one-based if start < pos: # Does not span the ROI @@ -235,9 +336,11 @@ if return_code: sys.exit("Got return code %i from samtools view" % return_code) elif "specifies an unknown reference name. Continue anyway." in stderr: - sys.exit(stderr.strip() + - "\n\nERROR: samtools did not recognise the region requested, " - "can't count any variants.") + sys.exit( + stderr.strip() + + "\n\nERROR: samtools did not recognise the region requested, " + "can't count any variants." + ) return tally
--- a/tools/count_roi_variants/count_roi_variants.xml Tue May 16 09:14:18 2017 -0400 +++ b/tools/count_roi_variants/count_roi_variants.xml Fri Apr 16 22:34:00 2021 +0000 @@ -1,4 +1,4 @@ -<tool id="count_roi_variants" name="Count sequence variants in region of interest" version="0.0.5"> +<tool id="count_roi_variants" name="Count sequence variants in region of interest" version="0.0.6"> <description>using samtools view</description> <requirements> <requirement type="package" version="1.2">samtools</requirement> @@ -96,7 +96,7 @@ Heng Li et al (2009). The Sequence Alignment/Map format and SAMtools. Bioinformatics 25(16), 2078-9. -http://dx.doi.org/10.1093/bioinformatics/btp352 +https://doi.org/10.1093/bioinformatics/btp352 Peter J.A. Cock (2016), Count sequence variants in region of interest in BAM file. http://toolshed.g2.bx.psu.edu/view/peterjc/count_roi_variants
--- a/tools/count_roi_variants/tool_dependencies.xml Tue May 16 09:14:18 2017 -0400 +++ b/tools/count_roi_variants/tool_dependencies.xml Fri Apr 16 22:34:00 2021 +0000 @@ -1,6 +1,6 @@ -<?xml version="1.0"?> +<?xml version="1.0" ?> <tool_dependency> <package name="samtools" version="1.2"> - <repository changeset_revision="f6ae3ba3f3c1" name="package_samtools_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" /> + <repository name="package_samtools_1_2" owner="iuc" toolshed="https://toolshed.g2.bx.psu.edu" changeset_revision="f6ae3ba3f3c1"/> </package> -</tool_dependency> +</tool_dependency> \ No newline at end of file