Mercurial > repos > nick > allele_counts
changeset 9:6cc488e11544 draft
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
author | nick |
---|---|
date | Tue, 31 Mar 2020 05:09:12 -0400 |
parents | 411adeff1eec |
children | 7f19e8c03358 |
files | 0todo.txt README.md allele-counts.py allele-counts.xml tests/artificial-nofilt.csv.out tests/artificial.csv.out tests/real-nofilt.csv.out tests/real.csv.out tests/run-tests.py |
diffstat | 9 files changed, 83 insertions(+), 57 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/0todo.txt Tue Mar 31 05:09:12 2020 -0400 @@ -0,0 +1,2 @@ +test handling of -c 0 (and -f 0?) +should it technically handle data lines that start with a '#'? \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Tue Mar 31 05:09:12 2020 -0400 @@ -0,0 +1,4 @@ +variant-annotator +================= + +A Galaxy tool for parsing variant counts from a VCF and computing statistics
--- a/allele-counts.py Tue Aug 23 02:30:56 2016 -0400 +++ b/allele-counts.py Tue Mar 31 05:09:12 2020 -0400 @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 """ Run with -h option or see DESCRIPTION for description. This script's functionality is being obsoleted by the new, and much more sanely @@ -11,7 +11,6 @@ Naive Variant Caller variant count parsing one-liner: $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' """ -from __future__ import division import os import sys import errno @@ -49,6 +48,7 @@ threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0.""" + def get_options(defaults, usage, description='', epilog=''): """Get options, print usage text.""" @@ -124,7 +124,6 @@ if len(coords) > 2: print_sample = coords[2] # set infile_handle to either stdin or the input file - global infile_handle if infile == OPT_DEFAULTS.get('infile'): infile_handle = sys.stdin sys.stderr.write("Reading from standard input..\n") @@ -135,7 +134,6 @@ fail('Error: Input VCF file '+infile+' not found.') # set outfile_handle to either stdout or the output file - global outfile_handle if outfile == OPT_DEFAULTS.get('outfile'): outfile_handle = sys.stdout else: @@ -186,23 +184,18 @@ sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n") sys.exit(1) - site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS, freq_thres, covg_thres, stranded, debug=debug) if debug and site_summary[0]['print']: - print line.split('\t')[9].split(':')[-1] + print(line.split('\t')[9].split(':')[-1]) try: print_site(outfile_handle, site_summary, COLUMNS) except IOError as ioe: if ioe.errno == errno.EPIPE: - cleanup() sys.exit(0) - # close any open filehandles - cleanup() - # keeps Galaxy from giving an error if there were messages on stderr sys.exit(0) @@ -341,7 +334,7 @@ sample[strand+base_count[0]] = base_count[1] # fill in any zeros for base in canonical: - if not sample.has_key(strand+base): + if strand+base not in sample: sample[strand+base] = 0 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug) @@ -354,7 +347,7 @@ ranked_bases[1] = ranked_bases[2] ranked_bases[2] = tmp_base - if debug: print "ranked +-: "+str(ranked_bases) + if debug: print("ranked +-: "+str(ranked_bases)) sample['coverage'] = coverage try: @@ -399,7 +392,7 @@ if strand in strands: summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0) - return summed_counts.items() + return list(summed_counts.items()) def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False): @@ -426,10 +419,10 @@ variant_counts.sort(reverse=True, key=lambda variant: variant[1]) if debug: - print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres) + print('coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)) for variant in variant_counts: - print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ - str(variant[1]/coverage)) + print((variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ + str(variant[1]/coverage))) # remove bases below the frequency threshold if freq_thres > 0: @@ -455,8 +448,8 @@ sort=False, debug=debug) if debug: - print '+ '+str(alleles_plus) - print '- '+str(alleles_minus) + print('+ '+str(alleles_plus)) + print('- '+str(alleles_minus)) # Check if each strand reports the same set of alleles. # Sorting by base is to compare lists without regard to order (as sets). @@ -495,17 +488,9 @@ def fail(message): - cleanup() sys.stderr.write(message+'\n') sys.exit(1) -def cleanup(): - if isinstance(infile_handle, file): - infile_handle.close() - if isinstance(outfile_handle, file): - outfile_handle.close() - - if __name__ == "__main__": main() \ No newline at end of file
--- a/allele-counts.xml Tue Aug 23 02:30:56 2016 -0400 +++ b/allele-counts.xml Tue Mar 31 05:09:12 2020 -0400 @@ -1,6 +1,10 @@ -<tool id="allele_counts_1" version="1.2" name="Variant Annotator"> +<tool id="allele_counts_1" version="1.3" name="Variant Annotator"> <description> process variant counts</description> - <command interpreter="python">allele-counts.py -i $input -o $output -f $freq -c $covg $header $stranded $nofilt + <stdio> + <exit_code range="1:" level="fatal" /> + <exit_code range=":-1" level="fatal" /> + </stdio> + <command>allele-counts.py -i $input -o $output -f $freq -c $covg $header $stranded $nofilt #if $seed: -r $seed #end if @@ -15,12 +19,8 @@ <param name="seed" type="text" value="" label="PRNG seed" /> </inputs> <outputs> - <data name="output" format="tabular"/> + <data name="output" format="tabular" /> </outputs> - <stdio> - <exit_code range="1:" err_level="fatal"/> - <exit_code range=":-1" err_level="fatal"/> - </stdio> <tests> <test> @@ -114,4 +114,40 @@ </help> + <citations> + <citation type="bibtex"> + @article{Blankenberg2014, + author = {Blankenberg, Daniel and {Von Kuster}, Gregory and Bouvier, Emil and Baker, Dannon and Afgan, Enis and Stoler, Nicholas and Taylor, James and Nekrutenko, Anton}, + doi = {10.1186/gb4161}, + issn = {1465-6906}, + journal = {Genome Biology}, + keywords = {galaxy}, + number = {2}, + pages = {403}, + title = {{Dissemination of scientific software with Galaxy ToolShed}}, + url = {http://genomebiology.biomedcentral.com/articles/10.1186/gb4161}, + volume = {15}, + year = {2014} + } + </citation> + <citation type="bibtex"> + @article{Dickins2014, + archivePrefix = {arXiv}, + arxivId = {15334406}, + author = {Dickins, Benjamin and Rebolledo-Jaramillo, Boris and Su, Marcia Shu Wei and Paul, Ian M and Blankenberg, Daniel and Stoler, Nicholas and Makova, Kateryna D and Nekrutenko, Anton}, + doi = {10.2144/000114146}, + eprint = {15334406}, + isbn = {5049880467}, + issn = {19409818}, + journal = {BioTechniques}, + number = {3}, + pages = {134--141}, + pmid = {24641477}, + title = {{Controlling for contamination in re-sequencing studies with a reproducible web-based phylogenetic approach}}, + volume = {56}, + year = {2014} + } + </citation> + </citations> + </tool>
--- a/tests/artificial-nofilt.csv.out Tue Aug 23 02:30:56 2016 -0400 +++ b/tests/artificial-nofilt.csv.out Tue Mar 31 05:09:12 2020 -0400 @@ -16,11 +16,11 @@ THYROID chr1 150 0 0 4 0 4 1 G . 0.0 . THYROID chr1 160 0 0 3 0 3 0 G . 0.0 . THYROID chr1 260 106 0 14 0 120 2 A G 0.11667 2.4 -THYROID chr1 300 2 0 2 76 80 3 T G 0.025 0.0 -THYROID chr1 310 12 0 12 76 100 3 T G 0.12 0.0 -THYROID chr1 320 12 0 12 56 80 3 T A 0.15 0.64394 +THYROID chr1 300 2 0 2 76 80 3 T A 0.025 0.0 +THYROID chr1 310 12 0 12 76 100 3 T A 0.12 0.0 +THYROID chr1 320 12 0 12 56 80 3 T G 0.15 0.64394 THYROID chr1 330 7 0 7 66 80 3 T G 0.0875 1.06247 -THYROID chr1 340 1 0 1 98 100 0 T G 0.01 5.21053 +THYROID chr1 340 1 0 1 98 100 0 T A 0.01 1.22222 THYROID chr1 350 11 0 11 78 100 0 T A 0.11 1.25352 THYROID chr1 400 32 0 8 0 40 2 A G 0.2 0.0 THYROID chr1 410 1 0 2 97 100 0 T G 0.02 5.5
--- a/tests/artificial.csv.out Tue Aug 23 02:30:56 2016 -0400 +++ b/tests/artificial.csv.out Tue Mar 31 05:09:12 2020 -0400 @@ -21,11 +21,11 @@ THYROID chr1 240 180 0 20 0 200 2 A G 0.1 0.0 THYROID chr1 250 178 0 22 0 200 2 A G 0.11 0.0 THYROID chr1 260 106 0 14 0 120 0 A G 0.11667 2.4 -THYROID chr1 300 2 0 2 76 80 1 T G 0.025 0.0 -THYROID chr1 310 12 0 12 76 100 3 T G 0.12 0.0 -THYROID chr1 320 12 0 12 56 80 3 T A 0.15 0.64394 +THYROID chr1 300 2 0 2 76 80 1 T A 0.025 0.0 +THYROID chr1 310 12 0 12 76 100 3 T A 0.12 0.0 +THYROID chr1 320 12 0 12 56 80 3 T G 0.15 0.64394 THYROID chr1 330 7 0 7 66 80 0 T G 0.0875 1.06247 -THYROID chr1 340 1 0 1 98 100 1 T G 0.01 5.21053 +THYROID chr1 340 1 0 1 98 100 1 T A 0.01 1.22222 THYROID chr1 350 11 0 11 78 100 0 T A 0.11 1.25352 THYROID chr1 400 32 0 8 0 40 2 A G 0.2 0.0 THYROID chr1 410 1 0 2 97 100 0 T G 0.02 5.5
--- a/tests/real-nofilt.csv.out Tue Aug 23 02:30:56 2016 -0400 +++ b/tests/real-nofilt.csv.out Tue Mar 31 05:09:12 2020 -0400 @@ -7,8 +7,8 @@ THYROID chr1 246704437 5 130 0 0 135 0 C A 0.03704 2.14286 THYROID chr1 246707878 0 0 131 0 131 1 G . 0.0 . THYROID chr1 246714587 30 0 43 0 73 2 G A 0.41096 1.22996 -THYROID chr1 246729215 1 0 1 88 90 0 T G 0.01111 11.125 -THYROID chr1 246729216 1 0 1 90 92 0 T G 0.01087 9.1 +THYROID chr1 246729215 1 0 1 88 90 0 T A 0.01111 1.08537 +THYROID chr1 246729216 1 0 1 90 92 0 T A 0.01087 1.10976 THYROID chr1 246729378 16 7 0 0 23 0 A C 0.30435 . THYROID chr1 246729392 29 0 10 0 39 0 A G 0.25641 . THYROID chr7 91502881 0 0 0 26 26 1 T . 0.0 .
--- a/tests/real.csv.out Tue Aug 23 02:30:56 2016 -0400 +++ b/tests/real.csv.out Tue Mar 31 05:09:12 2020 -0400 @@ -6,6 +6,6 @@ THYROID chr1 246704437 5 130 0 0 135 0 C A 0.03704 2.14286 THYROID chr1 246707878 0 0 131 0 131 1 G . 0.0 . THYROID chr1 246714587 30 0 43 0 73 2 G A 0.41096 1.22996 -THYROID chr1 246729216 1 0 1 90 92 0 T G 0.01087 9.1 +THYROID chr1 246729216 1 0 1 90 92 0 T A 0.01087 1.10976 THYROID chr7 91502881 0 0 0 26 26 1 T . 0.0 . THYROID chr7 91502897 7 36 0 0 43 0 C A 0.16279 1.79167
--- a/tests/run-tests.py Tue Aug 23 02:30:56 2016 -0400 +++ b/tests/run-tests.py Tue Mar 31 05:09:12 2020 -0400 @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import os import sys import subprocess @@ -58,7 +58,7 @@ test_dir = os.path.relpath(test_dir) if do_print_xml: - print XML.get('tests_start') + print(XML.get('tests_start')) for dataset in DATASETS: infile = os.path.join(test_dir, dataset+IN_EXT) @@ -78,13 +78,13 @@ run_tests(infile, outfile, options, script_dir) if do_print_xml: - print XML.get('tests_end') + print(XML.get('tests_end')) def run_tests(infile, outfile, options, script_dir): script_cmd = os.path.join(script_dir, SCRIPT_NAME)+' '+options+' -i '+infile bash_cmd = 'diff '+outfile+' <('+script_cmd+')' - print script_cmd + print(script_cmd) subprocess.call(['bash', '-c', bash_cmd]) @@ -94,29 +94,28 @@ options = options_str.split() # on whitespace - print xml.get('test_start') - print xml.get('input') % infile + print(xml.get('test_start')) + print(xml.get('input') % infile) # read in options one at a time, print <param> line i = 0 while i < len(options): opt = options[i] - if not params.has_key(opt) or not param_arg.has_key(opt): - sys.stderr.write("Error: unknown option '"+opt+"' in ARGS list in file " - +infile+"\n") + if opt not in params or opt not in param_arg: + sys.stderr.write("Error: unknown option '"+opt+"' in ARGS list in file "+infile+"\n") sys.exit(1) # takes argument if param_arg[opt]: i+=1 arg = options[i] - print xml.get('param') % (params[opt], arg) + print(xml.get('param') % (params[opt], arg)) # no argument (boolean) else: - print xml.get('param') % (params[opt], 'true') + print(xml.get('param') % (params[opt], 'true')) i+=1 - print xml.get('output') % outfile - print xml.get('test_end') + print(xml.get('output') % outfile) + print(xml.get('test_end')) def read_options(infile):