Mercurial > repos > nick > allele_counts

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/0todo.txt	Tue Mar 31 05:09:12 2020 -0400
@@ -0,0 +1,2 @@
+Test the handling of -c 0 (and -f 0?).
+Should it technically handle data lines that start with a '#'?
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.md	Tue Mar 31 05:09:12 2020 -0400
@@ -0,0 +1,4 @@
+variant-annotator
+=================
+
+A Galaxy tool for parsing variant counts from a VCF and computing statistics.
--- a/allele-counts.py	Tue Aug 23 02:30:56 2016 -0400
+++ b/allele-counts.py	Tue Mar 31 05:09:12 2020 -0400
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 """
 Run with -h option or see DESCRIPTION for description.
 This script's functionality is being obsoleted by the new, and much more sanely
@@ -11,7 +11,6 @@
 Naive Variant Caller variant count parsing one-liner:
 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
 """
-from __future__ import division
 import os
 import sys
 import errno
@@ -49,6 +48,7 @@
 threshold (but not necessarily in the same order). If the site fails this test,
 the number of alleles is reported as 0."""

+
 def get_options(defaults, usage, description='', epilog=''):
   """Get options, print usage text."""

@@ -124,7 +124,6 @@
     if len(coords) > 2: print_sample = coords[2]

   # set infile_handle to either stdin or the input file
-  global infile_handle
   if infile == OPT_DEFAULTS.get('infile'):
     infile_handle = sys.stdin
     sys.stderr.write("Reading from standard input..\n")
@@ -135,7 +134,6 @@
       fail('Error: Input VCF file '+infile+' not found.')

   # set outfile_handle to either stdout or the output file
-  global outfile_handle
   if outfile == OPT_DEFAULTS.get('outfile'):
     outfile_handle = sys.stdout
   else:
@@ -186,23 +184,18 @@
           sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n")
           sys.exit(1)

-
     site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
       freq_thres, covg_thres, stranded, debug=debug)

     if debug and site_summary[0]['print']:
-        print line.split('\t')[9].split(':')[-1]
+      print(line.split('\t')[9].split(':')[-1])

     try:
       print_site(outfile_handle, site_summary, COLUMNS)
     except IOError as ioe:
       if ioe.errno == errno.EPIPE:
-        cleanup()
         sys.exit(0)

-  # close any open filehandles
-  cleanup()
-
   # keeps Galaxy from giving an error if there were messages on stderr
   sys.exit(0)

@@ -341,7 +334,7 @@
         sample[strand+base_count[0]] = base_count[1]
       # fill in any zeros
       for base in canonical:
-        if not sample.has_key(strand+base):
+        if strand+base not in sample:
           sample[strand+base] = 0

     sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
@@ -354,7 +347,7 @@
         ranked_bases[1] = ranked_bases[2]
         ranked_bases[2] = tmp_base

-    if debug: print "ranked +-: "+str(ranked_bases)
+    if debug: print("ranked +-: "+str(ranked_bases))

     sample['coverage'] = coverage
     try:
@@ -399,7 +392,7 @@
     if strand in strands:
       summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0)

-  return summed_counts.items()
+  return list(summed_counts.items())


 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False):
@@ -426,10 +419,10 @@
     variant_counts.sort(reverse=True, key=lambda variant: variant[1])

   if debug:
-    print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
+    print('coverage: '+str(coverage)+', freq_thres: '+str(freq_thres))
     for variant in variant_counts:
-      print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
-        str(variant[1]/coverage))
+      print((variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
+        str(variant[1]/coverage)))

   # remove bases below the frequency threshold
   if freq_thres > 0:
@@ -455,8 +448,8 @@
     sort=False, debug=debug)

   if debug:
-    print '+ '+str(alleles_plus)
-    print '- '+str(alleles_minus)
+    print('+ '+str(alleles_plus))
+    print('- '+str(alleles_minus))

   # Check if each strand reports the same set of alleles.
   # Sorting by base is to compare lists without regard to order (as sets).
@@ -495,17 +488,9 @@


 def fail(message):
-  cleanup()
   sys.stderr.write(message+'\n')
   sys.exit(1)


-def cleanup():
-  if isinstance(infile_handle, file):
-    infile_handle.close()
-  if isinstance(outfile_handle, file):
-    outfile_handle.close()
-
-
 if __name__ == "__main__":
   main()
\ No newline at end of file
--- a/allele-counts.xml	Tue Aug 23 02:30:56 2016 -0400
+++ b/allele-counts.xml	Tue Mar 31 05:09:12 2020 -0400
@@ -1,6 +1,10 @@
-<tool id="allele_counts_1" version="1.2" name="Variant Annotator">
+<tool id="allele_counts_1" version="1.3" name="Variant Annotator">
   <description> process variant counts</description>
-  <command interpreter="python">allele-counts.py -i $input -o $output -f $freq -c $covg $header $stranded $nofilt
+  <stdio>
+    <exit_code range="1:" level="fatal" />
+    <exit_code range=":-1" level="fatal" />
+  </stdio>
+  <command>allele-counts.py -i $input -o $output -f $freq -c $covg $header $stranded $nofilt
   #if $seed:
     -r $seed
   #end if
@@ -15,12 +19,8 @@
     <param name="seed" type="text" value="" label="PRNG seed" />
   </inputs>
   <outputs>
-    <data name="output" format="tabular"/>
+    <data name="output" format="tabular" />
   </outputs>
-  <stdio>
-    <exit_code range="1:" err_level="fatal"/>
-    <exit_code range=":-1" err_level="fatal"/>
-  </stdio>

   <tests>
     <test>
@@ -114,4 +114,40 @@

   </help>

+  <citations>
+    <citation type="bibtex">
+      @article{Blankenberg2014,
+        author = {Blankenberg, Daniel and {Von Kuster}, Gregory and Bouvier, Emil and Baker, Dannon and Afgan, Enis and Stoler, Nicholas and Taylor, James and Nekrutenko, Anton},
+        doi = {10.1186/gb4161},
+        issn = {1465-6906},
+        journal = {Genome Biology},
+        keywords = {galaxy},
+        number = {2},
+        pages = {403},
+        title = {{Dissemination of scientific software with Galaxy ToolShed}},
+        url = {http://genomebiology.biomedcentral.com/articles/10.1186/gb4161},
+        volume = {15},
+        year = {2014}
+      }
+    </citation>
+    <citation type="bibtex">
+      @article{Dickins2014,
+        archivePrefix = {arXiv},
+        arxivId = {15334406},
+        author = {Dickins, Benjamin and Rebolledo-Jaramillo, Boris and Su, Marcia Shu Wei and Paul, Ian M and Blankenberg, Daniel and Stoler, Nicholas and Makova, Kateryna D and Nekrutenko, Anton},
+        doi = {10.2144/000114146},
+        eprint = {15334406},
+        isbn = {5049880467},
+        issn = {19409818},
+        journal = {BioTechniques},
+        number = {3},
+        pages = {134--141},
+        pmid = {24641477},
+        title = {{Controlling for contamination in re-sequencing studies with a reproducible web-based phylogenetic approach}},
+        volume = {56},
+        year = {2014}
+      }
+    </citation>
+  </citations>
+
 </tool>
--- a/tests/artificial-nofilt.csv.out	Tue Aug 23 02:30:56 2016 -0400
+++ b/tests/artificial-nofilt.csv.out	Tue Mar 31 05:09:12 2020 -0400
@@ -16,11 +16,11 @@
 THYROID	chr1	150	0	0	4	0	4	1	G	.	0.0	.
 THYROID	chr1	160	0	0	3	0	3	0	G	.	0.0	.
 THYROID	chr1	260	106	0	14	0	120	2	A	G	0.11667	2.4
-THYROID	chr1	300	2	0	2	76	80	3	T	G	0.025	0.0
-THYROID	chr1	310	12	0	12	76	100	3	T	G	0.12	0.0
-THYROID	chr1	320	12	0	12	56	80	3	T	A	0.15	0.64394
+THYROID	chr1	300	2	0	2	76	80	3	T	A	0.025	0.0
+THYROID	chr1	310	12	0	12	76	100	3	T	A	0.12	0.0
+THYROID	chr1	320	12	0	12	56	80	3	T	G	0.15	0.64394
 THYROID	chr1	330	7	0	7	66	80	3	T	G	0.0875	1.06247
-THYROID	chr1	340	1	0	1	98	100	0	T	G	0.01	5.21053
+THYROID	chr1	340	1	0	1	98	100	0	T	A	0.01	1.22222
 THYROID	chr1	350	11	0	11	78	100	0	T	A	0.11	1.25352
 THYROID	chr1	400	32	0	8	0	40	2	A	G	0.2	0.0
 THYROID	chr1	410	1	0	2	97	100	0	T	G	0.02	5.5
--- a/tests/artificial.csv.out	Tue Aug 23 02:30:56 2016 -0400
+++ b/tests/artificial.csv.out	Tue Mar 31 05:09:12 2020 -0400
@@ -21,11 +21,11 @@
 THYROID	chr1	240	180	0	20	0	200	2	A	G	0.1	0.0
 THYROID	chr1	250	178	0	22	0	200	2	A	G	0.11	0.0
 THYROID	chr1	260	106	0	14	0	120	0	A	G	0.11667	2.4
-THYROID	chr1	300	2	0	2	76	80	1	T	G	0.025	0.0
-THYROID	chr1	310	12	0	12	76	100	3	T	G	0.12	0.0
-THYROID	chr1	320	12	0	12	56	80	3	T	A	0.15	0.64394
+THYROID	chr1	300	2	0	2	76	80	1	T	A	0.025	0.0
+THYROID	chr1	310	12	0	12	76	100	3	T	A	0.12	0.0
+THYROID	chr1	320	12	0	12	56	80	3	T	G	0.15	0.64394
 THYROID	chr1	330	7	0	7	66	80	0	T	G	0.0875	1.06247
-THYROID	chr1	340	1	0	1	98	100	1	T	G	0.01	5.21053
+THYROID	chr1	340	1	0	1	98	100	1	T	A	0.01	1.22222
 THYROID	chr1	350	11	0	11	78	100	0	T	A	0.11	1.25352
 THYROID	chr1	400	32	0	8	0	40	2	A	G	0.2	0.0
 THYROID	chr1	410	1	0	2	97	100	0	T	G	0.02	5.5
--- a/tests/real-nofilt.csv.out	Tue Aug 23 02:30:56 2016 -0400
+++ b/tests/real-nofilt.csv.out	Tue Mar 31 05:09:12 2020 -0400
@@ -7,8 +7,8 @@
 THYROID	chr1	246704437	5	130	0	0	135	0	C	A	0.03704	2.14286
 THYROID	chr1	246707878	0	0	131	0	131	1	G	.	0.0	.
 THYROID	chr1	246714587	30	0	43	0	73	2	G	A	0.41096	1.22996
-THYROID	chr1	246729215	1	0	1	88	90	0	T	G	0.01111	11.125
-THYROID	chr1	246729216	1	0	1	90	92	0	T	G	0.01087	9.1
+THYROID	chr1	246729215	1	0	1	88	90	0	T	A	0.01111	1.08537
+THYROID	chr1	246729216	1	0	1	90	92	0	T	A	0.01087	1.10976
 THYROID	chr1	246729378	16	7	0	0	23	0	A	C	0.30435	.
 THYROID	chr1	246729392	29	0	10	0	39	0	A	G	0.25641	.
 THYROID	chr7	91502881	0	0	0	26	26	1	T	.	0.0	.
--- a/tests/real.csv.out	Tue Aug 23 02:30:56 2016 -0400
+++ b/tests/real.csv.out	Tue Mar 31 05:09:12 2020 -0400
@@ -6,6 +6,6 @@
 THYROID	chr1	246704437	5	130	0	0	135	0	C	A	0.03704	2.14286
 THYROID	chr1	246707878	0	0	131	0	131	1	G	.	0.0	.
 THYROID	chr1	246714587	30	0	43	0	73	2	G	A	0.41096	1.22996
-THYROID	chr1	246729216	1	0	1	90	92	0	T	G	0.01087	9.1
+THYROID	chr1	246729216	1	0	1	90	92	0	T	A	0.01087	1.10976
 THYROID	chr7	91502881	0	0	0	26	26	1	T	.	0.0	.
 THYROID	chr7	91502897	7	36	0	0	43	0	C	A	0.16279	1.79167
--- a/tests/run-tests.py	Tue Aug 23 02:30:56 2016 -0400
+++ b/tests/run-tests.py	Tue Mar 31 05:09:12 2020 -0400
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 import os
 import sys
 import subprocess
@@ -58,7 +58,7 @@
   test_dir = os.path.relpath(test_dir)

   if do_print_xml:
-    print XML.get('tests_start')
+    print(XML.get('tests_start'))

   for dataset in DATASETS:
     infile  = os.path.join(test_dir, dataset+IN_EXT)
@@ -78,13 +78,13 @@
       run_tests(infile, outfile, options, script_dir)

   if do_print_xml:
-    print XML.get('tests_end')
+    print(XML.get('tests_end'))


 def run_tests(infile, outfile, options, script_dir):
   script_cmd = os.path.join(script_dir, SCRIPT_NAME)+' '+options+' -i '+infile
   bash_cmd = 'diff '+outfile+' <('+script_cmd+')'
-  print script_cmd
+  print(script_cmd)
   subprocess.call(['bash', '-c', bash_cmd])


@@ -94,29 +94,28 @@

   options = options_str.split()  # on whitespace

-  print xml.get('test_start')
-  print xml.get('input') % infile
+  print(xml.get('test_start'))
+  print(xml.get('input') % infile)

   # read in options one at a time, print <param> line
   i = 0
   while i < len(options):
     opt = options[i]
-    if not params.has_key(opt) or not param_arg.has_key(opt):
-      sys.stderr.write("Error: unknown option '"+opt+"' in ARGS list in file "
-        +infile+"\n")
+    if opt not in params or opt not in param_arg:
+      sys.stderr.write("Error: unknown option '"+opt+"' in ARGS list in file "+infile+"\n")
       sys.exit(1)
     # takes argument
     if param_arg[opt]:
       i+=1
       arg = options[i]
-      print xml.get('param') % (params[opt], arg)
+      print(xml.get('param') % (params[opt], arg))
     # no argument (boolean)
     else:
-      print xml.get('param') % (params[opt], 'true')
+      print(xml.get('param') % (params[opt], 'true'))
     i+=1

-  print xml.get('output') % outfile
-  print xml.get('test_end')
+  print(xml.get('output') % outfile)
+  print(xml.get('test_end'))


 def read_options(infile):