genome_diversity: filter_gd_snp.xml comparison

comparison filter_gd_snp.xml @ 27:8997f2ca8c7a

Update to Miller Lab devshed revision bae0d3306d3b

author	Richard Burhans <burhans@bx.psu.edu>
date	Mon, 15 Jul 2013 10:47:35 -0400
parents	95a05c1ef5d5
children	a631c2f6d913

comparison

equal deleted inserted replaced

-:91e835060ad2
+:8997f2ca8c7a
-<tool id="gd_filter_gd_snp" name="Filter SNPs" version="1.1.0">
+<tool id="gd_filter_gd_snp" name="Filter SNPs" version="1.2.0">
-<description>: Discard some SNPs based on coverage or quality</description>
+<description>: Discard some SNPs based on coverage, quality or spacing</description>
 <command interpreter="python">
-filter_gd_snp.py "$input" "$p1_input" "$output" "$lo_coverage" "$hi_coverage" "$low_ind_cov" "$lo_quality"
+#import json
-#for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns)
+#import base64
-#set $arg = '%s:%s' % ($individual_col, $individual)
+#import zlib
-"$arg"
+#set $ind_names = $input.dataset.metadata.individual_names
-#end for
+#set $ind_colms = $input.dataset.metadata.individual_columns
+#set $ind_dict = dict(zip($ind_names, $ind_colms))
+#set $ind_json = json.dumps($ind_dict, separators=(',',':'))
+#set $ind_comp = zlib.compress($ind_json, 9)
+#set $ind_arg = base64.b64encode($ind_comp)
+filter_gd_snp.py '$input' '$output'
+#if str($input.dataset.metadata.dbkey) == '?'
+'0'
+#else
+'$input.dataset.metadata.ref'
+#end if
+'$min_spacing' '$lo_genotypes' '$input_type.p1_input'
+#if $input_type.choice == '0'
+'gd_snp' '$input_type.lo_coverage' '$input_type.hi_coverage' '$input_type.low_ind_cov' '$input_type.lo_quality'
+#else if $input_type.choice == '1'
+'gd_genotype' '0' '0' '0' '0'
+#end if
+'$ind_arg'
 </command>
 <inputs>
-<param name="input" type="data" format="gd_snp" label="SNP dataset" />
+<conditional name="input_type">
-<param name="p1_input" type="data" format="gd_indivs" label="Population individuals" />
+<param name="choice" type="select" format="integer" label="Input format">
-<param name="lo_coverage" type="text" value="0" label="Lower bound on total coverage">
+<option value="0" selected="true">gd_snp</option>
-<sanitizer>
+<option value="1">gd_genotype</option>
-<valid initial="string.digits">
+</param>
-<!-- &#37; is the percent (%) character -->
+<when value="0">
-<add value="&#37;" />
+<param name="input" type="data" format="gd_snp" label="SNP dataset" />
-</valid>
+<param name="p1_input" type="data" format="gd_indivs" label="Population individuals" />
-</sanitizer>
+<param name="lo_coverage" type="text" value="0" label="Lower bound on total coverage">
-</param>
+<sanitizer>
-<param name="hi_coverage" type="text" value="1000" label="Upper bound on total coverage">
+<valid initial="string.digits">
-<sanitizer>
+<!-- &#37; is the percent (%) character -->
-<valid initial="string.digits">
+<add value="&#37;" />
-<!-- &#37; is the percent (%) character -->
+</valid>
-<add value="&#37;" />
+</sanitizer>
-</valid>
+</param>
-</sanitizer>
+<param name="hi_coverage" type="text" value="1000" label="Upper bound on total coverage">
-</param>
+<sanitizer>
-<param name="low_ind_cov" type="integer" min="0" value="0" label="Lower bound on individual coverage" />
+<valid initial="string.digits">
-<param name="lo_quality" type="integer" min="0" value="0" label="Lower bound on individual quality values" />
+<!-- &#37; is the percent (%) character -->
+<add value="&#37;" />
+</valid>
+</sanitizer>
+</param>
+<param name="low_ind_cov" type="integer" min="0" value="0" label="Lower bound on individual coverage" />
+<param name="lo_quality" type="integer" min="0" value="0" label="Lower bound on individual quality values" />
+</when>
+<when value="1">
+<param name="input" type="data" format="gd_genotype" label="Genotype dataset" />
+<param name="p1_input" type="data" format="gd_indivs" label="Population individuals" />
+</when>
+</conditional>
+<param name="min_spacing" type="integer" min="0" value="0" label="Minimum spacing between SNPs" />
+<param name="lo_genotypes" type="integer" min="0" value="0" label="Lower bound on the number of defined genotypes" />
 </inputs>
 <outputs>
-<data name="output" format="gd_snp" metadata_source="input" />
+<data name="output" format="input" format_source="input" metadata_source="input" />
 </outputs>
 <tests>
 <test>
 <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" />
 <help>
 **Dataset formats**
-The input datasets are in gd_snp_ and gd_indivs_ formats.
+The input datasets are in gd_snp_, gd_genotype_, and gd_indivs_ formats.
-The output dataset is in gd_snp_ format.  (`Dataset missing?`_)
+The output dataset is in the same format as the input dataset (gd_snp_ or gd_genotype_).  (`Dataset missing?`_)
 .. _gd_snp: ./static/formatHelp.html#gd_snp
+.. _gd_genotype: ./static/formatHelp.html#gd_genotype
 .. _gd_indivs: ./static/formatHelp.html#gd_indivs
 .. _Dataset missing?: ./static/formatHelp.html
 -----
 **What it does**
-The user specifies that some of the individuals in a gd_snp dataset form a
+For a gd_snp dataset, the user specifies that some of the individuals
-"population", by supplying a list that has been previously created using the
+form a "population", by supplying a list that has been previously created
-Specify Individuals tool.  SNPs are then discarded if their total coverage
+using the Specify Individuals tool.  SNPs are then discarded if their
-for the population is too low or too high, or if their coverage or quality
+total coverage for the population is too low or too high, or if their
-score for any individual in the population is too low.
+coverage or quality score for any individual in the population is too low.
 The upper and lower bounds on total population coverage can be specified
-either as read counts or as percentiles (e.g. "5%", with no decimal places).
+either as read counts or as percentiles (e.g. "5%", with no decimal
-For percentile bounds the SNPs are ranked by read count, so for example, a
+places).  For percentile bounds the SNPs are ranked by read count, so
-lower bound of "10%" means that the least-covered 10% of the SNPs will be
+for example, a lower bound of "10%" means that the least-covered 10%
-discarded, while an upper bound of, say, "80%" will discard all SNPs above
+of the SNPs will be discarded, while an upper bound of, say, "80%" will
-the 80% mark, i.e. the top 20%.  The threshold for the lower bound on
+discard all SNPs above the 80% mark, i.e. the top 20%.  The threshold
-individual coverage can only be specified as a plain read count.
+for the lower bound on individual coverage can only be specified as a
+plain read count.
+For either a gd_snp or gd_genotype dataset, the user can specify a
+minimum number of defined genotypes (i.e., genotype values other than -1)
+and/or a minimum spacing between SNPs relative to the reference sequence.
+An error is reported if the user requests a minimum spacing but no
+reference sequence is available.
 -----
 **Example**

Mercurial > repos > miller-lab > genome_diversity

comparison filter_gd_snp.xml @ 27:8997f2ca8c7a