comparison filter_gd_snp.xml @ 27:8997f2ca8c7a

Update to Miller Lab devshed revision bae0d3306d3b
author Richard Burhans <burhans@bx.psu.edu>
date Mon, 15 Jul 2013 10:47:35 -0400
parents 95a05c1ef5d5
children a631c2f6d913
comparison
equal deleted inserted replaced
26:91e835060ad2 27:8997f2ca8c7a
1 <tool id="gd_filter_gd_snp" name="Filter SNPs" version="1.1.0"> 1 <tool id="gd_filter_gd_snp" name="Filter SNPs" version="1.2.0">
2 <description>: Discard some SNPs based on coverage or quality</description> 2 <description>: Discard some SNPs based on coverage, quality or spacing</description>
3 3
4 <command interpreter="python"> 4 <command interpreter="python">
5 filter_gd_snp.py "$input" "$p1_input" "$output" "$lo_coverage" "$hi_coverage" "$low_ind_cov" "$lo_quality" 5 #import json
6 #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) 6 #import base64
7 #set $arg = '%s:%s' % ($individual_col, $individual) 7 #import zlib
8 "$arg" 8 #set $ind_names = $input.dataset.metadata.individual_names
9 #end for 9 #set $ind_colms = $input.dataset.metadata.individual_columns
10 #set $ind_dict = dict(zip($ind_names, $ind_colms))
11 #set $ind_json = json.dumps($ind_dict, separators=(',',':'))
12 #set $ind_comp = zlib.compress($ind_json, 9)
13 #set $ind_arg = base64.b64encode($ind_comp)
14 filter_gd_snp.py '$input' '$output'
15 #if str($input.dataset.metadata.dbkey) == '?'
16 '0'
17 #else
18 '$input.dataset.metadata.ref'
19 #end if
20 '$min_spacing' '$lo_genotypes' '$input_type.p1_input'
21 #if $input_type.choice == '0'
22 'gd_snp' '$input_type.lo_coverage' '$input_type.hi_coverage' '$input_type.low_ind_cov' '$input_type.lo_quality'
23 #else if $input_type.choice == '1'
24 'gd_genotype' '0' '0' '0' '0'
25 #end if
26 '$ind_arg'
10 </command> 27 </command>
11 28
12 <inputs> 29 <inputs>
13 <param name="input" type="data" format="gd_snp" label="SNP dataset" /> 30 <conditional name="input_type">
14 <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" /> 31 <param name="choice" type="select" format="integer" label="Input format">
15 <param name="lo_coverage" type="text" value="0" label="Lower bound on total coverage"> 32 <option value="0" selected="true">gd_snp</option>
16 <sanitizer> 33 <option value="1">gd_genotype</option>
17 <valid initial="string.digits"> 34 </param>
18 <!-- &#37; is the percent (%) character --> 35 <when value="0">
19 <add value="&#37;" /> 36 <param name="input" type="data" format="gd_snp" label="SNP dataset" />
20 </valid> 37 <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" />
21 </sanitizer> 38 <param name="lo_coverage" type="text" value="0" label="Lower bound on total coverage">
22 </param> 39 <sanitizer>
23 <param name="hi_coverage" type="text" value="1000" label="Upper bound on total coverage"> 40 <valid initial="string.digits">
24 <sanitizer> 41 <!-- &#37; is the percent (%) character -->
25 <valid initial="string.digits"> 42 <add value="&#37;" />
26 <!-- &#37; is the percent (%) character --> 43 </valid>
27 <add value="&#37;" /> 44 </sanitizer>
28 </valid> 45 </param>
29 </sanitizer> 46 <param name="hi_coverage" type="text" value="1000" label="Upper bound on total coverage">
30 </param> 47 <sanitizer>
31 <param name="low_ind_cov" type="integer" min="0" value="0" label="Lower bound on individual coverage" /> 48 <valid initial="string.digits">
32 <param name="lo_quality" type="integer" min="0" value="0" label="Lower bound on individual quality values" /> 49 <!-- &#37; is the percent (%) character -->
50 <add value="&#37;" />
51 </valid>
52 </sanitizer>
53 </param>
54 <param name="low_ind_cov" type="integer" min="0" value="0" label="Lower bound on individual coverage" />
55 <param name="lo_quality" type="integer" min="0" value="0" label="Lower bound on individual quality values" />
56 </when>
57 <when value="1">
58 <param name="input" type="data" format="gd_genotype" label="Genotype dataset" />
59 <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" />
60 </when>
61 </conditional>
62 <param name="min_spacing" type="integer" min="0" value="0" label="Minimum spacing between SNPs" />
63 <param name="lo_genotypes" type="integer" min="0" value="0" label="Lower bound on the number of defined genotypes" />
33 </inputs> 64 </inputs>
34 65
35 <outputs> 66 <outputs>
36 <data name="output" format="gd_snp" metadata_source="input" /> 67 <data name="output" format="input" format_source="input" metadata_source="input" />
37 </outputs> 68 </outputs>
38 69
39 <tests> 70 <tests>
40 <test> 71 <test>
41 <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" /> 72 <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" />
50 81
51 <help> 82 <help>
52 83
53 **Dataset formats** 84 **Dataset formats**
54 85
55 The input datasets are in gd_snp_ and gd_indivs_ formats. 86 The input datasets are in gd_snp_, gd_genotype_, and gd_indivs_ formats.
56 The output dataset is in gd_snp_ format. (`Dataset missing?`_) 87 The output dataset is in gd_snp_ or gd_genotype_ format. (`Dataset missing?`_)
57 88
58 .. _gd_snp: ./static/formatHelp.html#gd_snp 89 .. _gd_snp: ./static/formatHelp.html#gd_snp
90 .. _gd_genotype: ./static/formatHelp.html#gd_genotype
59 .. _gd_indivs: ./static/formatHelp.html#gd_indivs 91 .. _gd_indivs: ./static/formatHelp.html#gd_indivs
60 .. _Dataset missing?: ./static/formatHelp.html 92 .. _Dataset missing?: ./static/formatHelp.html
61 93
62 ----- 94 -----
63 95
64 **What it does** 96 **What it does**
65 97
66 The user specifies that some of the individuals in a gd_snp dataset form a 98 For a gd_snp dataset, the user specifies that some of the individuals
67 "population", by supplying a list that has been previously created using the 99 form a "population", by supplying a list that has been previously created
68 Specify Individuals tool. SNPs are then discarded if their total coverage 100 using the Specify Individuals tool. SNPs are then discarded if their
69 for the population is too low or too high, or if their coverage or quality 101 total coverage for the population is too low or too high, or if their
70 score for any individual in the population is too low. 102 coverage or quality score for any individual in the population is too low.
71 103
72 The upper and lower bounds on total population coverage can be specified 104 The upper and lower bounds on total population coverage can be specified
73 either as read counts or as percentiles (e.g. "5%", with no decimal places). 105 either as read counts or as percentiles (e.g. "5%", with no decimal
74 For percentile bounds the SNPs are ranked by read count, so for example, a 106 places). For percentile bounds the SNPs are ranked by read count, so
75 lower bound of "10%" means that the least-covered 10% of the SNPs will be 107 for example, a lower bound of "10%" means that the least-covered 10%
76 discarded, while an upper bound of, say, "80%" will discard all SNPs above 108 of the SNPs will be discarded, while an upper bound of, say, "80%" will
77 the 80% mark, i.e. the top 20%. The threshold for the lower bound on 109 discard all SNPs above the 80% mark, i.e. the top 20%. The threshold
78 individual coverage can only be specified as a plain read count. 110 for the lower bound on individual coverage can only be specified as a
111 plain read count.
112
113 For either a gd_snp or gd_genotype dataset, the user can also specify a
114 minimum number of defined genotypes (i.e., genotype values other than -1)
115 and/or a minimum spacing between SNPs relative to the reference sequence.
116 An error is reported if the user requests a minimum spacing but no reference sequence is available.
79 117
80 ----- 118 -----
81 119
82 **Example** 120 **Example**
83 121