Mercurial > repos > miller-lab > genome_diversity
comparison filter_gd_snp.xml @ 27:8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Mon, 15 Jul 2013 10:47:35 -0400 |
parents | 95a05c1ef5d5 |
children | a631c2f6d913 |
comparison
equal
deleted
inserted
replaced
26:91e835060ad2 | 27:8997f2ca8c7a |
---|---|
1 <tool id="gd_filter_gd_snp" name="Filter SNPs" version="1.1.0"> | 1 <tool id="gd_filter_gd_snp" name="Filter SNPs" version="1.2.0"> |
2 <description>: Discard some SNPs based on coverage or quality</description> | 2 <description>: Discard some SNPs based on coverage, quality or spacing</description> |
3 | 3 |
4 <command interpreter="python"> | 4 <command interpreter="python"> |
5 filter_gd_snp.py "$input" "$p1_input" "$output" "$lo_coverage" "$hi_coverage" "$low_ind_cov" "$lo_quality" | 5 #import json |
6 #for $individual, $individual_col in zip($input.dataset.metadata.individual_names, $input.dataset.metadata.individual_columns) | 6 #import base64 |
7 #set $arg = '%s:%s' % ($individual_col, $individual) | 7 #import zlib |
8 "$arg" | 8 #set $ind_names = $input.dataset.metadata.individual_names |
9 #end for | 9 #set $ind_colms = $input.dataset.metadata.individual_columns |
| | 10 #set $ind_dict = dict(zip($ind_names, $ind_colms)) |
| | 11 #set $ind_json = json.dumps($ind_dict, separators=(',',':')) |
| | 12 #set $ind_comp = zlib.compress($ind_json, 9) |
| | 13 #set $ind_arg = base64.b64encode($ind_comp) |
| | 14 filter_gd_snp.py '$input' '$output' |
| | 15 #if str($input.dataset.metadata.dbkey) == '?' |
| | 16 '0' |
| | 17 #else |
| | 18 '$input.dataset.metadata.ref' |
| | 19 #end if |
| | 20 '$min_spacing' '$lo_genotypes' '$input_type.p1_input' |
| | 21 #if $input_type.choice == '0' |
| | 22 'gd_snp' '$input_type.lo_coverage' '$input_type.hi_coverage' '$input_type.low_ind_cov' '$input_type.lo_quality' |
| | 23 #else if $input_type.choice == '1' |
| | 24 'gd_genotype' '0' '0' '0' '0' |
| | 25 #end if |
| | 26 '$ind_arg' |
10 </command> | 27 </command> |
11 | 28 |
12 <inputs> | 29 <inputs> |
13 <param name="input" type="data" format="gd_snp" label="SNP dataset" /> | 30 <conditional name="input_type"> |
14 <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" /> | 31 <param name="choice" type="select" format="integer" label="Input format"> |
15 <param name="lo_coverage" type="text" value="0" label="Lower bound on total coverage"> | 32 <option value="0" selected="true">gd_snp</option> |
16 <sanitizer> | 33 <option value="1">gd_genotype</option> |
17 <valid initial="string.digits"> | 34 </param> |
18 <!-- % is the percent (%) character --> | 35 <when value="0"> |
19 <add value="%" /> | 36 <param name="input" type="data" format="gd_snp" label="SNP dataset" /> |
20 </valid> | 37 <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" /> |
21 </sanitizer> | 38 <param name="lo_coverage" type="text" value="0" label="Lower bound on total coverage"> |
22 </param> | 39 <sanitizer> |
23 <param name="hi_coverage" type="text" value="1000" label="Upper bound on total coverage"> | 40 <valid initial="string.digits"> |
24 <sanitizer> | 41 <!-- % is the percent (%) character --> |
25 <valid initial="string.digits"> | 42 <add value="%" /> |
26 <!-- % is the percent (%) character --> | 43 </valid> |
27 <add value="%" /> | 44 </sanitizer> |
28 </valid> | 45 </param> |
29 </sanitizer> | 46 <param name="hi_coverage" type="text" value="1000" label="Upper bound on total coverage"> |
30 </param> | 47 <sanitizer> |
31 <param name="low_ind_cov" type="integer" min="0" value="0" label="Lower bound on individual coverage" /> | 48 <valid initial="string.digits"> |
32 <param name="lo_quality" type="integer" min="0" value="0" label="Lower bound on individual quality values" /> | 49 <!-- % is the percent (%) character --> |
| | 50 <add value="%" /> |
| | 51 </valid> |
| | 52 </sanitizer> |
| | 53 </param> |
| | 54 <param name="low_ind_cov" type="integer" min="0" value="0" label="Lower bound on individual coverage" /> |
| | 55 <param name="lo_quality" type="integer" min="0" value="0" label="Lower bound on individual quality values" /> |
| | 56 </when> |
| | 57 <when value="1"> |
| | 58 <param name="input" type="data" format="gd_genotype" label="Genotype dataset" /> |
| | 59 <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" /> |
| | 60 </when> |
| | 61 </conditional> |
| | 62 <param name="min_spacing" type="integer" min="0" value="0" label="Minimum spacing between SNPs" /> |
| | 63 <param name="lo_genotypes" type="integer" min="0" value="0" label="Lower bound on the number of defined genotypes" /> |
33 </inputs> | 64 </inputs> |
34 | 65 |
35 <outputs> | 66 <outputs> |
36 <data name="output" format="gd_snp" metadata_source="input" /> | 67 <data name="output" format="input" format_source="input" metadata_source="input" /> |
37 </outputs> | 68 </outputs> |
38 | 69 |
39 <tests> | 70 <tests> |
40 <test> | 71 <test> |
41 <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" /> | 72 <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" /> |
50 | 81 |
51 <help> | 82 <help> |
52 | 83 |
53 **Dataset formats** | 84 **Dataset formats** |
54 | 85 |
55 The input datasets are in gd_snp_ and gd_indivs_ formats. | 86 The input datasets are in gd_snp_, gd_genotype_, and gd_indivs_ formats. |
56 The output dataset is in gd_snp_ format. (`Dataset missing?`_) | 87 The output dataset is in gd_snp_ or gd_genotype_ format. (`Dataset missing?`_) |
57 | 88 |
58 .. _gd_snp: ./static/formatHelp.html#gd_snp | 89 .. _gd_snp: ./static/formatHelp.html#gd_snp |
| | 90 .. _gd_genotype: ./static/formatHelp.html#gd_genotype |
59 .. _gd_indivs: ./static/formatHelp.html#gd_indivs | 91 .. _gd_indivs: ./static/formatHelp.html#gd_indivs |
60 .. _Dataset missing?: ./static/formatHelp.html | 92 .. _Dataset missing?: ./static/formatHelp.html |
61 | 93 |
62 ----- | 94 ----- |
63 | 95 |
64 **What it does** | 96 **What it does** |
65 | 97 |
66 The user specifies that some of the individuals in a gd_snp dataset form a | 98 For a gd_snp dataset, the user specifies that some of the individuals |
67 "population", by supplying a list that has been previously created using the | 99 form a "population", by supplying a list that has been previously created |
68 Specify Individuals tool. SNPs are then discarded if their total coverage | 100 using the Specify Individuals tool. SNPs are then discarded if their |
69 for the population is too low or too high, or if their coverage or quality | 101 total coverage for the population is too low or too high, or if their |
70 score for any individual in the population is too low. | 102 coverage or quality score for any individual in the population is too low. |
71 | 103 |
72 The upper and lower bounds on total population coverage can be specified | 104 The upper and lower bounds on total population coverage can be specified |
73 either as read counts or as percentiles (e.g. "5%", with no decimal places). | 105 either as read counts or as percentiles (e.g. "5%", with no decimal |
74 For percentile bounds the SNPs are ranked by read count, so for example, a | 106 places). For percentile bounds the SNPs are ranked by read count, so |
75 lower bound of "10%" means that the least-covered 10% of the SNPs will be | 107 for example, a lower bound of "10%" means that the least-covered 10% |
76 discarded, while an upper bound of, say, "80%" will discard all SNPs above | 108 of the SNPs will be discarded, while an upper bound of, say, "80%" will |
77 the 80% mark, i.e. the top 20%. The threshold for the lower bound on | 109 discard all SNPs above the 80% mark, i.e. the top 20%. The threshold |
78 individual coverage can only be specified as a plain read count. | 110 for the lower bound on individual coverage can only be specified as a |
| | 111 plain read count. |
| | 112 |
| | 113 For either a gd_snp or gd_genotype dataset, the user can specify a |
| | 114 minimum number of defined genotypes (i.e., not -1) and/or a minimum |
| | 115 spacing relative to the reference sequence. An error is reported if the |
| | 116 user requests a minimum spacing but no reference sequence is available. |
79 | 117 |
80 ----- | 118 ----- |
81 | 119 |
82 **Example** | 120 **Example** |
83 | 121 |