Mercurial > repos > miller-lab > genome_diversity
annotate filter_gd_snp.xml @ 28:184d14e4270d
Update to Miller Lab devshed revision 4ede22dd5500
author | Richard Burhans <burhans@bx.psu.edu> |
---|---|
date | Wed, 17 Jul 2013 12:46:46 -0400 |
parents | 8997f2ca8c7a |
children | a631c2f6d913 |
rev | line source |
---|---|
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
1 <tool id="gd_filter_gd_snp" name="Filter SNPs" version="1.2.0"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
2 <description>: Discard some SNPs based on coverage, quality or spacing</description> |
13 | 3 |
4 <command interpreter="python"> | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
5 #import json |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
6 #import base64 |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
7 #import zlib |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
8 #set $ind_names = $input.dataset.metadata.individual_names |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
9 #set $ind_colms = $input.dataset.metadata.individual_columns |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
10 #set $ind_dict = dict(zip($ind_names, $ind_colms)) |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
11 #set $ind_json = json.dumps($ind_dict, separators=(',',':')) |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
12 #set $ind_comp = zlib.compress($ind_json, 9) |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
13 #set $ind_arg = base64.b64encode($ind_comp) |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
14 filter_gd_snp.py '$input' '$output' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
15 #if str($input.dataset.metadata.dbkey) == '?' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
16 '0' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
17 #else |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
18 '$input.dataset.metadata.ref' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
19 #end if |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
20 '$min_spacing' '$lo_genotypes' '$input_type.p1_input' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
21 #if $input_type.choice == '0' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
22 'gd_snp' '$input_type.lo_coverage' '$input_type.hi_coverage' '$input_type.low_ind_cov' '$input_type.lo_quality' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
23 #else if $input_type.choice == '1' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
24 'gd_genotype' '0' '0' '0' '0' |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
25 #end if |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
26 '$ind_arg' |
13 | 27 </command> |
28 | |
29 <inputs> | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
30 <conditional name="input_type"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
31 <param name="choice" type="select" format="integer" label="Input format"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
32 <option value="0" selected="true">gd_snp</option> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
33 <option value="1">gd_genotype</option> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
34 </param> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
35 <when value="0"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
36 <param name="input" type="data" format="gd_snp" label="SNP dataset" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
37 <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
38 <param name="lo_coverage" type="text" value="0" label="Lower bound on total coverage"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
39 <sanitizer> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
40 <valid initial="string.digits"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
41 <!-- permit the percent sign (%) so this bound may be entered as a percentile, e.g. "5%" --> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
42 <add value="%" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
43 </valid> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
44 </sanitizer> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
45 </param> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
46 <param name="hi_coverage" type="text" value="1000" label="Upper bound on total coverage"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
47 <sanitizer> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
48 <valid initial="string.digits"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
49 <!-- permit the percent sign (%) so this bound may be entered as a percentile, e.g. "80%" --> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
50 <add value="%" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
51 </valid> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
52 </sanitizer> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
53 </param> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
54 <param name="low_ind_cov" type="integer" min="0" value="0" label="Lower bound on individual coverage" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
55 <param name="lo_quality" type="integer" min="0" value="0" label="Lower bound on individual quality values" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
56 </when> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
57 <when value="1"> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
58 <param name="input" type="data" format="gd_genotype" label="Genotype dataset" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
59 <param name="p1_input" type="data" format="gd_indivs" label="Population individuals" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
60 </when> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
61 </conditional> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
62 <param name="min_spacing" type="integer" min="0" value="0" label="Minimum spacing between SNPs" /> |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
63 <param name="lo_genotypes" type="integer" min="0" value="0" label="Lower bound on the number of defined genotypes" /> |
13 | 64 </inputs> |
65 | |
66 <outputs> | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
67 <data name="output" format="input" format_source="input" metadata_source="input" /> |
13 | 68 </outputs> |
69 | |
70 <tests> | |
71 <test> | |
72 <param name="input" value="test_in/sample.gd_snp" ftype="gd_snp" /> | |
73 <param name="p1_input" value="test_in/a.gd_indivs" ftype="gd_indivs" /> | |
74 <param name="lo_coverage" value="0" /> | |
75 <param name="hi_coverage" value="1000" /> | |
76 <param name="low_ind_cov" value="3" /> | |
77 <param name="lo_quality" value="30" /> | |
78 <output name="output" file="test_out/modify_snp_table/modify.gd_snp" /> | |
79 </test> | |
80 </tests> | |
81 | |
82 <help> | |
83 | |
84 **Dataset formats** | |
85 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
86 The input datasets are in gd_snp_, gd_genotype_, and gd_indivs_ formats. |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
87 The output dataset is in gd_snp_ or gd_genotype_ format. (`Dataset missing?`_) |
13 | 88 |
89 .. _gd_snp: ./static/formatHelp.html#gd_snp | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
90 .. _gd_genotype: ./static/formatHelp.html#gd_genotype |
13 | 91 .. _gd_indivs: ./static/formatHelp.html#gd_indivs |
92 .. _Dataset missing?: ./static/formatHelp.html | |
93 | |
94 ----- | |
95 | |
96 **What it does** | |
97 | |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
98 For a gd_snp dataset, the user specifies that some of the individuals |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
99 form a "population", by supplying a list that has been previously created |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
100 using the Specify Individuals tool. SNPs are then discarded if their |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
101 total coverage for the population is too low or too high, or if their |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
102 coverage or quality score for any individual in the population is too low. |
13 | 103 |
22
95a05c1ef5d5
update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
18
diff
changeset
|
104 The upper and lower bounds on total population coverage can be specified |
27
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
105 either as read counts or as percentiles (e.g. "5%", with no decimal |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
106 places). For percentile bounds the SNPs are ranked by read count, so |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
107 for example, a lower bound of "10%" means that the least-covered 10% |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
108 of the SNPs will be discarded, while an upper bound of, say, "80%" will |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
109 discard all SNPs above the 80% mark, i.e. the top 20%. The threshold |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
110 for the lower bound on individual coverage can only be specified as a |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
111 plain read count. |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
112 |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
113 For either a gd_snp or gd_genotype dataset, the user can specify a |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
114 minimum number of defined genotypes (i.e., not -1) and/or a minimum |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
115 spacing relative to the reference sequence. An error is reported if the |
8997f2ca8c7a
Update to Miller Lab devshed revision bae0d3306d3b
Richard Burhans <burhans@bx.psu.edu>
parents:
22
diff
changeset
|
116 user requests a minimum spacing but no reference sequence is available. |
22
95a05c1ef5d5
update to devshed revision aaece207bd01
Richard Burhans <burhans@bx.psu.edu>
parents:
18
diff
changeset
|
117 |
13 | 118 ----- |
119 | |
120 **Example** | |
121 | |
122 - input gd_snp:: | |
123 | |
124 Contig161_chr1_4641264_4641879 115 C T 73.5 chr1 4641382 C 6 0 2 45 8 0 2 51 15 0 2 72 5 0 2 42 6 0 2 45 10 0 2 57 Y 54 0.323 0 | |
125 Contig48_chr1_10150253_10151311 11 A G 94.3 chr1 10150264 A 1 0 2 30 1 0 2 30 1 0 2 30 3 0 2 36 1 0 2 30 1 0 2 30 Y 22 +99. 0 | |
126 Contig20_chr1_21313469_21313570 66 C T 54.0 chr1 21313534 C 4 0 2 39 4 0 2 39 5 0 2 42 4 0 2 39 4 0 2 39 5 0 2 42 N 1 +99. 0 | |
127 etc. | |
128 | |
129 - input individuals:: | |
130 | |
131 9 PB1 | |
132 13 PB2 | |
133 17 PB3 | |
134 | |
135 - output when the lower bound on individual coverage is "3":: | |
136 | |
137 Contig161_chr1_4641264_4641879 115 C T 73.5 chr1 4641382 C 6 0 2 45 8 0 2 51 15 0 2 72 5 0 2 42 6 0 2 45 10 0 2 57 Y 54 0.323 0 | |
138 Contig20_chr1_21313469_21313570 66 C T 54.0 chr1 21313534 C 4 0 2 39 4 0 2 39 5 0 2 42 4 0 2 39 4 0 2 39 5 0 2 42 N 1 +99. 0 | |
139 etc. | |
140 | |
141 </help> | |
142 </tool> |