0
|
1 <tool id="rgClean1" name="Clean genotypes:">
|
|
2 <description>filter markers, subjects</description>
|
|
3
|
|
<!-- Invokes rgClean.py with 15 positional, single-quoted arguments, in this order:
     input_file.extra_files_path, input_file.metadata.base_name, title, mind,
     geno, hwe, maf, mef, mei, out_file1, out_file1.files_path,
     relfilter, afffilter, sexfilter, fixaff.
     Note that this order is not the order in which the params appear in the form
     (for example, mind precedes geno here), so keep both in step when editing. -->
4 <command interpreter="python">
|
|
5 rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind'
|
|
6 '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path'
|
|
7 '$relfilter' '$afffilter' '$sexfilter' '$fixaff'
|
|
8 </command>
|
|
9
|
|
10 <inputs> <!-- Form fields; each name below is substituted into the rgClean.py command line -->
|
|
11 <param name="input_file" type="data" format="pbed" size="120"
|
|
12 label="RGenetics genotype library file in compressed Plink format"/>
|
|
13 <param name="title" type="text" label="Descriptive title for cleaned genotype file" value="Cleaned_data" size="80"/>
|
|
14 <param name="geno" type="text" label="Maximum Missing Fraction: Markers" value="0.05"/>
|
|
15 <param name="mind" type="text" label="Maximum Missing Fraction: Subjects" value="0.1"/>
|
|
16 <param name="mef" type="text" label="Maximum Mendel Error Rate: Family" value="0.05"/>
|
|
17 <param name="mei" type="text" label="Maximum Mendel Error Rate: Marker" value="0.05"/>
|
|
18 <param name="hwe" type="text" label="Smallest HWE p value (set to 0 for all)" value="0"/>
|
|
19 <param name="maf" type="text" label="Smallest Minor Allele Frequency (set to 0 for all)"
|
|
20 value="0.01"/>
|
|
21 <param name="relfilter" type="select" label="Filter on pedigree relatedness"
|
|
22 size="132" optional="false"
|
|
23 help="Optionally remove related subjects if pedigree identifies founders and their offspring">
|
|
24 <option value="all" selected="true">No filter on relatedness</option>
|
|
25 <option value="fo">Keep Founders only (pedigree m/f ID = "0")</option>
|
|
26 <option value="oo">Keep Offspring only (one randomly chosen if >1 sibs in family)</option>
|
|
27 </param>
|
|
28 <param name="afffilter" type="select" label="Filter on affection status"
|
|
29 size="132" optional="false"
|
|
30 help="Optionally remove affected or non affected subjects">
|
|
31 <option value="allaff" selected="true">No filter on affection status</option>
|
|
32 <option value="affonly">Keep Controls only (affection='1')</option>
|
|
33 <option value="unaffonly">Keep Cases only (affection='2')</option>
|
|
34 </param> <!-- NOTE(review): value "affonly" is labelled "Keep Controls only" and "unaffonly" is labelled "Keep Cases only"; confirm against rgClean.py that each label describes what its value keeps -->
|
|
35 <param name="sexfilter" type="select" label="Filter on gender"
|
|
36 size="132" optional="false"
|
|
37 help="Optionally remove all male or all female subjects">
|
|
38 <option value="allsex" selected="true">No filter on gender status</option>
|
|
39 <option value="msex">Keep Males only (pedigree gender='1')</option>
|
|
40 <option value="fsex">Keep Females only (pedigree gender='2')</option>
|
|
41 </param>
|
|
42 <param name="fixaff" type="text"
|
|
43 label="Change ALL subjects affection status to (0=no change,1=unaff,2=aff)" value="0"
|
|
44 help="Use this option to switch the affection status to a new value for all output subjects"/>
|
|
45 </inputs>
|
|
46
|
|
47 <outputs> <!-- One pbed dataset; it takes its metadata from input_file and its label from the title param -->
|
|
48 <data name="out_file1" format="pbed" label="${title}_rgClean.pbed" metadata_source="input_file"/>
|
|
49 </outputs>
|
|
50
|
|
51 <tests>
|
|
52 <test> <!-- tinywga composite input; relfilter, afffilter and sexfilter use their no-filter options and fixaff is 0 (no change) -->
|
|
53 <param name="input_file" value="tinywga" ftype="pbed">
|
|
54 <metadata name="base_name" value="tinywga"/>
|
|
55 <composite_data value="tinywga.bim"/>
|
|
56 <composite_data value="tinywga.bed"/>
|
|
57 <composite_data value="tinywga.fam"/>
|
|
58 <edit_attributes type="name" value="tinywga"/>
|
|
59 </param>
|
|
60 <param name="title" value="rgCleantest1"/>
|
|
61 <param name="geno" value="1"/>
|
|
62 <param name="mind" value="1"/>
|
|
63 <param name="mef" value="0"/>
|
|
64 <param name="mei" value="0"/>
|
|
65 <param name="hwe" value="0"/>
|
|
66 <param name="maf" value="0"/>
|
|
67 <param name="relfilter" value="all"/>
|
|
68 <param name="afffilter" value="allaff"/>
|
|
69 <param name="sexfilter" value="allsex"/>
|
|
70 <param name="fixaff" value="0"/>
|
|
71 <output name="out_file1" file="rgtestouts/rgClean/rgCleantest1.pbed" compare="diff" lines_diff="25">
|
|
72 <extra_files type="file" name="rgCleantest1.bim" value="rgtestouts/rgClean/rgCleantest1.bim" compare="diff"/>
|
|
73 <extra_files type="file" name="rgCleantest1.fam" value="rgtestouts/rgClean/rgCleantest1.fam" compare="diff"/>
|
|
74 <extra_files type="file" name="rgCleantest1.bed" value="rgtestouts/rgClean/rgCleantest1.bed" compare="diff"/>
|
|
75 </output>
|
|
76 </test>
|
|
77 </tests>
|
|
78 <help>
|
|
79
|
|
80 .. class:: infomark
|
|
81
|
|
82 **Syntax**
|
|
83
|
|
84 - **Genotype data** is the input genotype file chosen from your current history
|
|
85 - **Descriptive title** is the name to use for the filtered output file
|
|
86 - **Missfrac threshold: subjects** is the threshold for missingness by subject. Subjects with more than this fraction missing will be excluded from the output
|
|
87 - **Missfrac threshold: markers** is the threshold for missingness by marker. Markers with more than this fraction missing will be excluded from the output
|
|
88 - **MaxMendel Individuals** Mendel error fraction above which to exclude subjects with more than the specified fraction of mendelian errors in transmission (for family data only)
|
|
89 - **MaxMendel Families** Mendel error fraction above which to exclude families with more than the specified fraction of mendelian errors in transmission (for family data only)
|
|
90 - **HWE** is the threshold for HWE test p values below which the marker will be excluded. Set this to 0 and all markers will be kept regardless of HWE p value
|
|
91 - **MAF** is the threshold for minor allele frequency - SNPs with lower MAF will be excluded
|
|
92 - **Filters** for founders/offspring or affected/unaffected or males/females are optionally available if needed
|
|
93 - **Change Affection** is only needed if you want to change the affection status for creating new analysis datasets
|
|
94
|
|
95 -----
|
|
96
|
|
97 **Attribution**
|
|
98
|
|
99 This tool relies on the work of many people. It uses Plink http://pngu.mgh.harvard.edu/~purcell/plink/,
|
|
100 and the R http://cran.r-project.org/ and
|
|
101 Bioconductor http://www.bioconductor.org/ projects.
|
|
102
|
|
103
|
|
104 In particular, http://pngu.mgh.harvard.edu/~purcell/plink/
|
|
105 has excellent documentation describing the parameters you can set here.
|
|
106
|
|
107 This implementation is a Galaxy tool wrapper around these third party applications.
|
|
108 It was originally designed and written for family based data from the CAMP Illumina run of 2007 by
|
|
109 Ross Lazarus (ross.lazarus@gmail.com) and incorporated into the rgenetics toolkit.
|
|
110
|
|
111 Rgenetics merely exposes them, wrapping Plink so you can use it in Galaxy.
|
|
112
|
|
113 -----
|
|
114
|
|
115 **Summary**
|
|
116
|
|
117 Reliable statistical inference depends on reliable data. Poor quality samples and markers
|
|
118 may add more noise than signal, decreasing statistical power. Removing the worst of them
|
|
119 can be done by setting thresholds for some of the commonly used technical quality measures
|
|
120 for genotype data. Of course discordant replicate calls are also very informative but are not
|
|
121 in scope here.
|
|
122
|
|
123 Marker cleaning: Filters are available to remove markers below a specific minor allele
|
|
124 frequency, beyond a Hardy-Weinberg threshold,
|
|
125 or above a threshold for missingness. If family data are available, thresholds for Mendelian
|
|
126 error can be set.
|
|
127
|
|
128 Subject cleaning: Filters are available to remove subjects with many missing calls. Subjects and markers for family data can be filtered by proportions
|
|
129 of Mendelian errors in observed transmission. Use the QC reporting tool to
|
|
130 generate a comprehensive series of reports for quality control.
|
|
131
|
|
132 Note that ancestry and cryptic relatedness should also be checked using the relevant tools.
|
|
133
|
|
134 -----
|
|
135
|
|
136 .. class:: infomark
|
|
137
|
|
138 **Tip**
|
|
139
|
|
140 You can check that you got what you asked for by running the QC tool to ensure that the distributions
|
|
141 are truncated the way you expect. Note that you do not expect that the thresholds will be exactly
|
|
142 what you set - some bad assays and subjects are out in multiple QC measures, so you sometimes have
|
|
143 more samples or markers than you exactly set for each threshold. Finally, the ordering of
|
|
144 operations matters and Plink is somewhat restrictive about what it will do on each pass
|
|
145 of the data. At least it's fixed.
|
|
146
|
|
147 -----
|
|
148
|
|
149 This Galaxy tool was written by Ross Lazarus for the Rgenetics project.
|
|
150 It uses Plink for most calculations - for full Plink attribution, source code and documentation,
|
|
151 please see http://pngu.mgh.harvard.edu/~purcell/plink/ plus some custom python code
|
|
152
|
|
153 </help>
|
|
154 </tool>
|