Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgClean.xml @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 <tool id="rgClean1" name="Clean genotypes:"> | |
2 <description>filter markers, subjects</description> | |
3 <!-- Runs rgClean.py under the python interpreter with 15 single-quoted positional arguments: input extra_files_path, input base_name, title, mind, geno, hwe, maf, mef, mei, out_file1, out_file1.files_path, relfilter, afffilter, sexfilter, fixaff. This order differs from the order of the form fields in the inputs section (for example mind comes before geno here). --> | |
4 <command interpreter="python"> | |
5 rgClean.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title' '$mind' | |
6 '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' '$out_file1.files_path' | |
7 '$relfilter' '$afffilter' '$sexfilter' '$fixaff' | |
8 </command> | |
9 | |
10 <inputs> <!-- Form fields. The numeric thresholds and fixaff are declared as type="text" with no validator elements, so this wrapper itself does no numeric range checking. --> | |
11 <param name="input_file" type="data" label="RGenetics genotype library file in compressed Plink format" | |
12 size="120" format="pbed" /> | |
13 <param name="title" type="text" size="80" label="Descriptive title for cleaned genotype file" value="Cleaned_data"/> | |
14 <param name="geno" type="text" label="Maximum Missing Fraction: Markers" value="0.05" /> | |
15 <param name="mind" type="text" value="0.1" label="Maximum Missing Fraction: Subjects"/> | |
16 <param name="mef" type="text" label="Maximum Mendel Error Rate: Family" value="0.05"/> | |
17 <param name="mei" type="text" label="Maximum Mendel Error Rate: Marker" value="0.05"/> | |
18 <param name="hwe" type="text" value="0" label="Smallest HWE p value (set to 0 for all)" /> | |
19 <param name="maf" type="text" value="0.01" | |
20 label="Smallest Minor Allele Frequency (set to 0 for all)"/> | |
21 <param name='relfilter' label = "Filter on pedigree relatedness" type="select" | |
22 optional="false" size="132" | |
23 help="Optionally remove related subjects if pedigree identifies founders and their offspring"> | |
24 <option value="all" selected='true'>No filter on relatedness</option> | |
25 <option value="fo" >Keep Founders only (pedigree m/f ID = "0")</option> | |
26 <option value="oo" >Keep Offspring only (one randomly chosen if >1 sibs in family)</option> | |
27 </param> | |
28 <param name='afffilter' label = "Filter on affection status" type="select" | |
29 optional="false" size="132" | |
30 help="Optionally remove affected or non affected subjects"> | |
31 <option value="allaff" selected='true'>No filter on affection status</option> | |
32 <option value="affonly" >Keep Controls only (affection='1')</option> | |
33 <option value="unaffonly" >Keep Cases only (affection='2')</option> | |
34 </param> <!-- NOTE(review): the afffilter option values and labels look crossed: value "affonly" is labelled "Keep Controls only (affection='1')" and value "unaffonly" is labelled "Keep Cases only (affection='2')". Confirm against rgClean.py which subjects each value actually keeps, then align the labels or the value names. --> | |
35 <param name='sexfilter' label = "Filter on gender" type="select" | |
36 optional="false" size="132" | |
37 help="Optionally remove all male or all female subjects"> | |
38 <option value="allsex" selected='true'>No filter on gender status</option> | |
39 <option value="msex" >Keep Males only (pedigree gender='1')</option> | |
40 <option value="fsex" >Keep Females only (pedigree gender='2')</option> | |
41 </param> | |
42 <param name="fixaff" type="text" value="0" | |
43 label = "Change ALL subjects affection status to (0=no change,1=unaff,2=aff)" | |
44 help="Use this option to switch the affection status to a new value for all output subjects" /> | |
45 </inputs> | |
46 | |
47 <outputs> <!-- Single pbed output: metadata_source points at input_file, and the history label is built from the title parameter. --> | |
48 <data format="pbed" name="out_file1" metadata_source="input_file" label="${title}_rgClean.pbed" /> | |
49 </outputs> | |
50 | |
51 <tests> <!-- One functional test: feeds the tinywga pbed composite (bim, bed, fam) through the tool and compares the primary output and its three extra files with the references under rgtestouts/rgClean/ using diff; the primary output tolerates up to 25 differing lines. --> | |
52 <test> | |
53 <param name='input_file' value='tinywga' ftype='pbed' > | |
54 <metadata name='base_name' value='tinywga' /> | |
55 <composite_data value='tinywga.bim' /> | |
56 <composite_data value='tinywga.bed' /> | |
57 <composite_data value='tinywga.fam' /> | |
58 <edit_attributes type='name' value='tinywga' /> | |
59 </param> | |
60 <param name='title' value='rgCleantest1' /> | |
61 <param name="geno" value="1" /> | |
62 <param name="mind" value="1" /> | |
63 <param name="mef" value="0" /> | |
64 <param name="mei" value="0" /> | |
65 <param name="hwe" value="0" /> | |
66 <param name="maf" value="0" /> | |
67 <param name="relfilter" value="all" /> | |
68 <param name="afffilter" value="allaff" /> | |
69 <param name="sexfilter" value="allsex" /> | |
70 <param name="fixaff" value="0" /> | |
71 <output name='out_file1' file='rgtestouts/rgClean/rgCleantest1.pbed' compare="diff" lines_diff="25" > | |
72 <extra_files type="file" name='rgCleantest1.bim' value="rgtestouts/rgClean/rgCleantest1.bim" compare="diff" /> | |
73 <extra_files type="file" name='rgCleantest1.fam' value="rgtestouts/rgClean/rgCleantest1.fam" compare="diff" /> | |
74 <extra_files type="file" name='rgCleantest1.bed' value="rgtestouts/rgClean/rgCleantest1.bed" compare="diff" /> | |
75 </output> | |
76 </test> | |
77 </tests> | |
78 <help> | |
79 | |
80 .. class:: infomark | |
81 | |
82 **Syntax** | |
83 | |
84 - **Genotype data** is the input genotype file chosen from your current history | |
85 - **Descriptive title** is the name to use for the filtered output file | |
86 - **Maximum Missing Fraction: Subjects** is the threshold for missingness by subject. Subjects with more than this fraction missing will be excluded from the cleaned output | |
87 - **Maximum Missing Fraction: Markers** is the threshold for missingness by marker. Markers with more than this fraction missing will be excluded from the cleaned output | |
88 - **Maximum Mendel Error Rate: Marker** is the fraction of Mendelian errors in transmission above which a marker will be excluded (for family data only) | |
89 - **Maximum Mendel Error Rate: Family** is the fraction of Mendelian errors in transmission above which a family will be excluded (for family data only) | |
90 - **HWE** is the threshold for HWE test p values below which the marker will be excluded. Set this to 0 and all markers will be kept regardless of HWE p value | |
91 - **MAF** is the threshold for minor allele frequency - SNPs with lower MAF will be excluded | |
92 - **Filters** for founders/offspring or affected/unaffected or males/females are optionally available if needed | |
93 - **Change Affection** is only needed if you want to change the affection status for creating new analysis datasets | |
94 | |
95 ----- | |
96 | |
97 **Attribution** | |
98 | |
99 This tool relies on the work of many people. | |
100 It uses Plink http://pngu.mgh.harvard.edu/~purcell/plink/, | |
101 and the R http://cran.r-project.org/ and | |
102 Bioconductor http://www.bioconductor.org/ projects. | |
103 | |
104 In particular, http://pngu.mgh.harvard.edu/~purcell/plink/ | |
105 has excellent documentation describing the parameters you can set here. | |
106 | |
107 This implementation is a Galaxy tool wrapper around these third party applications. | |
108 It was originally designed and written for family based data from the CAMP Illumina run of 2007 by | |
109 Ross Lazarus (ross.lazarus@gmail.com) and incorporated into the rgenetics toolkit. | |
110 | |
111 Rgenetics merely exposes them, wrapping Plink so you can use it in Galaxy. | |
112 | |
113 ----- | |
114 | |
115 **Summary** | |
116 | |
117 Reliable statistical inference depends on reliable data. Poor quality samples and markers | |
118 may add more noise than signal, decreasing statistical power. Removing the worst of them | |
119 can be done by setting thresholds for some of the commonly used technical quality measures | |
120 for genotype data. Of course discordant replicate calls are also very informative but are not | |
121 in scope here. | |
122 | |
123 Marker cleaning: Filters are available to remove markers below a minor allele | |
124 frequency threshold, beyond a Hardy-Weinberg threshold, | |
125 or above a threshold for missingness. If family data are available, thresholds for Mendelian | |
126 error can be set. | |
127 | |
128 Subject cleaning: Filters are available to remove subjects with many missing calls. Subjects and markers for family data can be filtered by proportions | |
129 of Mendelian errors in observed transmission. Use the QC reporting tool to | |
130 generate a comprehensive series of reports for quality control. | |
131 | |
132 Note that ancestry and cryptic relatedness should also be checked using the relevant tools. | |
133 | |
134 ----- | |
135 | |
136 .. class:: infomark | |
137 | |
138 **Tip** | |
139 | |
140 You can check that you got what you asked for by running the QC tool to ensure that the distributions | |
141 are truncated the way you expect. Note that you should not expect the thresholds to be exactly | |
142 what you set - some bad assays and subjects fail multiple QC measures, so you sometimes have | |
143 more samples or markers than you exactly set for each threshold. Finally, the ordering of | |
144 operations matters and Plink is somewhat restrictive about what it will do on each pass | |
145 of the data. At least it's fixed. | |
146 | |
147 ----- | |
148 | |
149 This Galaxy tool was written by Ross Lazarus for the Rgenetics project. | |
150 It uses Plink for most calculations, plus some custom Python code - for full Plink attribution, | |
151 source code and documentation, please see http://pngu.mgh.harvard.edu/~purcell/plink/ | |
152 | |
153 </help> | |
154 </tool> |