Mercurial > repos > xuebing > sharplabtool
comparison tools/rgenetics/rgLDIndep.xml @ 0:9071e359b9a3
Uploaded
author | xuebing |
---|---|
date | Fri, 09 Mar 2012 19:37:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9071e359b9a3 |
---|---|
1 <tool id="rgLDIndep1" name="LD Independent:"> | |
2 <code file="rgLDIndep_code.py"/> | |
3 | |
4 <description>filter high LD pairs - decrease redundancy</description> | |
5 <!-- Runs rgLDIndep.py under the python interpreter with 14 positional arguments: the input dataset's extra_files_path and base_name, the title, the mind/geno/hwe/maf/mef/mei filter values, the output file and its files_path, then window, step and r2. --> | |
6 <command interpreter="python"> | |
7 rgLDIndep.py '$input_file.extra_files_path' '$input_file.metadata.base_name' '$title1' '$mind' | |
8 '$geno' '$hwe' '$maf' '$mef' '$mei' '$out_file1' | |
9 '$out_file1.files_path' '$window' '$step' '$r2' | |
10 </command> | |
11 <!-- Form parameters in display order: input dataset and output title, LD pruning settings (r2, window, step), then the marker/subject quality filters. --> | |
12 <inputs> | |
13 <param name="input_file" type="data" format="pbed" size="80" | |
14 label="RGenetics genotype data from your current history"/> | |
15 <param name="title1" type="text" size="80" value="LD_Independent" label="Descriptive title for cleaned genotype file"/> | |
16 <param name="r2" type="float" value="0.1" | |
17 label="r2 threshold: Select only pairs at or below this r^2 threshold (eg 0.1)" | |
18 help="LD threshold defining LD independent markers"/> | |
19 <param name="window" type="integer" value="40" label="Window: Window size to limit LD pairwise" | |
20 help="Bigger is better but time taken blows up exponentially as the window grows!"/> | |
21 <param name="step" type="integer" value="30" label="Step: Move window this far and recompute" | |
22 help="Smaller is better but of course, time increases..."/> | |
23 <param name="geno" type="float" value="1.0" label="Maximum Missing Fraction: Markers"/> | |
24 <param name="mind" type="float" value="1.0" label="Maximum Missing Fraction: Subjects"/> | |
25 <param name="mef" type="float" value="1.0" label="Maximum Mendel Error Rate: Family"/> | |
26 <param name="mei" type="float" value="1.0" label="Maximum Mendel Error Rate: Marker"/> | |
27 <param name="hwe" type="float" value="0.0" label="Smallest HWE p value (set to 0 for all)"/> | |
28 <param name="maf" type="float" value="0.0" | |
29 label="Smallest Allowable Minor Allele Frequency (set to 0.0 for all)"/> | |
30 | |
31 </inputs> | |
32 <!-- One pbed output dataset; metadata_source points at input_file. --> | |
33 <outputs> | |
34 <data name="out_file1" format="pbed" metadata_source="input_file"/> | |
35 </outputs> | |
36 <tests> | |
37 <test> | |
38 <!-- Runs the tool on the tinywga pbed composite dataset and checks the pbed output together with its .bim, .fam and .bed extra files. --> | |
39 <param name="input_file" value="tinywga" ftype="pbed"> | |
40 <metadata name="base_name" value="tinywga"/> | |
41 <composite_data value="tinywga.bim"/> | |
42 <composite_data value="tinywga.bed"/> | |
43 <composite_data value="tinywga.fam"/> | |
44 <edit_attributes type="name" value="tinywga"/> | |
45 </param> | |
46 <param name="title1" value="rgLDIndeptest1"/> | |
47 <param name="mind" value="1"/> | |
48 <param name="geno" value="1"/> | |
49 <param name="hwe" value="0"/> | |
50 <param name="maf" value="0"/> | |
51 <param name="mef" value="1"/> | |
52 <param name="mei" value="1"/> | |
53 <param name="window" value="10000"/> | |
54 <param name="step" value="5000"/> | |
55 <param name="r2" value="0.1"/> | |
56 <output name="out_file1" file="rgtestouts/rgLDIndep/rgLDIndeptest1.pbed" ftype="pbed" compare="diff" lines_diff="7"> | |
57 <extra_files type="file" name="rgLDIndeptest1.bim" value="rgtestouts/rgLDIndep/rgLDIndeptest1.bim" compare="sim_size" delta="1000"/> | |
58 <extra_files type="file" name="rgLDIndeptest1.fam" value="rgtestouts/rgLDIndep/rgLDIndeptest1.fam" compare="diff"/> | |
59 <extra_files type="file" name="rgLDIndeptest1.bed" value="rgtestouts/rgLDIndep/rgLDIndeptest1.bed" compare="sim_size" delta="1000"/> | |
60 </output> | |
61 </test> | |
62 </tests> | |
63 <help> | |
64 | |
65 .. class:: infomark | |
66 | |
67 **Attribution** | |
68 | |
69 This tool relies on Plink from Shaun Purcell. For full documentation, please see his web site | |
70 at http://pngu.mgh.harvard.edu/~purcell/plink/ where there is excellent documentation describing | |
71 the parameters you can set here. | |
72 | |
73 Rgenetics merely exposes them, wrapping Plink so you can use it in Galaxy. | |
74 | |
75 **Summary** | |
76 | |
77 In addition to filtering some marker and sample quality measures, | |
78 this tool reduces the amount of overlapping information, by removing | |
79 most of the duplicate information contained in linkage disequilibrium. This is | |
80 a lossy process and for some methods, signal may be lost. However, this makes | |
81 the dataset far more compact (eg 10% of the original storage size) while still | |
82 being highly informative and less biased for some (note NOT all!) statistical methods. | |
83 This is the Clean tool with additional data reduction via Plink LD pruning. | |
84 Use the Clean tool if you don't want LD pruning - which you don't for most statistical testing. | |
85 For ancestry and relatedness, you may well want LD pruned data as it has | |
86 some specific desirable properties. | |
87 | |
88 **LD** | |
89 | |
90 Pairwise Linkage disequilibrium (LD) measures the extent to which the genotype at one locus | |
91 predicts the state of another locus at the level of an entire population. | |
92 When population LD between a pair of markers is high, | |
93 knowing an individual's genotype at one locus allows confident prediction of the genotype at the other. | |
94 In other words, high LD means information redundancy between markers. For some | |
95 purposes, removing some of this redundancy can improve the performance of some analyses. | |
96 Executing this tool will create a new genotype dataset in your current history containing | |
97 LD independent markers - most of the genetic information is retained but without as much redundancy. | |
98 | |
99 Set a pairwise LD threshold (eg r^2 = 0.2) and the (smaller) resulting dataset will have no | |
100 pairs of markers with r^2 greater than 0.2. Additional filters are available to remove markers | |
101 below a specific minor allele frequency, or above a specific level of missingness, | |
102 and to remove subjects using similar criteria. Subjects and markers for family data can be | |
103 filtered by proportions of Mendelian errors in observed transmission. | |
104 | |
105 ----- | |
106 | |
107 **Syntax** | |
108 | |
109 - **Genotype data** is the input pedfile chosen from available library files | |
110 - **New name** is the name to use for the filtered output file | |
111 - **Missfrac threshold: subjects** is the threshold for missingness by subject. Subjects with more than this fraction missing will be excluded from the import | |
112 - **Missfrac threshold: markers** is the threshold for missingness by marker. Markers with more than this fraction missing will be excluded from the import | |
113 - **MaxMendel Individuals** Mendel error fraction above which to exclude subjects with more than the specified fraction of mendelian errors in transmission (for family data only) | |
114 - **MaxMendel Families** Mendel error fraction above which to exclude families with more than the specified fraction of mendelian errors in transmission (for family data only) | |
115 - **HWE** is the threshold for HWE test p values below which the marker will not be imported. Set this to 0 and all markers will be imported regardless of HWE p value | |
116 - **MAF** is the threshold for minor allele frequency - SNPs with lower MAF will be excluded | |
117 - **r^2** is the pairwise LD threshold as r^2. Lower -> less marker redundancy -> fewer markers | |
118 - **Window** is the window width for LD threshold. Bigger -> slower -> more complete | |
119 - **Step** is the distance to move the window along the genome. Should be window or less. | |
120 | |
121 ----- | |
122 | |
123 **Disclaimer** | |
124 | |
125 This tool relies on Plink from Shaun Purcell. For full documentation, please see his web site | |
126 at http://pngu.mgh.harvard.edu/~purcell/plink/ where there is excellent documentation describing | |
127 the parameters you can set here. Rgenetics merely exposes them, and wraps Plink so you can use it in Galaxy. | |
128 | |
129 This tool is designed to create genotype data files with more or less LD independent sets of markers. These | |
130 reduced genotype data files are particularly useful for purposes such as evaluating | |
131 ancestry (eg eigenstrat) or relatedness (eg rgGRR) | |
132 | |
133 LD pruning decreases redundancy among the genotype data by removing one of each pair of markers | |
134 in strong LD (above the r^2 threshold) over successive genomic windows (the Window parameter), | |
135 advancing the window by the Step parameter between windows. The defaults should produce useable outputs. | |
136 | |
137 This might be more efficient for rgGRR and | |
138 eigenstrat...The core quote is | |
139 | |
140 "This generates the same output files as the first version; | |
141 the only difference is that a simple pairwise threshold is used. | |
142 The first two parameters (50 and 5) are the same as above (window size and step); | |
143 the third parameter represents the r^2 threshold. | |
144 Note: this represents the pairwise SNP-SNP metric now, not the | |
145 multiple correlation coefficient; also note, this is based on the | |
146 genotypic correlation, i.e. it does not involve phasing. | |
147 " | |
148 | |
149 ----- | |
150 | |
151 | |
152 | |
153 This Galaxy tool was written by Ross Lazarus for the Rgenetics project. | |
154 It uses Plink for most calculations, plus some custom Python code. For full Plink attribution, | |
155 source code and documentation, please see http://pngu.mgh.harvard.edu/~purcell/plink/ | |
156 | |
157 </help> | |
158 </tool> |