annotate pca_pipeline_def.xml @ 0:cb54350e76ae draft default tip

Uploaded
author jason-ellul
date Wed, 01 Jun 2016 03:24:56 -0400
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
1 <tool id="pca_pipeline" name="PCA Pipeline" version="1.0.0">
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
2 <description>Iterative PCA pipeline</description>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
3 <requirements>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
4 <requirement type="package" version="2.8">Jinja2</requirement>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
5 <!--
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
6 <requirement type="package" version="3.2.1">R</requirement>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
7 <requirement type="package" version="1.2.5">flashpcaR</requirement>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
8 -->
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
9 </requirements>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
10 <command interpreter="python">
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
11 <![CDATA[
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
12 iterative_pca.py
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
13 $datafile
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
14 $data_type
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
15 $iterations
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
16 --sd_cutoff $sd_cutoff
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
17 --absolute_prefix $output.files_path
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
18 --html_file $output
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
19 #if $control_tag
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
20 --control_tag $control_tag
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
21 #end if
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
22 #if $cases_tag
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
23 --cases_tag $cases_tag
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
24 #end if
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
25 #if $data_type.value == "variant_data"
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
26 #if $user_configfile
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
27 --config_file $user_configfile
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
28 #else
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
29 --config_file $cfile
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
30 #end if
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
31 #end if
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
32 #if $clustering_flag.value == "yes"
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
33 --clustering_flag
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
34 --clustering_method $clustering_method
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
35 --cluster_trimming $cluster_trimming
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
36 #end if
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
37 #if $ethnicity_file
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
38 --ethnicity_file $ethnicity_file
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
39 #end if
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
40 #if $xsamples_file
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
41 --reject_samples $xsamples_file
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
42 #end if
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
43 #if $xsnps_file
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
44 --reject_snps $xsnps_file
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
45 #end if
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
46 --galaxy
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
47 ]]>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
48 </command>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
49 <configfiles>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
50 <configfile name="cfile">#control
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
51 control_tag,#Sample,$control_tag
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
52 cases_tag,#Sample,$cases_tag
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
53 #column_names
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
54 genotype_column,$genotype_column
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
55 reference_column,$reference_column
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
56 alternate_column,$alternate_column
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
57 sample_id_column,$sample_id_column
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
58 chromosome_column,$chromosome_column
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
59 position_column,$position_column
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
60 variant_id_column,$variant_id_column
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
61 #numeric_filters
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
62 #for $i, $f in enumerate($numeric_filters)
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
63 $f.filter_name,$f.column_name,$f.operation,$f.cutoff
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
64 #end for
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
65 #string_filters
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
66 #for $i, $s in enumerate($string_filters)
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
67 $s.filter_name,$s.column_name,$s.exact_flag,$s.accept_flag
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
68 $(','.join($s.patterns.split('\n')))
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
69 #end for</configfile>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
70 </configfiles>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
71 <inputs>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
72 <param name="datafile" type="data" label="Input datafile"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
73 <param name="data_type" type="select" label="Type of input data file">
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
74 <option value="variant_data">Variant Data</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
75 <option value="numeric_ped">Numeric PED File</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
76 <option value="rdata">RData file</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
77 </param>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
78 <param name="iterations" type="integer" value="1" label="Number of iterations to complete"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
79 <param name="clustering_flag" type="select" display="radio" label="Do clustering?">
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
80 <option value="yes">Yes</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
81 <option value="no">No</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
82 </param>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
83 <param name="clustering_method" type="select" label="Clustering method (ignore if you selected 'No' for 'Do clustering?')">
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
84 <option value="dbscan">DBSCAN</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
85 <option value="hclust">Hierarchical Clustering</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
86 </param>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
87 <param name="cluster_trimming" type="select" label="Algorithm used to identify and remove cluster outliers (ignore if you selected 'No' for 'Do clustering?')">
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
88 <option value="sd">Standard Deviations</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
89 <option value="mcd">Mean Cluster Distance</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
90 <option value="dbscan_outliers_only">DBSCAN outliers only (Only valid if DBSCAN is selected as 'Algorithm used to find clusters'</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
91 </param>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
92 <param name="sd_cutoff" type="float" value="2" label="Strictness of outlier trimming. Lower = more outliers cut at each stage, Higher = less outliers cut at each stage."/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
93 <!-- Control and cases tag info -->
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
94 <param name="control_tag" type="text" value="LP" label="Control Tag"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
95 <param name="cases_tag" type="text" value="HAPS" label="Cases Tag"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
96 <param name="user_configfile" type="data" format="txt" optional="true" label="Optional user provided config file.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
97 NB:
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
98 - If this is set, ALL the fields below will be ignored.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
99 - If no input is provided, and the input data is a text file containing variant data, ALL the fields below except the filters MUST be filled in"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
100 <param name="ethnicity_file" type="data" format="txt" optional="true" label="Optional file containing data about ethnicity of samples"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
101 <param name="xsamples_file" type="data" format="txt" optional="true" label="Optional file containing EXACT ids of samples to exclude"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
102 <param name="xsnps_file" type="data" format="txt" optional="true" label="Optional file containing EXACT ids of SNPs to exclude"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
103 <!-- Column headers -->
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
104 <param name="sample_id_column" type="text" value="#Sample" label="Sample ID Column"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
105 <param name="variant_id_column" type="text" value="ID" label="Variant ID Column"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
106 <param name="chromosome_column" type="text" value="CHROM" label="Chromosome Column"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
107 <param name="position_column" type="text" value="POS" label="Position Column"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
108 <param name="genotype_column" type="text" value="GT" label="Genotype Column"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
109 <param name="reference_column" type="text" value="REF" label="Reference Allele Column"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
110 <param name="alternate_column" type="text" value="ALT" label="Alternate Allele Column"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
111 <!-- Numeric Filters -->
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
112 <repeat name="numeric_filters" title="Optional Numeric Filters">
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
113 <param name="filter_name" type="text" label="Filter Name"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
114 <param name="column_name" type="text" label="Name of column to filter on"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
115 <param name="operation" type="select" label="Accept if column value is:">
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
116 <option value="g">greater than</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
117 <option value="l">less than</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
118 <option value="e">equal to</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
119 <option value="ge">greater than or equal to</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
120 <option value="le">less than or equal to</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
121 </param>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
122 <param name="cutoff" type="float" value="0" label="Cutoff Value"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
123 </repeat>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
124 <!-- String Filters -->
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
125 <repeat name="string_filters" title="Optional String Filters">
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
126 <param name="filter_name" type="text" label="Filter Name"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
127 <param name="column_name" type="text" label="Name of column to filter on"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
128 <param name="exact_flag" type="select" label="Exact pattern matching?">
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
129 <option value="exact">Yes</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
130 <option value="not_exact">No</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
131 </param>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
132 <param name="accept_flag" type="select" label="Action to perform after a successful match">
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
133 <option value="accept">Accept</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
134 <option value="reject">Reject</option>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
135 </param>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
136 <param name="patterns" type="text" area="true" size="10x35" label="Patterns to match on" help="Enter a list of patterns here, separated by newlines"/>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
137 </repeat>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
138 </inputs>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
139 <outputs>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
140 <data name="output" format="html">
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
141 <label>PCA summary: "${datafile.name}"</label>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
142 </data>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
143 </outputs>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
144 <help><![CDATA[
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
145 .. class:: warningmark
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
146
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
147 **WARNING** This tool requires the 'dbscan' (https://cran.r-project.org/web/packages/dbscan/index.html) and 'flashpcaR' (https://github.com/gabraham/flashpca/releases) R packages to be installed on the Galaxy instance.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
148
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
149 ======================================
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
150 Principal Component Analysis Pipeline
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
151 ======================================
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
152
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
153 Overview
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
154 --------
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
155 A tool which performs iterative principal component analysis.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
156 The general idea is to separate patient samples based on their ethnicity, by performing PCA on the variant data of each sample.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
157 After each analysis step, outliers are identified. The PCA is then repeated, with the outliers removed.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
158 This process continues for a set number of iterations specified by the user. After the pipeline completes, the user can see a
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
159 detailed summary, as well as have access to the outliers identified at each iteration.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
160
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
161 Primary Input
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
162 ---------------
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
163 As primary input the tool accepts a single file, which may be formatted in the following ways:
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
164
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
165 - **Variant data file:** This should be a tab-delimited text file, with each row containing data about a single variant site from a single person. If this option is selected, the column names which contain important information must also be specified, either via a configuration file (see below), or through the tool's form fields.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
166 - **Numeric ped file:** See http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml for detailed information on PED format. This tool requires the affection status of each site to be specified numerically i.e.:
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
167
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
168 - 0 = homozygous reference
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
169 - 1 = heterozygous
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
170 - 2 = homozygous alternate
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
171
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
172 rather than consisting of pairs of genotypes for each site.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
173 - **RData file:** File containing stored data from an R session. For this tool the input must meet certain requirements:
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
174
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
175 - The file can only contain a SINGLE R object, which must be a list.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
176 - The list must contain a named 'bed' element.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
177 - The 'bed' element must be an n x m matrix/data frame, where n = number of samples, m = number of unique snps found in all the samples.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
178 - The A(i,j)th entry in the 'bed' matrix should indicate affection status of the ith sample at the jth SNP site, according to the key for numeric ped files (as above).
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
179 - The row names of the 'bed' matrix must contain the ids of the samples.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
180 - The column names of the 'bed' matrix must contain the ids of the SNPs.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
181
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
182 If these very specific criteria are not met, the tool WILL fail.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
183
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
184
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
185
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
186 Primary Output
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
187 ---------------
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
188
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
189 HTML file containing plots of the PCA for each iteration.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
190 Possible plots, depending on user specified options:
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
191
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
192 - **Control vs Cases Plot:** If control and/or cases tags are provided, this plot will be output. ALL samples are plotted, with controls shown in blue, cases in red, unknown samples in black.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
193 - **Cluster Plot:** Output if user opts to do clustering. Samples are plotted, with clusters colour-coded. Outliers as identified by DBSCAN are always red and use an open circle as the icon. Trimmed clusters use a cross for the icon, instead of a circle. Both the outliers (open circles) AND the rejected clusters (crosses) will be dropped in the next iteration.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
194 - **Outliers Plot:** Output if user does NOT opt to do clustering. Samples which are considered outliers (as described above in 'Detecting outliers without clustering') are plotted as red open circles; all other samples are plotted as green full circles.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
195 - **Standard Deviations Plot:** Samples are colour-coded by standard deviation. Samples which fall within 1 standard deviation of the median are red, <= 2 sds are green, <= 3 sds are blue, > 3 sds are purple.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
196 - **Ethnicity Plot:** Each ethnicity uses a specific colour and symbol. Fairly self-explanatory. Plot is only output if an ethnicity data file is provided as input.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
197
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
198 Beneath the plots there are also two expandable lists. Samples excluded shows which samples were not part of the PCA for this iteration. This is cumulative. Outliers shows the outliers detected in THIS iteration. Any available data from the ethnicity file (if provided) is also displayed for each excluded sample.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
199
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
200 Options/Secondary Inputs
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
201 ------------------------
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
202 - **Type of input data file:** A variant data text file, a numeric ped file, or an RData file, as specified above
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
203 - **Number of iterations to complete:** A single iteration would involve performing PCA on the input data, then identifying and removing outliers. Two iterations would involve performing PCA again with the outliers identified from the first iteration excluded, three iterations would exclude the outliers from the first 2 stages, and so on and so forth.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
204 - **Detecting outliers without clustering:** This is done by obtaining the standard deviations of the first two principal components. Any samples whose scores for either of these first two components fall more than 'n' number of standard deviations away from the component median are considered outliers.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
205 - **Clustering:** The user may select from a range of algorithms which will try to identify clusters in the data, with each cluster hopefully corresponding to an ethnic group.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
206 - **Clustering methods:**
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
207
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
208 - *DBSCAN (Density based spatial clustering of applications with noise):*
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
209
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
210 Forms clusters based on density of points, and does not require the number of clusters to be specified beforehand. Good for irregularly shaped, non-spherical clusters. Does NOT require all points to be part of clusters, and produces a set of 'outliers', i.e. points which do not belong to any clusters.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
211
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
212 - *Hierarchical Clustering:*
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
213
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
214 Forms clusters based on distance between points. Tends to result in spherical clusters, but able to handle clusters of varying density. Forces all points to be part of a single cluster. The number of clusters is determined separately, using the silhouette scores of all the points as a heuristic.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
215
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
216 - **Cluster trimming methods:** All these methods first involve finding the centres of each cluster.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
217
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
218 - *Standard Deviations:*
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
219
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
220 If the centroid of a cluster lies more than ‘n’ standard deviations (n is passed in as a parameter by the user) from the centroid of the entire dataset in either the x or y directions, the entire cluster is cut. If DBSCAN is selected, the outliers it identifies are also cut.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
221
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
222 - *Mean Cluster Distance:*
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
223
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
224 Obtain the average distance between clusters, done by computing the distance between all pairs of clusters and taking the mean. For each cluster, we also compute an average “isolation” value, which is the mean of the distances between that particular cluster and all other clusters. If a cluster’s isolation value is larger than the average cluster distance (multiplied by the strictness weighting), then that cluster is considered an outlier and cut from the next iteration. If DBSCAN is selected, the outliers it identifies are also cut.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
225
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
226 - *DBSCAN outliers only:*
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
227
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
228 Only cut the points identified by the DBSCAN algorithm as not belonging to any cluster. No entire clusters are cut. Obviously this method is only applicable if DBSCAN is selected as the clustering method. THE TOOL WILL NOT RUN IF YOU SELECT THIS OPTION TOGETHER WITH 'Hierarchical Clustering' AS THE CLUSTERING METHOD.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
229
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
230 - **Strictness:** A multiplier used to determine how 'strict' the outlier cutting methods are. For example, if strictness = 1, and we are not doing clustering, all points which lie more than 1 sd from the median are cut. If strictness = 2, all points which lie more than 2 sd from the median are cut, etc.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
231
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
232 - **Control Tag:** A pattern present in the ids of all the control samples, e.g. "LP"
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
233
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
234 - **Cases Tag:** A pattern present in the ids of all the cases samples, e.g. "HAPS"
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
235
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
236 - **Configuration file:** A configuration file to accompany an input variant text file. The config file has a rather specific format, an example is given below::
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
237
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
238 #control
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
239 control_tag,#Sample,LP
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
240 cases_tag,#Sample,HAPS
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
241 #column_names
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
242 genotype_column,GT
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
243 reference_column,REF
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
244 alternate_column,ALT
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
245 sample_id_column,#Sample
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
246 chromosome_column,CHROM
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
247 position_column,POS
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
248 variant_id_column,ID
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
249 #numeric_filters
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
250 strand_bias_filter,Fraction_with_strand_bias,<,0.03
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
251 position_bias_filter,Fraction_with_positional_bias,<,0.03
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
252 count_filter,Num_samples_variant,>,1
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
253 pass_filter,Fraction_samples_passed_filter,>,0.9
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
254 #string_filters
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
255 variant_type_filter,Variant_Type,exact,accept
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
256 SNV
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
257 genotype_filter,GT,exact,accept
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
258 '0/1,'1/1
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
259
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
260 File consists of up to four sections, the starts of which are marked by lines beginning with an octothorpe.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
261
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
262 - *'#control' section:* Indicates substrings found in ids of controls and cases
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
263 - *'#column_names' section:* This is the only required section. First column indicates what column name (in the variant text file) the second column specifies. The same keys (i.e. left-most column values) as shown in the example must be used, e.g. sample_id_column; the RHS column names must match the names in the variant data file. If using a generated config file, only modify the RHS column, and DO NOT REMOVE ANY rows from this section.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
264 - *'#numeric_filters' section:* Each filter takes up a single line, and is separated into 4 sections by commas.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
265
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
266 - Column 1: Name of the filter, which is arbitrary
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
267 - Column 2: The name of the column in the variant file to filter on. If this column is not found, a warning is displayed
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
268 - Column 3: The criteria of the filter which must be passed in order for us to accept a particular row. E.g. less than, greater than
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
269 - Column 4: The cutoff value to be compared against.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
270
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
271 - *'#string_filters' section:* Each filter takes up two lines.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
272
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
273 - Line 1, Column 1: Arbitrary filter name
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
274 - Line 1, Column 2: Column name to filter on
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
275 - Line 1, Column 3: Do the patterns have to be exact matches, or just substrings? E.g. if pattern = "HAPS" and the string being compared = "HAPS-909090", then with exact this would not be a successful match, whereas with not_exact it would be a match.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
276 - Line 1, Column 4: What to do with the row if a successful match is detected, e.g. accept or reject
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
277 - Line 2: A comma-separated list of patterns to match on
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
278
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
279
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
280 - **Ethnicity file:** An ethnicity file containing ethnicity data, and possibly other data, on the samples. Note this data is not used to sort the input and has no effect on the PCA itself. It is used only to label the results of the output.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
281
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
282 Requirements:
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
283
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
284 - tab delimited
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
285 - Must have at least two columns
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
286 - First column has sample IDs
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
287 - Second column has ethnicities
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
288 - First row must be a header
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
289
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
290 First few lines of a correctly formatted ethnicity file given below::
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
291
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
292 IID population Halo1.or.2. BloodAge SalivaAge COB ethnicity
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
293 LP-10000001 AUSTRALIAN Halo2 - LP-BC 67 NA Australia australian
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
294 LP-10000003 AUSTRALIAN Halo1 45 NA Australia australian southern_european
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
295 LP-10000005 AUSTRALIAN Halo1 73 NA Australia australian southern_european
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
296 LP-10000008 EUROPE Halo1 54 NA South Eastern Europe south_east_european
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
297 LP-10000009 OTHER Halo1 65 NA Southern & East Africa jewish
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
298
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
299 - **Exclude samples file:** A text file containing exact ids of samples to exclude from the PCA.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
300
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
301 Requirements:
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
302
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
303 - single column
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
304 - sample ids separated by newlines
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
305 - one sample id per line
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
306
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
307 Example::
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
308
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
309 HAPS-90573
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
310 HAPS-90578R
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
311 HAPS-110542
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
312 HAPS-110605
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
313 HAPS-110620
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
314 HAPS-110638
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
315 HAPS-110649
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
316 HAPS-110668
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
317 HAPS-110799
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
318 HAPS-110813
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
319 HAPS-110959
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
320 HAPS-111186
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
321 HAPS-111298
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
322 HAPS-111404
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
323 HAPS-111493
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
324 HAPS-111512
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
325 HAPS-111538
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
326
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
327 - **Exclude SNPS file:** A text file containing exact ids of SNPs to exclude from the PCA.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
328
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
329 Requirements:
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
330
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
331 - single column
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
332 - SNP ids separated by newlines
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
333 - one snp id per line
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
334
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
335 Example::
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
336
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
337 rs72896283
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
338 rs7534447
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
339 rs4662775
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
340 rs10932813
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
341 rs10932816
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
342 rs12330369
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
343 rs1802904
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
344 rs10902762
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
345 rs9996817
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
346 rs6446393
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
347 rs871133
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
348 rs4301095
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
349 rs941849
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
350 rs6917467
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
351 rs75834296
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
352 rs142922667
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
353
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
354 - **Required Column Headers:** If a variant text file is the primary input, the following information MUST be provided, either through the config file, or by filling out the corresponding fields in the tool submission form.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
355
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
356 - Sample IDs: Name of the column containing the sample ids
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
357 - Chromosome: Name of the column indicating what chromosome the SNP is found on
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
358 - Position: Name of the column indicating at which position on the chromosome the SNP is found
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
359 - Genotype: The genotype of the sample for this site
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
360 - Reference: The 'normal'/'common' genotype for this site
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
361 - Alternate: The alternate genotype for this site
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
362 - Variant IDs: Name of the column indicating the ID of the SNP
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
363
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
364 - **Numeric Filters:** See Configuration file section
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
365 - **String Filters:** See Configuration file section
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
366
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
367 Other Output
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
368 -------------
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
369
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
370 - The tool will output a root folder containing the HTML file and all the plots, placed in directories separated by iteration.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
371 - If the input data was a variant file, the output folder will also contain a numeric ped file, generated before the first iteration, as well as a config file. The config file is either the exact one passed in by the user, or one automatically generated from the form input, which can be used for future PCA runs.
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
372
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
373 ]]>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
374
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
375
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
376 </help>
cb54350e76ae Uploaded
jason-ellul
parents:
diff changeset
377 </tool>