comparison shm_csr.xml @ 56:ee807645b224 draft

Uploaded
author davidvanzessen
date Mon, 17 Jul 2017 10:44:40 -0400
parents 3be28ac82909
children 1a8e1dd21b16
comparison
equal deleted inserted replaced
55:6cd12c71c3d3 56:ee807645b224
1 <tool id="shm_csr" name="SHM &amp; CSR pipeline" version="1.0"> 1 <tool id="shm_csr" name="SHM &amp; CSR pipeline" version="1.0">
2 <description></description> 2 <description></description>
3 <requirements>
4 <requirement type="package" version="3.1_3">r-seqinr</requirement>
5 <requirement type="package" version="2.2.0">r-ggplot2</requirement>
6 <requirement type="package" version="1.4.2">r-reshape2</requirement>
7 <requirement type="package" version="0.4.1">r-scales</requirement>
8 <requirement type="package" version="1.10.0">r-data.table</requirement>
9 </requirements>
3 <command interpreter="bash"> 10 <command interpreter="bash">
4 #if str ( $filter_unique.filter_unique_select ) == "remove": 11 #if str ( $filter_unique.filter_unique_select ) == "remove":
5 wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select $filter_unique.filter_unique_clone_count $class_filter_cond.class_filter $empty_region_filter $fast 12 wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select $filter_unique.filter_unique_clone_count $class_filter_cond.class_filter $empty_region_filter $fast
6 #else: 13 #else:
7 wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select 2 $class_filter_cond.class_filter $empty_region_filter $fast 14 wrapper.sh $in_file custom $out_file $out_file.files_path "${in_file.name}" "-" $functionality $unique $naive_output_cond.naive_output $naive_output_ca $naive_output_cg $naive_output_cm $naive_output_ce $naive_output_all $filter_unique.filter_unique_select 2 $class_filter_cond.class_filter $empty_region_filter $fast
8 #end if 15 #end if
9 </command> 16 </command>
10 <inputs> 17 <inputs>
11 <param name="in_file" type="data" label="IMGT zip file to be analysed" /> 18 <param name="in_file" type="data" format="data" label="IMGT zip file to be analysed" />
12 <param name="empty_region_filter" type="select" label="Sequence starts at" help="" > 19 <param name="empty_region_filter" type="select" label="Sequence starts at" help="" >
13 <option value="leader" selected="true">Leader: include FR1, CDR1, FR2, CDR2, FR3 in filters</option> 20 <option value="leader" selected="true">Leader: include FR1, CDR1, FR2, CDR2, FR3 in filters</option>
14 <option value="FR1" selected="true">FR1: include CDR1,FR2,CDR2,FR3 in filters</option> 21 <option value="FR1" selected="true">FR1: include CDR1,FR2,CDR2,FR3 in filters</option>
15 <option value="CDR1">CDR1: include FR2,CDR2,FR3 in filters</option> 22 <option value="CDR1">CDR1: include FR2,CDR2,FR3 in filters</option>
16 <option value="FR2">FR2: include CDR2,FR3 in filters</option> 23 <option value="FR2">FR2: include CDR2,FR3 in filters</option>
27 <option value="no">No</option> 34 <option value="no">No</option>
28 </param> 35 </param>
29 <when value="remove"> 36 <when value="remove">
30 <param name="filter_unique_clone_count" size="4" type="integer" label="How many sequences should be in a group to keep 1 of them" value="2" min="2"/> 37 <param name="filter_unique_clone_count" size="4" type="integer" label="How many sequences should be in a group to keep 1 of them" value="2" min="2"/>
31 </when> 38 </when>
39 <when value="keep"></when>
40 <when value="no"></when>
32 </conditional> 41 </conditional>
33 <param name="unique" type="select" label="Remove duplicates based on" help="" > 42 <param name="unique" type="select" label="Remove duplicates based on" help="" >
34 <option value="VGene,CDR3.IMGT.AA,best_match_class">Top.V.Gene, CDR3 (AA), C region</option> 43 <option value="VGene,CDR3.IMGT.AA,best_match_class">Top.V.Gene, CDR3 (AA), C region</option>
35 <option value="VGene,CDR3.IMGT.AA">Top.V.Gene, CDR3 (AA)</option> 44 <option value="VGene,CDR3.IMGT.AA">Top.V.Gene, CDR3 (AA)</option>
36 <option value="CDR3.IMGT.AA,best_match_class">CDR3 (AA), C region</option> 45 <option value="CDR3.IMGT.AA,best_match_class">CDR3 (AA), C region</option>
49 <option value="70_0">>70% class</option> 58 <option value="70_0">>70% class</option>
50 <option value="60_0">>60% class</option> 59 <option value="60_0">>60% class</option>
51 <option value="19_0">>19% class</option> 60 <option value="19_0">>19% class</option>
52 <option value="101_101">Do not assign (sub)class</option> 61 <option value="101_101">Do not assign (sub)class</option>
53 </param> 62 </param>
63 <when value="70_70"></when>
64 <when value="60_55"></when>
65 <when value="70_0"></when>
66 <when value="60_0"></when>
67 <when value="19_0"></when>
68 <when value="101_101"></when>
54 </conditional> 69 </conditional>
55 <conditional name="naive_output_cond"> 70 <conditional name="naive_output_cond">
56 <param name="naive_output" type="select" label="Output new IMGT archives per class into your history?"> 71 <param name="naive_output" type="select" label="Output new IMGT archives per class into your history?">
57 <option value="yes">Yes</option> 72 <option value="yes">Yes</option>
58 <option value="no" selected="true">No</option> 73 <option value="no" selected="true">No</option>
59 </param> 74 </param>
75 <when value="yes"></when>
76 <when value="no"></when>
60 </conditional> 77 </conditional>
61 <param name="fast" type="select" label="Fast" help="Skips generating the new ZIP files and Change-O/Baseline" > 78 <param name="fast" type="select" label="Fast" help="Skips generating the new ZIP files and Change-O/Baseline" >
62 <option value="yes">Yes</option> 79 <option value="yes">Yes</option>
63 <option value="no" selected="true">No</option> 80 <option value="no" selected="true">No</option>
64 </param> 81 </param>
84 <data format="imgt_archive" name="naive_output_all" label = "Filtered IMGT all: ${in_file.name}" > 101 <data format="imgt_archive" name="naive_output_all" label = "Filtered IMGT all: ${in_file.name}" >
85 <filter>naive_output_cond['naive_output'] == "yes"</filter> 102 <filter>naive_output_cond['naive_output'] == "yes"</filter>
86 <filter>class_filter_cond['class_filter'] == "101_101"</filter> 103 <filter>class_filter_cond['class_filter'] == "101_101"</filter>
87 </data> 104 </data>
88 </outputs> 105 </outputs>
106 <tests>
107 <test>
108 <param name="fast" value="yes"/>
109 <output name="out_file" file="test1.html"/>
110 </test>
111 </tests>
112 <help>
113 <![CDATA[
114 **References**
115
116 Yaari, G. and Uduman, M. and Kleinstein, S. H. (2012). Quantifying selection in high-throughput Immunoglobulin sequencing data sets. In *Nucleic Acids Research, 40 (17), pp. e134–e134.* [`doi:10.1093/nar/gks457`_]
117
118 .. _doi:10.1093/nar/gks457: http://dx.doi.org/10.1093/nar/gks457
119
120 Gupta, Namita T. and Vander Heiden, Jason A. and Uduman, Mohamed and Gadala-Maria, Daniel and Yaari, Gur and Kleinstein, Steven H. (2015). Change-O: a toolkit for analyzing large-scale B cell immunoglobulin repertoire sequencing data: Table 1. *In Bioinformatics, 31 (20), pp. 3356–3358.* [`doi:10.1093/bioinformatics/btv359`_]
121
122 .. _doi:10.1093/bioinformatics/btv359: http://dx.doi.org/10.1093/bioinformatics/btv359
123
124 -----
125
126 **Input files**
127
128 IMGT/HighV-QUEST .zip and .txz are accepted as input files. The file to be analysed can be selected using the dropdown menu.
129
130 .. class:: infomark
131
132 Note: Files can be uploaded by using “get data” and “upload file” and selecting “IMGT archive” as a file type. Special characters should be avoided in the file names of the uploaded samples as these can give errors when running the immune repertoire pipeline. Underscores are allowed in the file names.
133
134 -----
135
136 **Sequence starts at**
137
138 Identifies the region which will be included in the analysis (analysed region)
139
140 - Sequences which are missing a gene region (FR1/CDR1 etc) in the analysed region are excluded.
141 - Sequences containing an ambiguous base in the analysed region or the CDR3 are excluded.
142 - All other filtering/analysis is based on the analysed region.
143
144 -----
145
146 **Functionality filter**
147
148 Allows filtering on productive rearrangements, unproductive rearrangements or both based on the assignment provided by IMGT.
149
150 **Filter unique sequences**
151
152 *Remove unique:*
153
154
155 This filter consists of two different steps.
156
157 Step 1: removes all sequences of which the nucleotide sequence in the “analysed region” and the CDR3 (see sequence starts at filter) occurs only once. (Sub)classes are not taken into account in this filter step.
158
159 Step 2: removes all duplicate sequences (sequences with the exact same nucleotide sequence in the analysed region, the CDR3 and the same (sub)class).
160
161 .. class:: infomark
162
163 This means that sequences with the same nucleotide sequence but a different (sub)class will be included in the results of both (sub)classes.
164
165 *Keep unique:*
166
167 Removes all duplicate sequences (sequences with the exact same nucleotide sequence in the analysed region and the same (sub)class).
168
169 Example of the sequences that are included using either the “remove unique filter” or the “keep unique filter”
170
171 +--------------------------+
172 | unique filter |
173 +--------+--------+--------+
174 | values | remove | keep |
175 +--------+--------+--------+
176 | A | A | A |
177 +--------+--------+--------+
178 | A | B | B |
179 +--------+--------+--------+
180 | B | D | C |
181 +--------+--------+--------+
182 | B | | D |
183 +--------+--------+--------+
184 | C | | |
185 +--------+--------+--------+
186 | D | | |
187 +--------+--------+--------+
188 | D | | |
189 +--------+--------+--------+
190
191 -----
192
193 **Remove duplicates based on**
194
195 Allows the selection of a single sequence per clone. Different definitions of a clone can be chosen.
196
197 .. class:: infomark
198
199 Note: The first sequence (in the data set) of each clone is always included in the analysis. When the first sequence is unmatched (no subclass assigned) the first matched sequence will be included. This means that altering the data order (by for instance sorting) can change the sequence which is included in the analysis and therefore slightly influence the results.
200
201 -----
202
203 **Human Class/Subclass filter**
204
205 .. class:: warningmark
206
207 Note: This filter should only be applied when analysing human IGH data in which a (sub)class specific sequence is present. Otherwise please select the do not assign (sub)class option to prevent errors when running the pipeline.
208
209 The class percentage is based on the ‘chunk hit percentage’ (see below). The subclass percentage is based on the ‘nt hit percentage’ (see below).
210
211 The SHM & CSR pipeline identifies human Cµ, Cα, Cγ and Cε constant genes by dividing the reference sequences for the subclasses (NG_001019) into 8-nucleotide chunks which overlap by 4 nucleotides. These overlapping chunks are then individually aligned in the right order to each input sequence. This alignment is used to calculate the chunk hit percentage and the nt hit percentage.
212
213 *Chunk hit percentage*: The percentage of the chunks that is aligned
214
215 *Nt hit percentage*: The percentage of chunks covering the subclass specific nucleotide match with the different subclasses. The most stringent filter for the subclass is 70% ‘nt hit percentage’ which means that 5 out of 7 subclass specific nucleotides for Cα or 6 out of 8 subclass specific nucleotides of Cγ should match with the specific subclass.
216 The option “>19% class” can be chosen when you are only interested in the class (Cα/Cγ/Cµ/Cɛ) of your sequences and your sequences are not long enough to assign the subclasses.
217
218 -----
219
220 **Output new IMGT archives per class into your history?**
221
222 If yes is selected, additional output files (one for each class) will be added to the history which contain information of the sequences that passed the selected filtering criteria. These files are in the same format as the IMGT/HighV-QUEST output files and therefore are also compatible with many other analysis programs, such as the Immune repertoire pipeline.
223
224 -----
225
226 **Execute**
227
228 Upon pressing execute a new analysis is added to your history (right side of the page). Initially this analysis will be grey; once the analysis has started, its colour in the history will change to yellow. When the analysis is finished it will turn green in the history. Now the analysis can be opened by clicking on the eye icon on the analysis of interest. When an analysis turns red an error has occurred when running the analysis. If you click on the analysis title additional information can be found on the analysis. In addition a bug icon appears. Here more information on the error can be found.
229
230 ]]>
231 </help>
89 <citations> 232 <citations>
90 <citation type="doi">10.1093/nar/gks457</citation> 233 <citation type="doi">10.1093/nar/gks457</citation>
91 <citation type="doi">10.1093/bioinformatics/btv359</citation> 234 <citation type="doi">10.1093/bioinformatics/btv359</citation>
92 </citations> 235 </citations>
93 <help>
94 <![CDATA[
95 **References**
96
97 Yaari, G. and Uduman, M. and Kleinstein, S. H. (2012). Quantifying selection in high-throughput Immunoglobulin sequencing data sets. In *Nucleic Acids Research, 40 (17), pp. e134–e134.* [`doi:10.1093/nar/gks457`_]
98
99 .. _doi:10.1093/nar/gks457: http://dx.doi.org/10.1093/nar/gks457
100
101 Gupta, Namita T. and Vander Heiden, Jason A. and Uduman, Mohamed and Gadala-Maria, Daniel and Yaari, Gur and Kleinstein, Steven H. (2015). Change-O: a toolkit for analyzing large-scale B cell immunoglobulin repertoire sequencing data: Table 1. *In Bioinformatics, 31 (20), pp. 3356–3358.* [`doi:10.1093/bioinformatics/btv359`_]
102
103 .. _doi:10.1093/bioinformatics/btv359: http://dx.doi.org/10.1093/bioinformatics/btv359
104
105 -----
106
107 **Input files**
108
109 IMGT/HighV-QUEST .zip and .txz are accepted as input files. The file to be analysed can be selected using the dropdown menu.
110
111 .. class:: infomark
112
113 Note: Files can be uploaded by using “get data” and “upload file” and selecting “IMGT archive” as a file type. Special characters should be avoided in the file names of the uploaded samples as these can give errors when running the immune repertoire pipeline. Underscores are allowed in the file names.
114
115 -----
116
117 **Sequence starts at**
118
119 Identifies the region which will be included in the analysis (analysed region)
120
121 - Sequences which are missing a gene region (FR1/CDR1 etc) in the analysed region are excluded.
122 - Sequences containing an ambiguous base in the analysed region or the CDR3 are excluded.
123 - All other filtering/analysis is based on the analysed region.
124
125 -----
126
127 **Functionality filter**
128
129 Allows filtering on productive rearrangements, unproductive rearrangements or both based on the assignment provided by IMGT.
130
131 **Filter unique sequences**
132
133 *Remove unique:*
134
135
136 This filter consists of two different steps.
137
138 Step 1: removes all sequences of which the nucleotide sequence in the “analysed region” and the CDR3 (see sequence starts at filter) occurs only once. (Sub)classes are not taken into account in this filter step.
139
140 Step 2: removes all duplicate sequences (sequences with the exact same nucleotide sequence in the analysed region, the CDR3 and the same (sub)class).
141
142 .. class:: infomark
143
144 This means that sequences with the same nucleotide sequence but a different (sub)class will be included in the results of both (sub)classes.
145
146 *Keep unique:*
147
148 Removes all duplicate sequences (sequences with the exact same nucleotide sequence in the analysed region and the same (sub)class).
149
150 Example of the sequences that are included using either the “remove unique filter” or the “keep unique filter”
151
152 +--------------------------+
153 | unique filter |
154 +--------+--------+--------+
155 | values | remove | keep |
156 +--------+--------+--------+
157 | A | A | A |
158 +--------+--------+--------+
159 | A | B | B |
160 +--------+--------+--------+
161 | B | D | C |
162 +--------+--------+--------+
163 | B | | D |
164 +--------+--------+--------+
165 | C | | |
166 +--------+--------+--------+
167 | D | | |
168 +--------+--------+--------+
169 | D | | |
170 +--------+--------+--------+
171
172 -----
173
174 **Remove duplicates based on**
175
176 Allows the selection of a single sequence per clone. Different definitions of a clone can be chosen.
177
178 .. class:: infomark
179
180 Note: The first sequence (in the data set) of each clone is always included in the analysis. When the first sequence is unmatched (no subclass assigned) the first matched sequence will be included. This means that altering the data order (by for instance sorting) can change the sequence which is included in the analysis and therefore slightly influence the results.
181
182 -----
183
184 **Human Class/Subclass filter**
185
186 .. class:: warningmark
187
188 Note: This filter should only be applied when analysing human IGH data in which a (sub)class specific sequence is present. Otherwise please select the do not assign (sub)class option to prevent errors when running the pipeline.
189
190 The class percentage is based on the ‘chunk hit percentage’ (see below). The subclass percentage is based on the ‘nt hit percentage’ (see below).
191
192 The SHM & CSR pipeline identifies human Cµ, Cα, Cγ and Cε constant genes by dividing the reference sequences for the subclasses (NG_001019) into 8-nucleotide chunks which overlap by 4 nucleotides. These overlapping chunks are then individually aligned in the right order to each input sequence. This alignment is used to calculate the chunk hit percentage and the nt hit percentage.
193
194 *Chunk hit percentage*: The percentage of the chunks that is aligned
195
196 *Nt hit percentage*: The percentage of chunks covering the subclass specific nucleotide match with the different subclasses. The most stringent filter for the subclass is 70% ‘nt hit percentage’ which means that 5 out of 7 subclass specific nucleotides for Cα or 6 out of 8 subclass specific nucleotides of Cγ should match with the specific subclass.
197 The option “>19% class” can be chosen when you are only interested in the class (Cα/Cγ/Cµ/Cɛ) of your sequences and your sequences are not long enough to assign the subclasses.
198
199 -----
200
201 **Output new IMGT archives per class into your history?**
202
203 If yes is selected, additional output files (one for each class) will be added to the history which contain information of the sequences that passed the selected filtering criteria. These files are in the same format as the IMGT/HighV-QUEST output files and therefore are also compatible with many other analysis programs, such as the Immune repertoire pipeline.
204
205 -----
206
207 **Execute**
208
209 Upon pressing execute a new analysis is added to your history (right side of the page). Initially this analysis will be grey; once the analysis has started, its colour in the history will change to yellow. When the analysis is finished it will turn green in the history. Now the analysis can be opened by clicking on the eye icon on the analysis of interest. When an analysis turns red an error has occurred when running the analysis. If you click on the analysis title additional information can be found on the analysis. In addition a bug icon appears. Here more information on the error can be found.
210
211 ]]>
212 </help>
213 </tool> 236 </tool>