Mercurial > repos > gbcs-embl-heidelberg > je_markdupes
annotate je-markdupes.xml @ 4:384b972e43bc draft
planemo upload for repository https://git.embl.de/grp-gbcs/Je/tree/master/src/galaxy commit 1c7680ac36888721d2c0b72f4e6d27415fe07b43
author | gbcs-embl-heidelberg |
---|---|
date | Tue, 01 Aug 2017 03:43:03 -0400 |
parents | 4ccf1406832d |
children | e2d1b5e1eb11 |
rev | line source |
---|---|
3
4ccf1406832d
planemo upload for repository https://git.embl.de/grp-gbcs/Je/tree/master/src/galaxy commit dd9e62bdb01d1252a90ce778103ce9b6b4a8cd52-dirty
gbcs-embl-heidelberg
parents:
0
diff
changeset
|
1 <tool id="je_markdupes" name="Je-MarkDuplicates" version="@VERSION_STRING@"> |
0 | 2 <description>to filter BAM files for read duplicates taking UMIs into account</description> |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <stdio> | |
7 <exit_code range="1:" level="fatal" description="Tool exception" /> | |
8 </stdio> | |
9 <version_command>echo '1.0'</version_command> | |
10 <command interpreter="bash"> | |
11 <![CDATA[ | |
## Builds the je markdupes call from Picard-style KEY=VALUE arguments; optional arguments are emitted only when the form value is non-empty | |
12 je markdupes | |
13 | |
14 ## picard MarkDuplicates defaults | |
15 INPUT="${inputFile}" | |
16 OUTPUT="${outFile}" | |
17 | |
18 METRICS_FILE="${metrics_file}" | |
19 | |
20 REMOVE_DUPLICATES="${remove_duplicates}" | |
21 ASSUME_SORTED="${assume_sorted}" | |
22 | |
23 #for $element in $adv_options.comments: | |
24 COMMENT="${element.comment}" | |
25 #end for | |
26 | |
27 DUPLICATE_SCORING_STRATEGY="${adv_options.duplicate_scoring_strategy}" | |
28 | |
29 #import pipes | |
30 READ_NAME_REGEX=${ pipes.quote( str( $adv_options.read_name_regex ) ) or "''" } | |
31 OPTICAL_DUPLICATE_PIXEL_DISTANCE="${adv_options.optical_duplicate_pixel_distance}" | |
32 | |
33 VALIDATION_STRINGENCY="${adv_options.validation_stringency}" | |
34 QUIET=true | |
35 VERBOSITY=ERROR | |
36 | |
37 ## Je Markdupes Specific | |
38 MM=${MM} | |
39 #if str($MAX_N) != "": | |
40 MAX_N=${MAX_N} | |
41 #end if | |
42 @barcode_option_cmd@ | |
## The SPLIT form field was collected but never forwarded; pass it only when set so the tool default applies otherwise | |
#if str($SPLIT) != "": | |
SPLIT="${SPLIT}" | |
#end if | |
43 | |
44 #for $i, $option in enumerate( $repeat_slots ) | |
45 #if str($option.SLOTS) != "": | |
46 SLOTS=${option.SLOTS} | |
47 #end if | |
48 #end for | |
49 | |
50 #if str($trim_conditional.T) == "true": | |
51 T=${trim_conditional.T} | |
52 #for $i, $option in enumerate( $trim_conditional.repeat_tslots ) | |
53 #if str($option.TSLOTS) != "": | |
54 TSLOTS=${option.TSLOTS} | |
55 #end if | |
56 #end for | |
57 #end if | |
58 ]]> | |
59 </command> | |
60 <configfiles> | |
61 <expand macro="barcode_config_file"></expand> | |
62 </configfiles> | |
63 | |
64 <inputs> | |
65 <param format="bam,sam" name="inputFile" type="data" label="Select SAM/BAM dataset" | |
66 help="If empty, upload or import a SAM/BAM dataset"/> | |
67 <param name="remove_duplicates" type="boolean" label="If true do not write duplicates to the output file | |
68 instead of writing them with appropriate flags set" help="REMOVE_DUPLICATES; default=False"/> | |
69 <param name="assume_sorted" type="boolean" label="Assume the input file is already sorted" checked="true" | |
70 truevalue="true" falsevalue="false" help="ASSUME_SORTED; default=True"/> | |
71 <conditional name="barcodes"> | |
72 <param name="barcode_list_type_con" type="select" label="Do you have a predefined list of UMIs"> | |
73 <option value="file" selected="true">A one column txt file from the history</option> | |
74 <option value="text">Paste the UMI list in a text field</option> | |
75 <option value="no_barcodes">No predefined list</option> | |
76 </param> | |
77 | |
78 <when value="file"> | |
79 <param name="BARCODE_FILE" type="data" format="tabular,txt" label="UMI file" | |
80 help="BARCODE_FILE. Pre-defined list of Unique Molecular Identifiers that can be expected. | |
81 Format: one column text file, one UMI per line. All UMIs MUST have the same length."/> | |
82 </when> | |
83 | |
84 <when value="text"> | |
85 <param name="barcode_text" type="text" area="True" size="10x30" | |
86 value="barcode\n" label="Barcode file" | |
87 help="BARCODE_FILE. Pre-defined list of Unique Molecular Identifiers that can be expected. | |
88 Format: one column text file, one UMI per line. All UMIs MUST have the same length."> | |
89 <sanitizer> | |
90 <valid initial="string.printable"></valid> | |
91 <mapping initial="none"/> | |
92 </sanitizer> | |
93 </param> | |
94 </when> | |
95 <when value="no_barcodes"/> | |
96 </conditional> | |
97 <repeat name="repeat_slots" min="1" title="Unique Molecular Identifier location"> | |
98 <param name="SLOTS" type="text" value="-1" label="Where to find the UMIs in the read name" | |
99 help="SLOTS. The last position is considered by default (-1). See help below."/> | |
100 </repeat> | |
101 <param name="MM" type="integer" value="1" min="0" | |
102 label="Number of maximum mismatches to consider two Unique Molecular Identifiers (UMIs) similar" | |
103 help="MISMATCHES"/> | |
<!-- Optional count: when left empty the command template omits MAX_N entirely --> | |
104 <param name="MAX_N" type="integer" optional="true" min="0" value="" label="Maximum number of Ns a UMI can contain" | |
105 help="MAX_NUMBER_OF_N. Above this value, reads are placed in an 'undefined' group. | |
106 Default value is the MISMATCHES number."/> | |
107 <param name="SPLIT" type="text" value=":" label="Character to split up the header" help="SPLIT"/> | |
108 <conditional name="trim_conditional"> | |
109 <param name="T" type="select" | |
110 label="Should barcode information be removed from read names in the output BAM" help="TRIM_HEADERS"> | |
111 <option value="true">Yes</option> | |
112 <option value="false" selected="true">No</option> | |
113 </param> | |
114 <when value="true"> | |
115 <repeat name="repeat_tslots" min="1" title="Unique Molecular Identifier location for trimming"> | |
116 <param name="TSLOTS" type="text" value="-1" | |
117 label="Where to find the UMIs in the read name that should be removed from the header" | |
118 help="TSLOTS. Value for SLOTS is considered by default. See help below"/> | |
119 </repeat> | |
120 </when> | |
121 <when value="false"/> | |
122 </conditional> | |
<!-- Advanced options: each value below is read by the command template as adv_options.NAME --> | |
123 <section name="adv_options" title="Advanced Options" expanded="False"> | |
124 <repeat name="comments" title="Comment" min="0" help="You can provide multiple comments"> | |
125 <param name="comment" type="text" label="Add this comment to BAM dataset"/> | |
126 </repeat> | |
127 | |
128 <param name="duplicate_scoring_strategy" type="select" label="The scoring strategy for choosing the | |
129 non-duplicate among candidates" help="DUPLICATE_SCORING_STRATEGY; default=SUM_OF_BASE_QUALITIES"> | |
130 <option value="SUM_OF_BASE_QUALITIES">SUM_OF_BASE_QUALITIES</option> | |
131 <option value="TOTAL_MAPPED_REFERENCE_LENGTH">TOTAL_MAPPED_REFERENCE_LENGTH</option> | |
132 </param> | |
133 | |
134 <param name="read_name_regex" type="text" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*." | |
135 label="Regular expression that can be used to parse read names in the incoming SAM/BAM dataset" | |
136 help="READ_NAME_REGEX; Read names are parsed to extract three variables: tile/region, x coordinate and | |
137 y coordinate. These values are used to estimate the rate of optical duplication in order to give a more | |
138 accurate estimated library size. See help below for more info; | |
139 default=[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*."> | |
140 <sanitizer> | |
141 <valid initial="string.printable"> | |
142 </valid> | |
143 </sanitizer> | |
144 </param> | |
145 <param name="optical_duplicate_pixel_distance" type="integer" value="100" min="0" max="500" | |
146 label="The maximum offset between two duplicate clusters in order to consider them optical duplicates" | |
147 help="OPTICAL_DUPLICATE_PIXEL_DISTANCE; default=100"/> | |
148 | |
149 <param name="validation_stringency" type="select" label="Select validation stringency" | |
150 help="Setting stringency to SILENT can improve performance when processing a BAM file in which | |
151 variable-length data (read, qualities, tags) do not otherwise need to be decoded."> | |
152 <option value="LENIENT" selected="True">Lenient</option> | |
153 <option value="SILENT">Silent</option> | |
154 <option value="STRICT">Strict</option> | |
155 </param> | |
156 </section> | |
157 </inputs> | |
<!-- Two datasets per run: the BAM written to OUTPUT and the text report written to METRICS_FILE --> | |
158 <outputs> | |
159 <data format="bam" name="outFile" label="${tool.name} on ${on_string}: Je-MarkDuplicates BAM output"/> | |
160 <data format="txt" name="metrics_file" label="${tool.name} on ${on_string}: Je-MarkDuplicates metrics"/> | |
161 </outputs> | |
162 | |
163 <tests> | |
164 <test> | |
165 <!-- Picard MarkDuplicates defaults plus Je UMI options: UMI list from a history file, two SLOTS (-1 and -2), MM=2, MAX_N=1, duplicates removed --> | |
166 <param name="inputFile" value="markdupes_DNase_sorted.bam" ftype="bam"/> | |
167 <param name="barcode_list_type_con" value="file"/> | |
168 <param name="BARCODE_FILE" value="markdupes_umis.txt" ftype="txt"/> | |
169 <param name="repeat_slots_0|SLOTS" value="-1"/> | |
170 <param name="repeat_slots_1|SLOTS" value="-2"/> | |
171 <param name="MM" value="2"/> | |
172 <param name="MAX_N" value="1"/> | |
173 <param name="comment" value="test-run"/> | |
174 <param name="assume_sorted" value="True"/> | |
175 <param name="remove_duplicates" value="True"/> | |
176 <param name="read_name_regex" value="[a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*."/> | |
177 <param name="optical_duplicate_pixel_distance" value="100"/> | |
178 <param name="duplicate_scoring_strategy" value="SUM_OF_BASE_QUALITIES"/> | |
179 <param name="validation_stringency" value="LENIENT"/> | |
180 <output name="outFile" file="markdupes_DNase_sorted_marked.bam" ftype="bam" lines_diff="2"/> | |
181 <output name="metrics_file" file="markdupes_metrics.txt" ftype="txt" lines_diff="4"/> | |
182 </test> | |
183 </tests> | |
184 | |
185 | |
186 <help> | |
187 <![CDATA[ | |
188 **What it does** | |
189 | |
190 Je MarkDupes: Examines aligned records in the supplied SAM or BAM file to locate duplicate molecules taking into account | |
191 molecular barcodes (Unique Molecular Identifiers or UMIs) found in read header. | |
192 All records are then either written to the output file with the duplicate records flagged or trashed. | |
193 | |
194 Input file is a SAM or BAM file. | |
195 | |
196 Author: Charles Girardot (charles.girardot@embl.de). | |
197 | |
198 Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de). | |
199 | |
200 ------ | |
201 | |
202 **Know what you are doing** | |
203 | |
204 .. class:: warningmark | |
205 | |
206 You will want to read the `documentation`__. | |
207 | |
208 .. __: http://gbcs.embl.de/portal/Je | |
209 | |
210 ------ | |
211 | |
212 **Parameter list** | |
213 | |
214 This is an exhaustive list of options:: | |
215 | |
216 INPUT=String | |
217 I=String | |
218 | |
219 One or more input SAM or BAM files to analyze. Must be coordinate sorted. | |
220 | |
221 Default value: null. This option may be specified 0 or more times. | |
222 | |
223 OUTPUT=File | |
224 O=File | |
225 | |
226 The output file to write marked records to | |
227 | |
228 Required. | |
229 | |
230 MISMATCHES=Integer | |
231 MM=Integer | |
232 | |
233 Number of MisMatches (inclusive) to still consider two Unique Molecular Identifiers | |
234 (UMIs) the same i.e. this option buffers for sequencing errors. | |
235 Indeed, in case of a sequencing error, 2 duplicate reads would not be considered | |
236 duplicates anymore. | |
237 Note that N are not considered mismatches during comparison ie ATTNGG and NTTANG are seen | |
238 as the same barcode and these two reads would be flagged duplicates. | |
239 This option takes a single value even when several barcodes are present (see SLOTS). | |
240 Note that when declaring several barcodes (see SLOTS) AND providing a predefined set | |
241 of barcodes (see BC option), the MM value is applicable in each lookup. When a predefined | |
242 set of barcodes is NOT given, the different barcodes (SLOTS) are concatenated first and | |
243 the MM value is therefore considered *overall* as the concatenated code is seen as a | |
244 unique code. | |
245 MM=null is like MM=0 | |
246 Use the minimum Hamming distance of the original barcode set (if applicable). | |
247 | |
248 Required. | |
249 | |
250 MAX_NUMBER_OF_N=Integer | |
251 MAX_N=Integer | |
252 | |
253 Maximum number of Ns a molecular code can contain (inclusive). Above this value, reads | |
254 are placed in a UNDEF group. | |
255 More precisely, these 'too degenerate' codes will not: | |
256 * be compared to the list of predefined codes [predefined code list situation ie BC | |
257 option given] nor | |
258 * be considered as a potential independent code [no predefined code list situation ie | |
259 BC option not given] | |
260 Default value is the MISMATCHES number. | |
261 Note that when declaring several barcodes (see SLOTS) AND providing a predefined set | |
262 of barcodes (see BC option), the MAX_N value is applicable to each barcode. When a | |
263 predefined set | |
264 of barcodes is NOT given, the different barcodes (SLOTS) are concatenated first and the | |
265 MAX_N value | |
266 is therefore considered *overall*. | |
267 | |
268 Default value: null. | |
269 | |
270 | |
271 SLOTS=Integer | |
272 SLOTS=Integer | |
273 | |
274 Where to find the UMIs (and only the UMIs) in the read name once read name has been | |
275 tokenized using the SPLIT character (e.g. ':'). | |
276 By default, the UMI is considered to be found at the end of the read header i.e. after | |
277 the last ':'. Use this option to indicate other or additional UMI positions (e.g. | |
278 multiple UMIs present in read header). | |
279 IMPORTANT: counting starts at 1 and negative numbers can be used to start counting from | |
280 the end. | |
281 For example, consider the following read name that lists 3 different barcodes in the end: | |
282 HISEQ:44:C6KC0ANXX:8:2112:20670:79594:CGATGTTT:GATCCTAG:AAGGTACG | |
283 to indicate that the three barcodes are molecular codes, use | |
284 SLOTS=-1 SLOTS=-2 SLOTS=-3 | |
285 if only the 2 last ones should be considered (the third one being a sample encoding | |
286 barcode), use | |
287 SLOTS=-1 SLOTS=-2 | |
288 | |
289 Default value: null. This option may be specified 0 or more times. | |
290 | |
291 BARCODE_FILE=File | |
292 BC=File | |
293 | |
294 Pre-defined list of UMIs that can be expected. Format: one column text file, one barcode | |
295 per line. All UMIs MUST have the same length. | |
296 | |
297 Default value: null. | |
298 | |
299 TRIM_HEADERS=Boolean | |
300 T=Boolean | |
301 | |
302 Should barcode information be removed from read names in the output BAM? | |
303 | |
304 Default value: false. This option can be set to 'null' to clear the default value. | |
305 Possible values: {true, false} | |
306 | |
307 TSLOTS=Integer | |
308 TSLOTS=Integer | |
309 | |
310 Where to find *all* barcode(s) (i.e. sample encoding and UMIs) in the read name once it has | |
311 been tokenized using the SPLIT character (e.g. ':'). | |
312 This option is only considered when TRIM_HEADERS=true. When TSLOTS is omitted while | |
313 TRIM_HEADERS=true, the values of SLOTS apply. | |
314 IMPORTANT : counting starts at 1 and negative numbers can be used to start counting from | |
315 the end. | |
316 See SLOTS help for examples. | |
317 | |
318 Default value: null. This option may be specified 0 or more times. | |
319 | |
320 SPLIT_CHAR=String | |
321 SPLIT=String | |
322 | |
323 Character to use to split up the read header line, default is ':'. | |
324 | |
325 Default value: ':'. This option can be set to 'null' to clear the default value. | |
326 | |
327 INPUT=String | |
328 I=String | |
329 | |
330 One or more input SAM or BAM files to analyze. Must be coordinate sorted. | |
331 | |
332 Default value: null. This option may be specified 0 or more times. | |
333 | |
334 OUTPUT=File | |
335 O=File | |
336 | |
337 The output file to write marked records to Required. | |
338 | |
339 METRICS_FILE=File | |
340 M=File | |
341 | |
342 File to write duplication metrics to Required. | |
343 | |
344 COMMENT=String | |
345 CO=String | |
346 | |
347 Comment(s) to include in the output file's header. | |
348 | |
349 Default value: null. This option may be specified 0 or more times. | |
350 | |
351 REMOVE_DUPLICATES=Boolean | |
352 | |
353 If true do not write duplicates to the output file instead of writing them with | |
354 appropriate flags set. | |
355 | |
356 Default value: false. This option can be set to 'null' to clear | |
357 the default value. | |
358 Possible values: {true, false} | |
359 | |
360 ASSUME_SORTED=Boolean | |
361 AS=Boolean | |
362 | |
363 If true, assume that the input file is coordinate sorted even if the header says | |
364 otherwise. | |
365 | |
366 Default value: false. This option can be set to 'null' to clear the default | |
367 value. | |
368 Possible values: {true, false} | |
369 | |
370 DUPLICATE_SCORING_STRATEGY=ScoringStrategy | |
371 DS=ScoringStrategy | |
372 | |
373 The scoring strategy for choosing the non-duplicate among candidates. | |
374 | |
375 Default value: SUM_OF_BASE_QUALITIES. This option can be set to 'null' to clear the default value. | |
376 Possible values: {SUM_OF_BASE_QUALITIES, TOTAL_MAPPED_REFERENCE_LENGTH} | |
377 | |
378 READ_NAME_REGEX=String | |
379 | |
380 Regular expression that can be used to parse read names in the incoming SAM file. Read | |
381 names are parsed to extract three variables: tile/region, x coordinate and y coordinate. | |
382 These values are used to estimate the rate of optical duplication in order to give a more | |
383 accurate estimated library size. Set this option to null to disable optical duplicate | |
384 detection. The regular expression should contain three capture groups for the three | |
385 variables, in order. It must match the entire read name. Note that if the default regex | |
386 is specified, a regex match is not actually done, but instead the read name is split on | |
387 colon character. For 5 element names, the 3rd, 4th and 5th elements are assumed to be | |
388 tile, x and y values. For 7 element names (CASAVA 1.8), the 5th, 6th, and 7th elements | |
389 are assumed to be tile, x and y values. | |
390 | |
391 Default value: | |
392 [a-zA-Z0-9]+:[0-9]:([0-9]+):([0-9]+):([0-9]+).*. This option can be set to 'null' to | |
393 clear the default value. | |
394 | |
395 OPTICAL_DUPLICATE_PIXEL_DISTANCE=Integer | |
396 | |
397 The maximum offset between two duplicate clusters in order to consider them optical | |
398 duplicates. This should usually be set to some fairly small number (e.g. 5-10 pixels) | |
399 unless using later versions of the Illumina pipeline that multiply pixel values by 10, in | |
400 which case 50-100 is more normal. | |
401 | |
402 Default value: 100. This option can be set to 'null' | |
403 to clear the default value. | |
404 | |
405 ]]> | |
406 </help> | |
407 | |
408 </tool> |