comparison je-demultiplex-illu.xml @ 0:111ba1180318 draft

Initial upload
author gbcs-embl-heidelberg
date Wed, 25 Nov 2015 12:36:37 -0500
parents
children 01fdc6d10660
comparison
equal deleted inserted replaced
-1:000000000000 0:111ba1180318
1 <tool id="je_demultiplex_illu" name="Je-Demultiplex-Illu" version="1.0">
2 <description>demultiplexes fastq files using Illumina Index file</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <stdio>
7 <exit_code range="1:" level="fatal" description="Tool exception" />
8 </stdio>
9 <version_command>echo '1.0'</version_command>
<!-- Command template for `je demultiplex-illu`.
     The @..._cmd@ placeholders are tokens expected to be supplied by the imported macros.xml
     (their definitions are not part of this file).
     UMI handling: when INTERNAL_BARCODES is 'true', BPOS and C are taken from the conditional,
     and BCLEN is only passed if LEN is non-empty; otherwise BPOS=NONE and C=false are passed. -->
10 <command interpreter="bash">
11 <![CDATA[
12 je demultiplex-illu
13
14 ## Fastq inputs
15 @single_or_paired_illu_cmd@
16
17 @barcode_option_cmd@
18
19 #if str($INTERNAL_BARCODES_CON.INTERNAL_BARCODES) == 'true':
20 BPOS=${INTERNAL_BARCODES_CON.BPOS}
21 C=${INTERNAL_BARCODES_CON.CLIP_BARCODE}
22 #if str( $INTERNAL_BARCODES_CON.LEN ) != "":
23 BCLEN=$INTERNAL_BARCODES_CON.LEN
24 #end if
25 #else:
26 BPOS=NONE
27 C=false
28 #end if
29
30 @common_options_cmd@
31
32 @demultiplexer_common_output_options_cmd@
33 @demultiplexer_common_outputs_cmd@
34 ]]>
35 </command>
36 <configfiles>
37 <expand macro="barcode_config_file"/>
38 </configfiles>
39 <inputs>
40 <!-- single/paired - similar to macro 'single_or_paired_general' -->
41 <expand macro="single_or_paired_illu">
42 <expand macro="demultiplex_illu_paired_end_options"/>
43 </expand>
44 <expand macro="barcode_option"/>
<!-- UMI switch. The command template tests INTERNAL_BARCODES against 'true' and, in that case,
     reads BPOS, LEN and CLIP_BARCODE from this conditional. BPOS is declared here; LEN and
     CLIP_BARCODE are presumably provided by the barcode_len_option and clip_barcode macros
     (defined in macros.xml, to be confirmed there). -->
45 <conditional name="INTERNAL_BARCODES_CON">
46 <param name="INTERNAL_BARCODES" type="select"
47 label="Do your reads contain Unique Molecular Identifiers(UMIs)">
48 <option value="true">Yes</option>
49 <option value="false" selected="true">No</option>
50 </param>
<!-- Extra UMI parameters are only shown when the selector above is "Yes". -->
51 <when value="true">
52 <param name="BPOS" type="select" label="Barcode read position (BPOS)" help="where are the barcodes.
53 If not using paired-end it does not matter what you specify here.">
54 <option value="READ_1" selected="true">READ_1 (beginning of read from the first fastq file)</option>
55 <option value="READ_2">READ_2 (beginning of read from the second fastq file)</option>
56 <option value="BOTH">BOTH (beginning of both reads)</option>
57 </param>
58 <expand macro="barcode_len_option"/>
59 <expand macro="clip_barcode"/>
60 </when>
<!-- No extra parameters in the "No" case; the command template then passes BPOS=NONE and C=false. -->
61 <when value="false"/>
62 </conditional>
63
64 <expand macro="demultiplexer_common_options"/>
65
66 <expand macro="common_options"/>
67
68 <expand macro="demultiplexer_common_output_options"/>
69
70 </inputs>
71 <outputs>
72 <expand macro="demultiplexer_common_outputs"/>
73 </outputs>
74
<!-- Single functional test: paired-end reads with two index files (I1 and I2, dual indexing)
     and UMIs enabled on both reads (INTERNAL_BARCODES=true, BPOS=BOTH, LEN=8:8). -->
75 <tests>
76 <test>
77 <!-- barcode at both ends, non-redundant -->
78 <param name="type" value="paired"/>
79 <param name="input_1" value="illu_file_1_sequence.txt" ftype="fastqsanger"/>
80 <param name="input_2" value="illu_file_2_sequence.txt" ftype="fastqsanger"/>
81 <param name="I1" value="illu_file_1_index.txt" ftype="fastqsanger"/>
82 <param name="I2_AVAILABLE" value="true"/>
83 <param name="I2" value="illu_file_2_index.txt" ftype="fastqsanger"/>
84
85 <param name="INTERNAL_BARCODES" value="true"/>
86 <param name="barcode_list_type_con" value="file"/>
87 <param name="BARCODE_FILE" value="illu_dualindexing.txt" ftype="tabular"/>
88 <param name="LEN" value="8:8"/>
89 <param name="ZT" value="5:6"/>
90 <param name="BPOS" value="BOTH"/>
91 <param name="BM" value="BOTH"/>
92 <param name="BRED" value="false"/>
93 <param name="MM" value="3"/>
94 <param name="MMD" value="2"/>
95 <param name="Q" value="20"/>
96 <param name="DIAG" value="false"/>
<!-- Expected results: the metrics file (compared with lines_diff="4", i.e. a small number of
     differing lines is tolerated; see the Galaxy test documentation for the exact semantics)
     plus two unassigned-read files and a _1/_2 file pair for each of the five samples,
     all declared as discovered datasets. -->
97 <output name="METRICS_FILE_NAME" file="illu_summary_PE.txt" ftype="tabular" lines_diff="4">
98 <discovered_dataset designation="unassigned_1" file="illu_unassigned_1_PE.txt" />
99 <discovered_dataset designation="unassigned_2" file="illu_unassigned_2_PE.txt" />
100 <discovered_dataset designation="emb681m5_GGACTCCTCTCTCTAT_2" file="emb681m5_GGACTCCTCTCTCTAT_2.txt"/>
101 <discovered_dataset designation="emb681m5_GGACTCCTCTCTCTAT_1" file="emb681m5_GGACTCCTCTCTCTAT_1.txt"/>
102 <discovered_dataset designation="emb681m4_TCCTGAGCCTCTCTAT_2" file="emb681m4_TCCTGAGCCTCTCTAT_2.txt"/>
103 <discovered_dataset designation="emb681m4_TCCTGAGCCTCTCTAT_1" file="emb681m4_TCCTGAGCCTCTCTAT_1.txt"/>
104 <discovered_dataset designation="emb681m1_TAAGGCGACTCTCTAT_2" file="emb681m1_TAAGGCGACTCTCTAT_2.txt"/>
105 <discovered_dataset designation="emb681m1_TAAGGCGACTCTCTAT_1" file="emb681m1_TAAGGCGACTCTCTAT_1.txt"/>
106 <discovered_dataset designation="emb6801m2_AGGCAGAATAGATCGC_2" file="emb6801m2_AGGCAGAATAGATCGC_2.txt"/>
107 <discovered_dataset designation="emb6801m2_AGGCAGAATAGATCGC_1" file="emb6801m2_AGGCAGAATAGATCGC_1.txt"/>
108 <discovered_dataset designation="emb6801m1_CGTACTAGTAGATCGC_2" file="emb6801m1_CGTACTAGTAGATCGC_2.txt"/>
109 <discovered_dataset designation="emb6801m1_CGTACTAGTAGATCGC_1" file="emb6801m1_CGTACTAGTAGATCGC_1.txt"/>
110 </output>
111 </test>
112 </tests>
113
114
115 <help>
116 <![CDATA[
117 **What it does**
118
119 Je demultiplex-illu: demultiplex fastq files using Illumina Index files,
120 with optional handling of Unique Molecular Identifiers for further use in the 'markdupes' module.
121 Input files are fastq files, and can be in gzip compressed format.
122
123 Author: Charles Girardot (charles.girardot@embl.de).
124
125 Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).
126
127 ------
128
129 **Know what you are doing**
130
131 .. class:: warningmark
132
133 You will want to read the `documentation`__.
134
135 .. __: http://gbcs.embl.de/portal/Je
136
137 ------
138
139 **Parameter list**
140
141 This is an exhaustive list of options::
142
143 FASTQ_FILE1=File
144 F1=File
145
146 Input fastq file (optionally gzipped) for single end data, or first read in paired end
147 data.
148
149 Required.
150
151 FASTQ_FILE2=File
152 F2=File
153
154 Input fastq file (optionally gzipped) for the second read of paired end data.
155
156 Default value: null.
157
158 INDEX_FILE1=File
159 I1=File
160
161 Fastq file for index 1 (barcode) reads, optionally gzipped
162
163 Required.
164
165 INDEX_FILE2=File
166 I2=File
167
168 Fastq file for index 2 (barcode) reads, optionally gzipped.
169 An INDEX_FILE1 MUST be provided when INDEX_FILE2 is given. This situation corresponds to
170 Illumina dual indexing.
171
172 Default value: null.
173
174 BARCODE_FILE=File
175 BF=File
176
177 Barcode file describing sequence list and sample names. Tab-delimited file with 2
178 columns, with the sample in col1 and the corresponding barcode in col2.
179 Simple barcode file format : 2 tab-delimited columns
180 If multiple barcode map to the same sample, either line can be duplicated e.g.
181 sample1 ATAT
182 sample1 GAGG
183 sample2 CCAA
184 sample2 TGTG
185 Or barcodes can be combined using the OR operator '|' i.e. the file above can be
186 re-written like
187 sample1 ATAT|GAGG
188 sample2 CCAA|TGTG
189 Finally, for the special situation of paired-end data in which barcodes differ at both
190 ends (ie BPOS=BOTH BRED=false BM=BOTH , see BRED option description), barcodes for read_1
191 and read_2 can be distinguished using a ':' separator i.e.
192 sample1 ATAT:GAGG
193 sample2 CCAA:TGTG
194 This above syntax means that sample 1 is encoded with ATAT barcode at read_1 AND GAGG
195 barcode at read_2. Note that you can still combine barcodes using | e.g.
196 sample1 ATAT|GAGG:CCAA|TGTG
197 would mean that sample 1 is mapped by the combination of barcode: ATAT OR GAGG at read_1
198 AND CCAA OR TGTG at read_2.
199 Extended barcode file format : 3 (single-end) or 4 (paired-end) tab-delimited columns
200 same as the simple barcode file format but the extra columns contains the file name(s)
201 to use to name output files. A unique extra column is expected for single-end while 2
202 extra columns are expected for paired-end. In case lines are duplicated (multiple
203 barcodes mapping the same sample), the same file name should be indicated in the third
204 (and fourth) column(s).
205 sample1 ATAT spl1_1.txt.gz spl1_2.txt.gz
206 sample1 GAGG spl1_1.txt.gz spl1_2.txt.gz
207 sample2 CCAA spl2_1.txt.gz spl2_2.txt.gz
208 Or
209 sample1 ATAT|GAGG:CCAA|TGTG spl1_1.txt.gz spl1_2.txt.gz
210 Ns in barcode sequence are allowed and are used to flag positions that should be ignored
211 in sample matching
212 i.e. they will be clipped off the read sequence (like in iCLIP protocol).
213
214 Required.
215
216 BARCODE_READ_POS=BarcodePosition
217 BPOS=BarcodePosition
218
219 Indicates the location of additional barcodes present in the read(s). Setting this option
220 implies setting the LEN option.
221 Importantly, these additional barcodes must not encode sample identity information but
222 be used for e.g. molecular barcoding (UMIs) or for any purpose other than sample identity encoding.
223
224 Default value: BOTH. This option can be set to 'null' to clear the default value.
225 Possible values: {READ_1, READ_2, BOTH, NONE}
226
227 BCLEN=String
228 LEN=String
229
230 Length of the barcode sequences, optional. Taken from barcode file when not given.
231 In situations where BARCODE_READ_POS == BOTH AND REDUNDANT_BARCODES=false, two distinct
232 lengths can be provided using the syntax LEN=X:Z where X and Z are 2 integers representing
233 the barcode length for read_1 and read_2 respectively.
234
235 Default value: null
236
237 REDUNDANT_BARCODES=Boolean
238 BRED=Boolean
239
240 This option only applies for paired-end data with *both* INDEX_FILE1 and INDEX_FILE2
241 provided.
242 Indicates if both index barcodes encode redundant information i.e. if both barcodes are
243 supposed to be identical (or resolve to the same sample when a pool of barcodes is used
244 per sample).
245 When BRED=true, the STRICT option guides the sample lookup behavior. When BRED=false,
246 barcodes are combined prior to sample lookup.
247
248 Default value: true. This option can be set to 'null' to clear the default value.
249 Possible values: {true, false}
250
251 STRICT=Boolean
252 S=Boolean
253
254 For paired-end data and when two distinct barcodes/indices are used to encode samples,
255 this option tells if both barcodes should resolve to the same sample.
256 When true and if only one of the two reads has a barcode match, the read pair is
257 'unassigned'.
258 When false and if only one of the two reads has a barcode match, the read pair is
259 assigned to the
260 corresponding sample
261 When reads resolve to different samples, the read pair is always 'unassigned'.
262
263 Default value: false. This option can be set to 'null' to clear the default value.
264 Possible values: {true, false}
265
266 MAX_MISMATCHES=String
267 MM=String
268
269 Maximum mismatches for a barcode to be considered a match. In situations where both
270 barcodes are used for sample matching i.e. BPOS=BOTH BM=BOTH (or 2 INDEX_FILE given), two
271 distinct
272 values can be given here using the syntax MM=X:Z where X and Z are 2 integers to use for
273 read_1 and read_2 respectively.
274 MM=null is like MM=0
275
276 Default value: 1. This option can be set to 'null' to clear the default value.
277
278 MIN_MISMATCH_DELTA=String
279 MMD=String
280
281 Minimum difference between the number of mismatches against the best and the second best
282 barcode. When MMD is not respected, the read remains unassigned.
283 When two distinct barcodes are used for sample matching (dual encoding), two distinct
284 values can be given using the syntax MMD=X:Z where X and Z are 2 integers to use for
285 first (e.g. from read_1 or index_1) and second barcode (e.g. from read_2 or index_2) respectively.
286 MMD=null is like MMD=0
287
288 Default value: 1. This option can be set to 'null' to clear the default value.
289
290 MIN_BASE_QUALITY=String
291 Q=String
292
293 Minimum base quality during barcode matching: bases whose quality is less than this
294 cutoff are always considered as a mismatch. When two distinct barcodes are used for sample
295 matching (dual encoding), two distinct values can be given using the syntax Q=X:Z where X
296 and Z are 2 integers to use for first (e.g. from read_1 or index_1) and second barcode
297 (e.g. from read_2 or index_2) respectively.
298 Q=null is like Q=0.
299
300 Default value: 10. This option can be set to 'null' to clear the default value.
301
302 XTRIMLEN=String
303 XT=String
304
305 Optional extra number of bases to be trimmed right after the barcode (only used if
306 CLIP_BARCODE=true).
307 When running paired-end, two distinct values can be given using the syntax XT=X:Z where X
308 and Z are 2 integers to use for read_1 and read_2 respectively. Note that even when
309 BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode so as to
310 end up with reads of the same length (note that this can also be operated using ZT). If a
311 unique value is given, e.g. XT=1, while running paired-end the following rule applies:
312 (1) BPOS=READ_1 or BPOS=READ_2, no trim is applied at the read w/o barcode
313 (2) BPOS=BOTH, the value is used for both reads.
314
315 Note that XT=null is like XT=0.
316 Default value: 0. This option can be set to 'null' to clear the default value.
317
318 ZTRIMLEN=String
319 ZT=String
320
321 Optional extra number of bases to be trimmed from the read end i.e. 3' end.
322 When running paired-end, two distinct values can be given here using the syntax ZT=X:Z
323 where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that even
324 when BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode
325 so as to end up with reads of the same length (note that this can also be operated using
326 XT). Note that if a single value is passed, the value always applies to both reads in
327 paired-end mode without further consideration.
328 ZT=null is like ZT=0.
329
330 Default value: 0. This option can be set to 'null' to clear the default value.
331
332 CLIP_BARCODE=Boolean
333 C=Boolean
334
335 Clip barcode sequence from read sequence, as well as XTRIMLEN (and ZTRIMLEN) bases if
336 applicable, before writing to output file.
337 If false, reads are written without modification to output file.
338 Apply to both barcodes when BPOS=BOTH.
339
340 Default value: true. This option can be set to 'null' to clear the default value.
341 Possible values: {true, false}
342
343 ADD_BARCODE_TO_HEADER=Boolean
344 ADD=Boolean
345
346 Add matched barcode at the end of the read header. Applies to both index when INDEX_FILE2
347 is also provided.
348 First the sample encoding barcodes from I1 (and I2 when relevant) are added to the read
349 headers like
350 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:I1_BARCODE:I2_BARCODE
351 Then, if BPOS!=NONE, the additional barcodes (UMIs) clipped from the read(s) are added
352 to their own header, like
353 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:I1_BARCODE:I2_BARCODE:CLIPPED_SEQ_FROMREAD
354
355 Default value: true. This option can be set to 'null' to clear the default value.
356 Possible values: {true, false}
357
358 ENSURE_IDENTICAL_HEADER_NAMES=Boolean
359 SAME_HEADERS=Boolean
360
361 Makes sure that headers of both reads of a pair are identical, using the following read
362 header pattern (for both reads of a pair):
363 @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 I1_BARCODE:I2_BARCODE(:CLIPPED_SEQ_FROMREAD1:CLIPPED_SEQ_FROMREAD2)
364 This option only makes sense in paired end mode and ADD=true. Some (if not all) mappers
365 will indeed complain when the read headers are not identical. When molecular barcodes are
366 present in reads and the RCHAR is used, you will end with (problematic) read headers like
367 this:
368 HISEQ:44:C6KC0ANXX:5:1101:1491:1994:1:N:0:TAGAACAC:TGGAGTAG
369 HISEQ:44:C6KC0ANXX:5:1101:1491:1994:3:N:0:TAGAACAC:CGTTGTAT
370 SAME_HEADERS=true will instead generate the following identical header for both reads:
371 HISEQ:44:C6KC0ANXX:5:1101:1491:1994:TAGAACAC:TGGAGTAG:CGTTGTAT
372 Note that we also clipped the useless '1:N:0' and '3:N:0' as they will also result in
373 generating different headers
374 Important: this option will force RCHAR=: UNLESS you specify RCHAR=null ; in which
375 case a space will be preserved ie:
376 HISEQ:44:C6KC0ANXX:5:1101:1491:1994 TAGAACAC:TGGAGTAG:CGTTGTAT
377
378 Default value: true. This option can be set to 'null' to clear the default value.
379 Possible values: {true, false}
380
381 READ_NAME_REPLACE_CHAR=String
382 RCHAR=String
383
384 Replace spaces in read name/header using provided character. This is particularly handy
385 when you need to retain ADDed barcode in read name/header during mapping (everything
386 after space in read name is usually clipped in BAM files). For example, with RCHAR=':':
387 '@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:'
388 becomes
389 '@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965:2:N:0:BARCODE'
390 Default value: null.
391
392 QUALITY_FORMAT=FastqQualityFormat
393 V=FastqQualityFormat
394
395 A value describing how the quality values are encoded in the fastq. Either 'Solexa' for
396 pre-pipeline 1.3 style scores (solexa scaling + 66), 'Illumina' for pipeline 1.3 and
397 above (phred scaling + 64) or 'Standard' for phred scaled scores with a character shift
398 of 33. If this value is not specified (or 'null' is given), the quality format will be
399 detected.
400
401 Default value: Standard. This option can be set to 'null' to clear the default value.
402 Possible values: {Solexa, Illumina, Standard}
403
404 KEEP_UNASSIGNED_READ=Boolean
405 UN=Boolean
406
407 Should un-assigned reads be saved in files or simply ignored. File names are
408 automatically created or can be given using UF1 & UF2 options.
409
410 Default value: true. This option can be set to 'null' to clear the default value.
411 Possible values: {true, false}
412
413 BARCODE_DIAG_FILE=String
414 DIAG=String
415
416 Name for a barcode match reporting file (not generated by default). Either a name (in
417 which case the file will be created in the output dir) or full path. This file will
418 contain a line per read pair with the barcode best matching the read subsequence or
419 'null' when no match is found according to matching parameters ; and the final selected
420 sample. This file is useful for debugging or further processing in case both ends are
421 barcoded.
422 N.B: this file will have a size of about one of the fastq input files.
423
424 Default value: null.
425 ]]>
426 </help>
427
428 </tool>