<?xml version="1.0" encoding="UTF-8"?>
<!--
    Galaxy tool wrapper for `je demultiplex-illu`: demultiplexes fastq files
    using Illumina index files, with optional handling of additional in-read
    barcodes (UMIs).

    Every <expand macro="..."/> element and every @..._cmd@ token used below is
    defined in macros.xml, which is imported in the <macros> section.
-->
<tool id="je_demultiplex_illu" name="Je-Demultiplex-Illu" version="1.0">
    <description>demultiplexes fastq files using Illumina Index file</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <stdio>
        <!-- Any non-zero exit code is reported as a fatal tool error. -->
        <exit_code range="1:" level="fatal" description="Tool exception" />
    </stdio>
    <version_command>echo '1.0'</version_command>
    <!--
        NOTE(review): the `interpreter` attribute is deprecated in the current
        Galaxy tool schema. Confirm whether `je` is a wrapper script shipped in
        this tool directory before removing it.
    -->
    <command interpreter="bash">
<![CDATA[
    je demultiplex-illu

    ## Fastq inputs
    @single_or_paired_illu_cmd@

    @barcode_option_cmd@

    ## Additional in-read barcodes (UMIs): only forwarded when the user enabled them,
    ## otherwise barcode reading and clipping are explicitly switched off.
    #if str($INTERNAL_BARCODES_CON.INTERNAL_BARCODES) == 'true':
        BPOS=${INTERNAL_BARCODES_CON.BPOS}
        C=${INTERNAL_BARCODES_CON.CLIP_BARCODE}
        ## LEN is optional; it is quoted because it is free text typed by the user.
        #if str( $INTERNAL_BARCODES_CON.LEN ) != "":
            BCLEN='${INTERNAL_BARCODES_CON.LEN}'
        #end if
    #else:
        BPOS=NONE
        C=false
    #end if

    @common_options_cmd@

    @demultiplexer_common_output_options_cmd@
    @demultiplexer_common_outputs_cmd@
]]>
    </command>
    <configfiles>
        <expand macro="barcode_config_file"/>
    </configfiles>
    <inputs>
        <!-- single/paired - similar to macro 'single_or_paired_general' -->
        <expand macro="single_or_paired_illu">
            <expand macro="demultiplex_illu_paired_end_options"/>
        </expand>
        <expand macro="barcode_option"/>
        <!-- Options in the "true" branch are only shown (and only passed to je) when UMIs are present. -->
        <conditional name="INTERNAL_BARCODES_CON">
            <param name="INTERNAL_BARCODES" type="select"
                   label="Do your reads contain Unique Molecular Identifiers (UMIs)?">
                <option value="true">Yes</option>
                <option value="false" selected="true">No</option>
            </param>
            <when value="true">
                <param name="BPOS" type="select"
                       label="Barcode read position (BPOS)"
                       help="Where the barcodes are located. If not using paired-end data, it does not matter what you specify here.">
                    <option value="READ_1" selected="true">READ_1 (beginning of read from the first fastq file)</option>
                    <option value="READ_2">READ_2 (beginning of read from the second fastq file)</option>
                    <option value="BOTH">BOTH (beginning of both reads)</option>
                </param>
                <expand macro="barcode_len_option"/>
                <expand macro="clip_barcode"/>
            </when>
            <when value="false"/>
        </conditional>

        <expand macro="demultiplexer_common_options"/>

        <expand macro="common_options"/>

        <expand macro="demultiplexer_common_output_options"/>

    </inputs>
    <outputs>
        <expand macro="demultiplexer_common_outputs"/>
    </outputs>

    <tests>
        <test>
            <!-- barcode at both ends, non-redundant -->
            <param name="type" value="paired"/>
            <param name="input_1" value="illu_file_1_sequence.txt" ftype="fastqsanger"/>
            <param name="input_2" value="illu_file_2_sequence.txt" ftype="fastqsanger"/>
            <param name="I1" value="illu_file_1_index.txt" ftype="fastqsanger"/>
            <param name="I2_AVAILABLE" value="true"/>
            <param name="I2" value="illu_file_2_index.txt" ftype="fastqsanger"/>

            <param name="INTERNAL_BARCODES" value="true"/>
            <param name="barcode_list_type_con" value="file"/>
            <param name="BARCODE_FILE" value="illu_dualindexing.txt" ftype="tabular"/>
            <param name="LEN" value="8:8"/>
            <param name="ZT" value="5:6"/>
            <param name="BPOS" value="BOTH"/>
            <param name="BM" value="BOTH"/>
            <param name="BRED" value="false"/>
            <param name="MM" value="3"/>
            <param name="MMD" value="2"/>
            <param name="Q" value="20"/>
            <param name="DIAG" value="false"/>
            <output name="METRICS_FILE_NAME" file="illu_summary_PE.txt" ftype="tabular" lines_diff="4">
                <discovered_dataset designation="unassigned_1" file="illu_unassigned_1_PE.txt" />
                <discovered_dataset designation="unassigned_2" file="illu_unassigned_2_PE.txt" />
                <discovered_dataset designation="emb681m5_GGACTCCTCTCTCTAT_2" file="emb681m5_GGACTCCTCTCTCTAT_2.txt"/>
                <discovered_dataset designation="emb681m5_GGACTCCTCTCTCTAT_1" file="emb681m5_GGACTCCTCTCTCTAT_1.txt"/>
                <discovered_dataset designation="emb681m4_TCCTGAGCCTCTCTAT_2" file="emb681m4_TCCTGAGCCTCTCTAT_2.txt"/>
                <discovered_dataset designation="emb681m4_TCCTGAGCCTCTCTAT_1" file="emb681m4_TCCTGAGCCTCTCTAT_1.txt"/>
                <discovered_dataset designation="emb681m1_TAAGGCGACTCTCTAT_2" file="emb681m1_TAAGGCGACTCTCTAT_2.txt"/>
                <discovered_dataset designation="emb681m1_TAAGGCGACTCTCTAT_1" file="emb681m1_TAAGGCGACTCTCTAT_1.txt"/>
                <discovered_dataset designation="emb6801m2_AGGCAGAATAGATCGC_2" file="emb6801m2_AGGCAGAATAGATCGC_2.txt"/>
                <discovered_dataset designation="emb6801m2_AGGCAGAATAGATCGC_1" file="emb6801m2_AGGCAGAATAGATCGC_1.txt"/>
                <discovered_dataset designation="emb6801m1_CGTACTAGTAGATCGC_2" file="emb6801m1_CGTACTAGTAGATCGC_2.txt"/>
                <discovered_dataset designation="emb6801m1_CGTACTAGTAGATCGC_1" file="emb6801m1_CGTACTAGTAGATCGC_1.txt"/>
            </output>
        </test>
    </tests>


    <help>
<![CDATA[
**What it does**

Je demultiplex-illu: demultiplex fastq files using Illumina Index files,
with optional handling of Unique Molecular Identifiers for further use in 'markdupes' module.
Input files are fastq files, and can be in gzip compressed format.

Author: Charles Girardot (charles.girardot@embl.de).

Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).

------

**Know what you are doing**

.. class:: warningmark

You will want to read the `documentation`__.

.. __: http://gbcs.embl.de/portal/Je

------

**Parameter list**

This is an exhaustive list of options::

    FASTQ_FILE1=File
    F1=File

        Input fastq file (optionally gzipped) for single end data, or first read in paired end
        data.

        Required.

    FASTQ_FILE2=File
    F2=File

        Input fastq file (optionally gzipped) for the second read of paired end data.

        Default value: null.

    INDEX_FILE1=File
    I1=File

        Fastq file for index 1 (barcode) reads, optionally gzipped.

        Required.

    INDEX_FILE2=File
    I2=File

        Fastq file for index 2 (barcode) reads, optionally gzipped.
        An INDEX_FILE1 MUST be provided when INDEX_FILE2 is given. This situation corresponds to
        Illumina dual indexing.

        Default value: null.

    BARCODE_FILE=File
    BF=File

        Barcode file describing sequence list and sample names. Tab-delimited file with 2
        columns, with the sample in col1 and the corresponding barcode in col2.
        Simple barcode file format : 2 tab-delimited columns
        If multiple barcodes map to the same sample, either line can be duplicated e.g.
            sample1    ATAT
            sample1    GAGG
            sample2    CCAA
            sample2    TGTG
        Or barcodes can be combined using the OR operator '|' i.e. the file above can be
        re-written like
            sample1    ATAT|GAGG
            sample2    CCAA|TGTG
        Finally, for the special situation of paired-end data in which barcodes differ at both
        ends (ie BPOS=BOTH BRED=false BM=BOTH , see BRED option description), barcodes for read_1
        and read_2 can be distinguished using a ':' separator i.e.
            sample1    ATAT:GAGG
            sample2    CCAA:TGTG
        This above syntax means that sample 1 is encoded with ATAT barcode at read_1 AND GAGG
        barcode at read_2. Note that you can still combine barcodes using | e.g.
            sample1    ATAT|GAGG:CCAA|TGTG
        would mean that sample 1 is mapped by the combination of barcode: ATAT OR GAGG at read_1
        AND CCAA OR TGTG at read_2.
        Extended barcode file format : 3 (single-end) or 4 (paired-end) tab-delimited columns
        same as the simple barcode file format but the extra columns contains the file name(s)
        to use to name output files. A unique extra column is expected for single-end while 2
        extra columns are expected for paired-end. In case, lines are duplicated (multiple
        barcodes mapping the same sample), the same file name should be indicated in the third
        (and fourth) column(s).
            sample1    ATAT    spl1_1.txt.gz    spl1_2.txt.gz
            sample1    GAGG    spl1_1.txt.gz    spl1_2.txt.gz
            sample2    CCAA    spl2_1.txt.gz    spl2_2.txt.gz
        Or
            sample1    ATAT|GAGG:CCAA|TGTG    spl1_1.txt.gz    spl1_2.txt.gz
        Ns in barcode sequence are allowed and are used to flag positions that should be ignored
        in sample matching
        i.e. they will be clipped off the read sequence (like in iCLIP protocol).

        Required.

    BARCODE_READ_POS=BarcodePosition
    BPOS=BarcodePosition

        Indicates the location of additional barcodes present in the read(s). Setting this option
        implies setting the LEN option.
        Importantly, these additional barcodes must not encode sample identity information but
        used for e.g. molecular barcoding (UMIs) or for any purpose other than sample identity encoding.

        Default value: BOTH. This option can be set to 'null' to clear the default value.
        Possible values: {READ_1, READ_2, BOTH, NONE}

    BCLEN=String
    LEN=String

        Length of the barcode sequences, optional. Taken from barcode file when not given.
        In situations where BARCODE_READ_POS == BOTH AND REDUNDANT_BARCODES=false, two distinct
        length can be provided using the syntax LEN=X:Z where X and Z are 2 integers representing
        the barcode length for read_1 and read_2 respectively.

        Default value: null

    REDUNDANT_BARCODES=Boolean
    BRED=Boolean

        This option only applies for paired-end data with *both* INDEX_FILE1 and INDEX_FILE2
        provided.
        Indicates if both index barcodes encode redundant information i.e. if both barcodes are
        supposed to be identical (or resolve to the same sample when a pool of barcodes is used
        per sample).
        When BRED=true, the STRICT option guides the sample lookup behavior. When BRED=false,
        barcodes are combined prior to sample lookup.

        Default value: true. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    STRICT=Boolean
    S=Boolean

        For paired-end data and when two distinct barcodes/indices are used to encode samples,
        this option tells if both barcodes should resolve to the same sample.
        When true and if only one of the two reads has a barcode match, the read pair is
        'unassigned'.
        When false and if only one of the two reads has a barcode match, the read pair is
        assigned to the
        corresponding sample.
        When reads resolve to different samples, the read pair is always 'unassigned'.

        Default value: false. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    MAX_MISMATCHES=String
    MM=String

        Maximum mismatches for a barcode to be considered a match. In situations where both
        barcodes are used for sample matching i.e. BPOS=BOTH BM=BOTH (or 2 INDEX_FILE given), two
        distinct
        values can be given here using the syntax MM=X:Z where X and Z are 2 integers to use for
        read_1 and read_2 respectively.
        MM=null is like MM=0

        Default value: 1. This option can be set to 'null' to clear the default value.

    MIN_MISMATCH_DELTA=String
    MMD=String

        Minimum difference between the number of mismatches against the best and the second best
        barcode. When MMD is not respected, the read remains unassigned.
        When two distinct barcodes are used for sample matching (dual encoding), two distinct
        values can be given using the syntax MMD=X:Z where X and Z are 2 integers to use for
        first (e.g. from read_1 or index_1) and second barcode (e.g. from read_2 or index_2)
        respectively.
        MMD=null is like MMD=0

        Default value: 1. This option can be set to 'null' to clear the default value.

    MIN_BASE_QUALITY=String
    Q=String

        Minimum base quality during barcode matching: bases whose quality is less than this
        cutoff are always considered as a mismatch. When two distinct barcodes are used for sample
        matching (dual encoding), two distinct values can be given using the syntax Q=X:Z where X
        and Z are 2 integers to use for first (e.g. from read_1 or index_1) and second barcode
        (e.g. from read_2 or index_2) respectively.
        Q=null is like Q=0.

        Default value: 10. This option can be set to 'null' to clear the default value.

    XTRIMLEN=String
    XT=String

        Optional extra number of bases to be trimmed right after the barcode (only used if
        CLIP_BARCODE=true).
        When running paired-end, two distinct values can be given using the syntax XT=X:Z where X
        and Z are 2 integers to use for read_1 and read_2 respectively. Note that even when
        BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode as to
        end up with reads of the same length (note that this can also be operated using ZT). If a
        unique value is given, e.g. XT=1, while running paired-end the following rule applies:
        (1) BPOS=READ_1 or BPOS=READ_2, no trim is applied at the read w/o barcode
        (2) BPOS=BOTH, the value is used for both reads.

        Note that XT=null is like XT=0.
        Default value: 0. This option can be set to 'null' to clear the default value.

    ZTRIMLEN=String
    ZT=String

        Optional extra number of bases to be trimmed from the read end i.e. 3' end.
        When running paired-end, two distinct values can be given here using the syntax ZT=X:Z
        where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that even
        when BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode
        as to end up with reads of the same length (note that this can also be operated using
        XT). Note that if a single value is passed, the value always applies to both reads in
        paired-end mode without further consideration.
        ZT=null is like ZT=0.

        Default value: 0. This option can be set to 'null' to clear the default value.

    CLIP_BARCODE=Boolean
    C=Boolean

        Clip barcode sequence from read sequence, as well as XTRIMLEN (and ZTRIMLEN) bases if
        applicable, before writing to output file.
        If false, reads are written without modification to output file.
        Apply to both barcodes when BPOS=BOTH.

        Default value: true. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    ADD_BARCODE_TO_HEADER=Boolean
    ADD=Boolean

        Add matched barcode at the end of the read header. Applies to both indices when INDEX_FILE2
        is also provided.
        First the sample encoding barcodes from I1 (and I2 when relevant) are added to the read
        headers like
            @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:I1_BARCODE:I2_BARCODE
        Then, if BPOS!=NONE, the additional barcodes (UMIs) clipped from the read(s) are added
        to their own header, like
            @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:I1_BARCODE:I2_BARCODE:CLIPPED_SEQ_FROMREAD

        Default value: true. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    ENSURE_IDENTICAL_HEADER_NAMES=Boolean
    SAME_HEADERS=Boolean

        Makes sure that headers of both reads of a pair are identical, using the following read
        header pattern (for both reads of a pair):
            @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 I1_BARCODE:I2_BARCODE(:CLIPPED_SEQ_FROMREAD1:CLIPPED_SEQ_FROMREAD2)
        This option only makes sense in paired end mode and ADD=true. Some (if not all) mappers
        will indeed complain when the read headers are not identical. When molecular barcodes are
        present in reads and the RCHAR is used, you will end with (problematic) read headers like
        this:
            HISEQ:44:C6KC0ANXX:5:1101:1491:1994:1:N:0:TAGAACAC:TGGAGTAG
            HISEQ:44:C6KC0ANXX:5:1101:1491:1994:3:N:0:TAGAACAC:CGTTGTAT
        SAME_HEADERS=true will instead generate the following identical header for both reads:
            HISEQ:44:C6KC0ANXX:5:1101:1491:1994:TAGAACAC:TGGAGTAG:CGTTGTAT
        Note that we also clipped the useless '1:N:0' and '3:N:0' as they will also result in
        generating different headers.
        Important: this option will force RCHAR=: UNLESS you specify RCHAR=null ; in which
        case a space will be preserved ie:
            HISEQ:44:C6KC0ANXX:5:1101:1491:1994 TAGAACAC:TGGAGTAG:CGTTGTAT

        Default value: true. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    READ_NAME_REPLACE_CHAR=String
    RCHAR=String

        Replace spaces in read name/header using provided character. This is particularly handy
        when you need to retain ADDed barcode in read name/header during mapping (everything
        after space in read name is usually clipped in BAM files). For example, with RCHAR=':':
            '@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:'
        becomes
            '@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965:2:N:0:BARCODE'
        Default value: null.

    QUALITY_FORMAT=FastqQualityFormat
    V=FastqQualityFormat

        A value describing how the quality values are encoded in the fastq. Either 'Solexa' for
        pre-pipeline 1.3 style scores (solexa scaling + 66), 'Illumina' for pipeline 1.3 and
        above (phred scaling + 64) or 'Standard' for phred scaled scores with a character shift
        of 33. If this value is not specified (or 'null' is given), the quality format will be
        detected.

        Default value: Standard. This option can be set to 'null' to clear the default value.
        Possible values: {Solexa, Illumina, Standard}

    KEEP_UNASSIGNED_READ=Boolean
    UN=Boolean

        Should un-assigned reads be saved in files or simply ignored. File names are
        automatically created or can be given using UF1 & UF2 options.

        Default value: true. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    BARCODE_DIAG_FILE=String
    DIAG=String

        Name for a barcode match reporting file (not generated by default). Either a name (in
        which case the file will be created in the output dir) or full path. This file will
        contain a line per read pair with the barcode best matching the read subsequence or
        'null' when no match is found according to matching parameters ; and the final selected
        sample. This file is useful for debugging or further processing in case both ends are
        barcoded.
        N.B: this file will have a size of about one of the fastq input files.

        Default value: null.
]]>
    </help>

</tool>