Mercurial > repos > gbcs-embl-heidelberg > je_demultiplex_illu
view je-demultiplex-illu.xml @ 5:69c77f9fc064 draft
planemo upload for repository https://git.embl.de/grp-gbcs/Je/tree/master/src/galaxy commit 0eefd837333dae6fbecaf4f55b053268d844eff6
author | gbcs-embl-heidelberg |
---|---|
date | Wed, 02 Aug 2017 10:59:09 -0400 |
parents | 01fdc6d10660 |
children | 370d9764f670 |
line wrap: on
line source
<tool id="je_demultiplex_illu" name="Je-Demultiplex-Illu" version="@VERSION_STRING@">
    <!--
        Galaxy wrapper for 'je demultiplex-illu': demultiplexes fastq files using
        Illumina index files, with optional handling of UMIs found in the reads.
        The expand macros and @...@ placeholders used throughout are not defined
        in this file; they presumably come from the imported macros.xml.
    -->
    <description>demultiplexes fastq files using Illumina Index file</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <!-- Any exit code of 1 or above is reported as a fatal tool error. -->
    <stdio>
        <exit_code range="1:" level="fatal" description="Tool exception" />
    </stdio>
    <expand macro="version_command" />
    <!--
        Cheetah command template. Keep one statement per line: '##' comments and
        '#if' / '#else' / '#end if' directives extend to the end of the line, so
        collapsing this template onto a single line would comment out or break
        every argument that follows.
    -->
    <command>
    <![CDATA[
        je demultiplex-illu

        ## Fastq inputs
        @single_or_paired_illu_cmd@

        @barcode_option_cmd@

        ## UMI handling: when the user declares UMIs in the reads, pass their
        ## position, the clipping flag and (only if provided) their length;
        ## otherwise explicitly disable read barcodes and clipping.
        #if str($INTERNAL_BARCODES_CON.INTERNAL_BARCODES) == 'true':
            BPOS=${INTERNAL_BARCODES_CON.BPOS}
            C=${INTERNAL_BARCODES_CON.CLIP_BARCODE}
            #if str( $INTERNAL_BARCODES_CON.LEN ) != "":
                BCLEN=$INTERNAL_BARCODES_CON.LEN
            #end if
        #else:
            BPOS=NONE
            C=false
        #end if

        @common_options_cmd@

        @demultiplexer_common_output_options_cmd@

        @demultiplexer_common_outputs_cmd@
    ]]>
    </command>
    <configfiles>
        <expand macro="barcode_config_file"/>
    </configfiles>
    <inputs>
        <!-- single/paired - similar to macro 'single_or_paired_general' -->
        <expand macro="single_or_paired_illu">
            <expand macro="demultiplex_illu_paired_end_options"/>
        </expand>
        <expand macro="barcode_option"/>
        <!-- UMI options are only shown (and only passed to je) when the user answers Yes. -->
        <conditional name="INTERNAL_BARCODES_CON">
            <param name="INTERNAL_BARCODES" type="select" label="Do your reads contain Unique Molecular Identifiers(UMIs)">
                <option value="true">Yes</option>
                <option value="false" selected="true">No</option>
            </param>
            <when value="true">
                <param name="BPOS" type="select" label="Barcode read position (BPOS)"
                       help="where are the barcodes. If not using paired-end it does not matter what you specify here.">
                    <option value="READ_1" selected="true">READ_1 (beginning of read from the first fastq file)</option>
                    <option value="READ_2">READ_2 (beginning of read from the second fastq file)</option>
                    <option value="BOTH">BOTH (beginning of both reads)</option>
                </param>
                <expand macro="barcode_len_option"/>
                <expand macro="clip_barcode"/>
            </when>
            <when value="false"/>
        </conditional>
        <expand macro="demultiplexer_common_options"/>
        <expand macro="common_options"/>
        <expand macro="demultiplexer_common_output_options"/>
    </inputs>
    <outputs>
        <expand macro="demultiplexer_common_outputs"/>
    </outputs>
    <tests>
        <test>
            <!-- barcode at both ends, non-redundant -->
            <param name="type" value="paired"/>
            <param name="input_1" value="illu_file_1_sequence.txt" ftype="fastqsanger"/>
            <param name="input_2" value="illu_file_2_sequence.txt" ftype="fastqsanger"/>
            <param name="I1" value="illu_file_1_index.txt" ftype="fastqsanger"/>
            <param name="I2_AVAILABLE" value="true"/>
            <param name="I2" value="illu_file_2_index.txt" ftype="fastqsanger"/>
            <param name="INTERNAL_BARCODES" value="true"/>
            <param name="barcode_list_type_con" value="file"/>
            <param name="BARCODE_FILE" value="illu_dualindexing.txt" ftype="tabular"/>
            <param name="LEN" value="8:8"/>
            <param name="ZT" value="5:6"/>
            <param name="BPOS" value="BOTH"/>
            <param name="BM" value="BOTH"/>
            <param name="BRED" value="false"/>
            <param name="MM" value="3"/>
            <param name="MMD" value="2"/>
            <param name="Q" value="20"/>
            <param name="DIAG" value="false"/>
            <!-- The metrics file is compared allowing 4 differing lines; the per-sample
                 fastq files are checked as datasets discovered alongside it. -->
            <output name="METRICS_FILE_NAME" file="illu_summary_PE.txt" ftype="tabular" lines_diff="4">
                <discovered_dataset designation="unassigned_1" file="illu_unassigned_1_PE.txt" />
                <discovered_dataset designation="unassigned_2" file="illu_unassigned_2_PE.txt" />
                <discovered_dataset designation="emb681m5_GGACTCCTCTCTCTAT_2" file="emb681m5_GGACTCCTCTCTCTAT_2.txt"/>
                <discovered_dataset designation="emb681m5_GGACTCCTCTCTCTAT_1" file="emb681m5_GGACTCCTCTCTCTAT_1.txt"/>
                <discovered_dataset designation="emb681m4_TCCTGAGCCTCTCTAT_2" file="emb681m4_TCCTGAGCCTCTCTAT_2.txt"/>
                <discovered_dataset designation="emb681m4_TCCTGAGCCTCTCTAT_1" file="emb681m4_TCCTGAGCCTCTCTAT_1.txt"/>
                <discovered_dataset designation="emb681m1_TAAGGCGACTCTCTAT_2" file="emb681m1_TAAGGCGACTCTCTAT_2.txt"/>
                <discovered_dataset designation="emb681m1_TAAGGCGACTCTCTAT_1" file="emb681m1_TAAGGCGACTCTCTAT_1.txt"/>
                <discovered_dataset designation="emb6801m2_AGGCAGAATAGATCGC_2" file="emb6801m2_AGGCAGAATAGATCGC_2.txt"/>
                <discovered_dataset designation="emb6801m2_AGGCAGAATAGATCGC_1" file="emb6801m2_AGGCAGAATAGATCGC_1.txt"/>
                <discovered_dataset designation="emb6801m1_CGTACTAGTAGATCGC_2" file="emb6801m1_CGTACTAGTAGATCGC_2.txt"/>
                <discovered_dataset designation="emb6801m1_CGTACTAGTAGATCGC_1" file="emb6801m1_CGTACTAGTAGATCGC_1.txt"/>
            </output>
        </test>
    </tests>
    <!--
        reStructuredText help. Whitespace is significant here: transitions,
        directives and the '::' literal block each need their own lines and
        blank-line separation, and the literal block must stay indented.
    -->
    <help>
    <![CDATA[
**What it does**

Je demultiplex-illu: demultiplex fastq files using Illumina Index files, with optional handling of Unique Molecular Identifiers for further use in 'markdupes' module

Input files are fastq files, and can be in gzip compressed format.

Author: Charles Girardot (charles.girardot@embl.de).

Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).

------

**Know what you are doing**

.. class:: warningmark

You will want to read the `documentation`__.

.. __: http://gbcs.embl.de/portal/Je

------

**Parameter list**

This is an exhaustive list of options::

    FASTQ_FILE1=File
    F1=File
        Input fastq file (optionally gzipped) for single end data, or first read in paired end
        data.  Required.

    FASTQ_FILE2=File
    F2=File
        Input fastq file (optionally gzipped) for the second read of paired end data.
        Default value: null.

    INDEX_FILE1=File
    I1=File
        Fastq file for index 1 (barcode) reads, optionally gzipped  Required.

    INDEX_FILE2=File
    I2=File
        Fastq file for index 2 (barcode) reads, optionally gzipped.
        A INDEX_FILE1 MUST be provided when INDEX_FILE2 is given.
        This situation corresponds to Illumina dual indexing.
        Default value: null.

    BARCODE_FILE=File
    BF=File
        Barcode file describing sequence list and sample names. Tab-delimited file with 2
        columns, with the sample in col1 and the corresponding barcode in col2.

        Simple barcode file format : 2 tab-delimited columns
        If multiple barcode map to the same sample, either line can be duplicated e.g.
            sample1    ATAT
            sample1    GAGG
            sample2    CCAA
            sample2    TGTG
        Or barcodes can be combined using the OR operator '|' i.e. the file above can be
        re-written like
            sample1    ATAT|GAGG
            sample2    CCAA|TGTG
        Finally, for the special situation of paired-end data in which barcodes differ at
        both ends (ie BPOS=BOTH BRED=false BM=BOTH , see BRED option description), barcodes
        for read_1 and read_2 can be distinguished using a ':' separator i.e.
            sample1    ATAT:GAGG
            sample2    CCAA:TGTG
        This above syntax means that sample 1 is encoded with ATAT barcode at read_1 AND
        GAGG barcode at read_2. Note that you can still combine barcodes using | e.g.
            sample1    ATAT|GAGG:CCAA|TGTG
        would mean that sample 1 is mapped by the combination of barcode: ATAT OR GAGG at
        read_1 AND CCAA OR TGTG at read_2.

        Extended barcode file format : 3 (single-end) or 4 (paired-end) tab-delimited columns
        same as the simple barcode file format but the extra columns contains the file
        name(s) to use to name output files. A unique extra column is expected for
        single-end while 2 extra columns are expected for paired-end. In case, lines are
        duplicated (multiple barcodes mapping the same sample), the same file name should be
        indicated in the third (and fourth) column(s).
            sample1    ATAT    spl1_1.txt.gz    spl1_2.txt.gz
            sample1    GAGG    spl1_1.txt.gz    spl1_2.txt.gz
            sample2    CCAA    spl2_1.txt.gz    spl2_2.txt.gz
        Or
            sample1    ATAT|GAGG:CCAA|TGTG    spl1_1.txt.gz    spl1_2.txt.gz
        Ns in barcode sequence are allowed and are used to flag positions that should be
        ignored in sample matching i.e. they will be clipped off the read sequence (like in
        iCLIP protocol).  Required.

    BARCODE_READ_POS=BarcodePosition
    BPOS=BarcodePosition
        Indicates the location of additional barcodes present in the read(s). Setting this
        option implies setting the LEN option.
        Importantly, these additional barcodes must not encode sample identity information
        but used for e.g. molecular barcoding (UMIs) or for any purpose other than sample
        identity encoding.
        Default value: BOTH. This option can be set to 'null' to clear the default value.
        Possible values: {READ_1, READ_2, BOTH, NONE}

    BCLEN=String
    LEN=String
        Length of the barcode sequences, optional. Taken from barcode file when not given.
        In situations where BARCODE_READ_POS == BOTH AND REDUNDANT_BARCODES=false, two
        distinct length can be provided using the syntax LEN=X:Z where X and Z are 2
        integers representing the barcode length for read_1 and read_2 respectively.
        Default value: null

    REDUNDANT_BARCODES=Boolean
    BRED=Boolean
        This option only applies for paired-end data with *both* INDEX_FILE1 and INDEX_FILE2
        provided.
        Indicates if both index barcodes encode redundant information i.e. if both barcodes
        are supposed to be identical (or resolve to the same sample when a pool of barcodes
        is used per sample).
        When BRED=true, the STRICT option guides the sample lookup behavior
        When BRED=false, barcodes are combined prior to sample lookup.
        Default value: true. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    STRICT=Boolean
    S=Boolean
        For paired-end data and when two distinct barcodes/indices are used to encode
        samples, this option tells if both barcodes should resolve to the same sample.
        When true and if only one of the two reads has a barcode match, the read pair is
        'unassigned'.
        When false and if only one of the two reads has a barcode match, the read pair is
        assigned to the corresponding sample
        When reads resolve to different samples, the read pair is always 'unassigned'.
        Default value: false. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    MAX_MISMATCHES=String
    MM=String
        Maximum mismatches for a barcode to be considered a match. In situations where both
        barcodes are used for sample matching i.e. BPOS=BOTH BM=BOTH (or 2 INDEX_FILE given),
        two distinct values can be given here using the syntax MM=X:Z where X and Z are 2
        integers to use for read_1 and read_2 respectively.
        MM=null is like MM=0
        Default value: 1. This option can be set to 'null' to clear the default value.

    MIN_MISMATCH_DELTA=String
    MMD=String
        Minimum difference between the number of mismatches against the best and the second
        best barcode. When MMD is not respected, the read remains unassigned.
        When two distinct barcodes are used for sample matching (dual encoding), two
        distinct values can be given using the syntax MMD=X:Z where X and Z are 2 integers
        to use for first (e.g. from read_1 or index_1) and second barcode (e.g. from read_2
        or index_2) respectively.
        MMD=null is like MMD=0
        Default value: 1. This option can be set to 'null' to clear the default value.

    MIN_BASE_QUALITY=String
    Q=String
        Minimum base quality during barcode matching: bases which quality is less than this
        cutoff are always considered as a mismatch. When two distinct barcodes are used for
        sample matching (dual encoding), two distinct values can be given using the syntax
        Q=X:Z where X and Z are 2 integers to use for first (e.g. from read_1 or index_1)
        and second barcode (e.g. from read_2 or index_2) respectively.
        Q=null is like Q=0.
        Default value: 10. This option can be set to 'null' to clear the default value.

    XTRIMLEN=String
    XT=String
        Optional extra number of base to be trimmed right after the barcode (only used if
        CLIP_BARCODE=true). When running paired-end, two distinct values can be given using
        the syntax XT=X:Z where X and Z are 2 integers to use for read_1 and read_2
        respectively. Note that even when BPOS=READ_1 or BPOS=READ_2, a X:Y syntax can be
        given to trim the read w/o barcode as to end up with reads of the same length (note
        that this can also be operated using ZT). If a unique value is given, e.g. XT=1,
        while running paired-end the following rule applies:
            (1) BPOS=READ_1 or BPOS=READ_2, no trim is applied at the read w/o barcode
            (2) BPOS=BOTH, the value is used for both reads.
        Note that XT=null is like XT=0.
        Default value: 0. This option can be set to 'null' to clear the default value.

    ZTRIMLEN=String
    ZT=String
        Optional extra number of bases to be trimmed from the read end i.e. 3' end. When
        running paired-end, two distinct values can be given here using the syntax ZT=X:Z
        where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that
        even when BPOS=READ_1 or BPOS=READ_2, a X:Y syntax can be given to trim the read w/o
        barcode as to end up with reads of the same length (note that this can also be
        operated using XT). Note that if a single value is passed, the value always applies
        to both reads in paired-end mode without further consideration.
        ZT=null is like ZT=0.
        Default value: 0. This option can be set to 'null' to clear the default value.

    CLIP_BARCODE=Boolean
    C=Boolean
        Clip barcode sequence from read sequence, as well as XTRIMLEN (and ZTRIMLEN) bases
        if applicable, before writing to output file. If false, reads are written without
        modification to output file. Apply to both barcodes when BPOS=BOTH.
        Default value: true. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    ADD_BARCODE_TO_HEADER=Boolean
    ADD=Boolean
        Add matched barcode at the end of the read header. Applies to both index when
        INDEX_FILE2 is also provided.
        First the sample encoding barcodes from I1 (and I2 when relevant) are added to the
        read headers like
            @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:I1_BARCODE:I2_BARCODE
        Then, if BPOS!=NONE, the additional barcodes (UMIs) clipped from the read(s) are
        added to their own header, like
            @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:I1_BARCODE:I2_BARCODE:CLIPPED_SEQ_FROMREAD
        Default value: true. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    ENSURE_IDENTICAL_HEADER_NAMES=Boolean
    SAME_HEADERS=Boolean
        Makes sure that headers of both reads of a pair are identical, using the following
        read header pattern (for both reads of a pair):
            @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 I1_BARCODE:I2_BARCODE(:CLIPPED_SEQ_FROMREAD1:CLIPPED_SEQ_FROMREAD2)
        This option only makes sense in paired end mode and ADD=true. Some (if not all)
        mappers will indeed complain when the read headers are not identical. When molecular
        barcodes are present in reads and the RCHAR is used, you will end with (problematic)
        read headers like this:
            HISEQ:44:C6KC0ANXX:5:1101:1491:1994:1:N:0:TAGAACAC:TGGAGTAG
            HISEQ:44:C6KC0ANXX:5:1101:1491:1994:3:N:0:TAGAACAC:CGTTGTAT
        SAME_HEADERS=true will instead generate the following identical header for both reads:
            HISEQ:44:C6KC0ANXX:5:1101:1491:1994:TAGAACAC:TGGAGTAG:CGTTGTAT
        Note that we also clipped the useless '1:N:0' and '3:N:0' as they will also result in
        generating different headers
        Important: this option will force RCHAR=: UNLESS you specify RCHAR=null ; in which
        case a space will be preserved ie:
            HISEQ:44:C6KC0ANXX:5:1101:1491:1994 TAGAACAC:TGGAGTAG:CGTTGTAT
        Default value: true. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    READ_NAME_REPLACE_CHAR=String
    RCHAR=String
        Replace spaces in read name/header using provided character. This is particularly
        handy when you need to retain ADDed barcode in read name/header during mapping
        (everything after space in read name is usually clipped in BAM files). For example,
        with RCHAR=':':
            '@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:'
        becomes
            '@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965:2:N:0:BARCODE'
        Default value: null.

    QUALITY_FORMAT=FastqQualityFormat
    V=FastqQualityFormat
        A value describing how the quality values are encoded in the fastq. Either 'Solexa'
        for pre-pipeline 1.3 style scores (solexa scaling + 66), 'Illumina' for pipeline 1.3
        and above (phred scaling + 64) or 'Standard' for phred scaled scores with a character
        shift of 33. If this value is not specified (or 'null' is given), the quality format
        will be detected.
        Default value: Standard. This option can be set to 'null' to clear the default value.
        Possible values: {Solexa, Illumina, Standard}

    KEEP_UNASSIGNED_READ=Boolean
    UN=Boolean
        Should un-assigned reads be saved in files or simply ignored. File names are
        automatically created or can be given using UF1 & UF2 options.
        Default value: true. This option can be set to 'null' to clear the default value.
        Possible values: {true, false}

    BARCODE_DIAG_FILE=String
    DIAG=String
        Name for a barcode match reporting file (not generated by default). Either a name (in
        which case the file will be created in the output dir) or full path. This file will
        contain a line per read pair with the barcode best matching the read subsequence or
        'null' when no match is found according to matching parameters ; and the final
        selected sample. This file is useful for debugging or further processing in case both
        ends are barcoded.
        N.B: this file will have a size of about one of the fastq input files.
        Default value: null.

    ]]>
    </help>
    <expand macro="citations"/>
</tool>