Mercurial > repos > gbcs-embl-heidelberg > je_demultiplex
view je-demultiplex.xml @ 8:47c0d9f76232 draft
planemo upload for repository https://git.embl.de/grp-gbcs/Je/tree/master/src/galaxy commit bc2365188e0dd0f20925884276a27914bb714280
author | gbcs-embl-heidelberg |
---|---|
date | Mon, 23 Apr 2018 05:47:17 -0400 |
parents | 8f16495dc5f2 |
children |
line wrap: on
line source
<tool id="je_demultiplex" name="Je-Demultiplex" version="@VERSION_STRING@">
    <!-- Galaxy wrapper around 'je demultiplex'. Tokens (@...@) and <expand> fragments come from macros.xml. -->
    <description>demultiplexes fastq files</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <!-- Any exit code of 1 or above is reported as a fatal tool error. -->
    <stdio>
        <exit_code range="1:" level="fatal" description="Tool exception" />
    </stdio>
    <expand macro="version_command" />
    <command>
<![CDATA[
je demultiplex

## Fastq inputs
@single_or_paired_cmd@

## Extra options are only emitted when the library type is not single
#if str( $library.type ) != "single":
    @demultiplex_paired_end_cmd_options@
#end if

@barcode_option_cmd@
@barcode_len_cmd@

## Barcode clipping switch
C=$CLIP_BARCODE

@demultiplexer_common_options_cmd@
@common_options_cmd@
@demultiplexer_common_output_options_cmd@
@demultiplexer_common_outputs_cmd@
]]>
    </command>
    <configfiles>
        <expand macro="barcode_config_file"></expand>
    </configfiles>
    <inputs>
        <!-- single/paired - similar to macro 'single_or_paired_general' -->
        <expand macro="single_or_paired_general">
            <expand macro="demultiplex_paired_end_options"/>
        </expand>
        <expand macro="barcode_option"/>
        <expand macro="barcode_len_option"/>
        <expand macro="clip_barcode"/>
        <expand macro="demultiplexer_common_options"/>
        <expand macro="common_options"/>
        <expand macro="demultiplexer_common_output_options"/>
    </inputs>
    <outputs>
        <expand macro="demultiplexer_common_outputs"/>
    </outputs>
    <tests>
        <test>
            <!-- simple test on single end data -->
            <param name="type" value="single"/>
            <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
            <param name="BARCODE_FILE" value="barcodes_SE.txt" ftype="tabular"/>
            <output name="METRICS_FILE_NAME" file="summary_SE.txt" ftype="tabular" lines_diff="4"/>
            <output name="DEMULTIPLEX_RESULTS" ftype="tabular">
                <discovered_dataset designation="unassigned_1" file="unassigned_1_SE.txt" />
            </output>
        </test>
        <test>
            <!-- more complex test on paired end data with different barcode for fwd/rev -->
            <param name="type" value="paired"/>
            <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
            <param name="input_2" value="file_2_sequence.txt" ftype="fastqsanger"/>
            <param name="BPOS" value="BOTH"/>
            <param name="BM" value="BOTH"/>
            <param name="BRED" value="false"/>
            <param name="COLLECT_OUTPUTS" value="false" />
            <param name="barcode_list_type_con" value="text"/>
            <param name="barcode_text" value="sample1 CACTGT:GTATAG sample2 ATTCCG:TCCGTC sample3 GCTACC:TGGTCA sample4 CGAAAC:CACTGT"/>
            <output name="METRICS_FILE_NAME" file="summary_PE.txt" ftype="tabular" lines_diff="4"/>
            <output name="DEMULTIPLEX_RESULTS" ftype="tabular">
                <discovered_dataset designation="unassigned_1" file="unassigned_1_PE.txt" />
                <discovered_dataset designation="unassigned_2" file="unassigned_2_PE.txt" />
                <discovered_dataset designation="sample4_CGAAACCACTGT_2" file="sample4_CGAAACCACTGT_2.txt"/>
                <discovered_dataset designation="sample4_CGAAACCACTGT_1" file="sample4_CGAAACCACTGT_1.txt"/>
                <discovered_dataset designation="sample3_GCTACCTGGTCA_2" file="sample3_GCTACCTGGTCA_2.txt"/>
                <discovered_dataset designation="sample3_GCTACCTGGTCA_1" file="sample3_GCTACCTGGTCA_1.txt"/>
                <discovered_dataset designation="sample2_ATTCCGTCCGTC_2" file="sample2_ATTCCGTCCGTC_2.txt"/>
                <discovered_dataset designation="sample2_ATTCCGTCCGTC_1" file="sample2_ATTCCGTCCGTC_1.txt"/>
                <discovered_dataset designation="sample1_CACTGTGTATAG_2" file="sample1_CACTGTGTATAG_2.txt"/>
                <discovered_dataset designation="sample1_CACTGTGTATAG_1" file="sample1_CACTGTGTATAG_1.txt"/>
            </output>
        </test>
        <test>
            <!-- Repeat of previous but with collection outputs -->
            <param name="type" value="paired"/>
            <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
            <param name="input_2" value="file_2_sequence.txt" ftype="fastqsanger"/>
            <param name="BPOS" value="BOTH"/>
            <param name="BM" value="BOTH"/>
            <param name="BRED" value="false"/>
            <param name="barcode_list_type_con" value="text"/>
            <param name="barcode_text" value="sample1 CACTGT:GTATAG sample2 ATTCCG:TCCGTC sample3 GCTACC:TGGTCA sample4 CGAAAC:CACTGT"/>
            <param name="COLLECT_OUTPUTS" value="true" />
            <!-- Each collection element is named after the expected file it is compared against. -->
            <output_collection name="COLLECTION_1" type="list">
                <element name="sample4_CGAAACCACTGT_1.txt" value="sample4_CGAAACCACTGT_1.txt"/>
                <element name="sample3_GCTACCTGGTCA_1.txt" value="sample3_GCTACCTGGTCA_1.txt"/>
                <element name="sample2_ATTCCGTCCGTC_1.txt" value="sample2_ATTCCGTCCGTC_1.txt"/>
                <element name="sample1_CACTGTGTATAG_1.txt" value="sample1_CACTGTGTATAG_1.txt"/>
            </output_collection>
            <output_collection name="COLLECTION_2" type="list">
                <element name="sample4_CGAAACCACTGT_2.txt" value="sample4_CGAAACCACTGT_2.txt"/>
                <element name="sample3_GCTACCTGGTCA_2.txt" value="sample3_GCTACCTGGTCA_2.txt"/>
                <element name="sample2_ATTCCGTCCGTC_2.txt" value="sample2_ATTCCGTCCGTC_2.txt"/>
                <element name="sample1_CACTGTGTATAG_2.txt" value="sample1_CACTGTGTATAG_2.txt"/>
            </output_collection>
        </test>
    </tests>
    <help>
<![CDATA[
**What it does**

Je demultiplex: A fastq file demultiplexer with optional handling of Unique Molecular Identifiers for further use in 'markdupes' module.

Input files are fastq files, and can be in gzip compressed format.

Author: Charles Girardot (charles.girardot@embl.de).

Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).

With contributions by: Mehmet Tekman (@mtekman)

------

**Know what you are doing**

.. class:: warningmark

You will want to read the `documentation`__.

.. __: http://gbcs.embl.de/portal/Je

------

**Parameter list**

This is an exhaustive list of options:: FASTQ_FILE1=File F1=File Input fastq file (optionally gzipped) for single end data, or first read in paired end data. Required.
FASTQ_FILE2=File F2=File Input fastq file (optionally gzipped) for the second read of paired end data. Default value: null.
BARCODE_FILE=File BF=File Barcode file describing sequence list and sample names. Tab-delimited file with 2 columns, with the sample in col1 and the corresponding barcode in col2.
Simple barcode file format : 2 tab-delimited columns If multiple barcode map to the same sample, either line can be duplicated e.g.
sample1 ATAT sample1 GAGG sample2 CCAA sample2 TGTG Or barcodes can be combined using the OR operator '|' i.e. the file above can be re-written like sample1 ATAT|GAGG sample2 CCAA|TGTG Finally, for the special situation of paired-end data in which barcodes differ at both ends (ie BPOS=BOTH BRED=false BM=BOTH , see BRED option description), barcodes for read_1 and read_2 can be distinguished using a ':' separator i.e. sample1 ATAT:GAGG sample2 CCAA:TGTG The above syntax means that sample 1 is encoded with ATAT barcode at read_1 AND GAGG barcode at read_2. Note that you can still combine barcodes using | e.g. sample1 ATAT|GAGG:CCAA|TGTG would mean that sample 1 is mapped by the combination of barcode: ATAT OR GAGG at read_1 AND CCAA OR TGTG at read_2. Extended barcode file format : 3 (single-end) or 4 (paired-end) tab-delimited columns same as the simple barcode file format but the extra columns contain the file name(s) to use to name output files. A unique extra column is expected for single-end while 2 extra columns are expected for paired-end. In case lines are duplicated (multiple barcodes mapping the same sample), the same file name should be indicated in the third (and fourth) column(s). sample1 ATAT spl1_1.txt.gz spl1_2.txt.gz sample1 GAGG spl1_1.txt.gz spl1_2.txt.gz sample2 CCAA spl2_1.txt.gz spl2_2.txt.gz Or sample1 ATAT|GAGG:CCAA|TGTG spl1_1.txt.gz spl1_2.txt.gz Ns in barcode sequence are allowed and are used to flag positions that should be ignored in sample matching i.e. they will be clipped off the read sequence (like in iCLIP protocol). Required. BARCODE_READ_POS=BarcodePosition BPOS=BarcodePosition For paired-end data, where to expect the barcode(s) : READ_1 (beginning of read from FASTQ_FILE_1), READ_2 (beginning of read from FASTQ_FILE_2), BOTH (beginning of both reads). Automatically set to READ_1 in single end mode. Default value: BOTH. This option can be set to 'null' to clear the default value. 
Possible values: {READ_1, READ_2, BOTH, NONE} BCLEN=String LEN=String Length of the barcode sequences, optional. Taken from barcode file when not given. In situations where BARCODE_READ_POS == BOTH AND REDUNDANT_BARCODES=false, two distinct lengths can be provided using the syntax LEN=X:Z where X and Z are 2 integers representing the barcode length for read_1 and read_2 respectively. Default value: null. BARCODE_FOR_SAMPLE_MATCHING=BarcodePosition BM=BarcodePosition Indicates which barcode(s) should be used for sample lookup. Automatically set to READ_1 in single end mode. For paired-end data and when BARCODE_READ_POS == BOTH, which barcode should be used to resolve sample: use BM=READ_1 (beginning of read from FASTQ_FILE_1) if only this read should be used for sample matching; use BM=READ_2 (beginning of read from FASTQ_FILE_2) if only this read should be used for sample matching; use BM=BOTH (beginning of both reads) if both should be used. When BM=BOTH, the behaviour is different based on the value of REDUNDANT_BARCODES : If REDUNDANT_BARCODES=true, the two barcodes are considered to map to the same sample and 'Je demultiplex' uses the two barcodes according to the STRICT value. If REDUNDANT_BARCODES=false, the barcode file should map a couple of barcodes to each sample (e.g. sample1 => AGAGTG:TTGATA) and 'Je demultiplex' needs both barcodes to find the relevant sample. Note that this is the only situation in which all barcode matching options (MM, MMD, Q) accept different values for both barcodes in the form X:Z where X and Z are 2 integers. Default value: BOTH. This option can be set to 'null' to clear the default value. 
Possible values: {READ_1, READ_2, BOTH, NONE} REDUNDANT_BARCODES=Boolean BRED=Boolean This option only applies for paired-end data with BARCODE_READ_POS set to 'BOTH'. Indicates if both read's barcodes encode redundant information i.e. if barcodes are supposed to be identical at both ends (or to resolve to the same sample when a pool of barcodes is used per sample). When REDUNDANT_BARCODES=false, the 2 barcodes potentially encode different information. For example, only one of the barcodes encodes the sample identity while the second barcode might be a random barcode (UMI) to tell apart PCR artefacts from real duplicates. Another example is when both barcodes should be used in a combined fashion to resolve the sample. In the first example, you should use BPOS=BOTH BRED=false BM=READ_1. In the second example, you should have BPOS=BOTH BRED=false BM=BOTH. Note that with BPOS=BOTH BRED=true BM=BOTH, the behavior would be different as 'demultiplex' would then check the STRICT option to perform sample resolution. Importantly, when BARCODE_READ_POS (BPOS) == BOTH AND REDUNDANT_BARCODES=false, BCLEN, barcode matching options (MM, MMD, Q) and read trimming/clipping options (XT, ZT) accept different values for both barcodes in the form X:Z where X and Z are 2 integers. Default value: true. This option can be set to 'null' to clear the default value. Possible values: {true, false} STRICT=Boolean S=Boolean For paired-end data and when two distinct barcodes/indices are used to encode samples, this option tells if both barcodes should resolve to the same sample. When true and if only one of the two reads has a barcode match, the read pair is 'unassigned'. When false and if only one of the two reads has a barcode match, the read pair is assigned to the corresponding sample. When reads resolve to different samples, the read pair is always 'unassigned'. Default value: false. This option can be set to 'null' to clear the default value. 
Possible values: {true, false} MAX_MISMATCHES=String MM=String Maximum mismatches for a barcode to be considered a match. In situations where both barcodes are used for sample matching i.e. BPOS=BOTH BM=BOTH (or 2 INDEX_FILE given), two distinct values can be given here using the syntax MM=X:Z where X and Z are 2 integers to use for read_1 and read_2 respectively. MM=null is like MM=0. Default value: 1. This option can be set to 'null' to clear the default value. MIN_MISMATCH_DELTA=String MMD=String Minimum difference between the number of mismatches against the best and the second best barcode. When MMD is not respected, the read remains unassigned. When two distinct barcodes are used for sample matching (dual encoding), two distinct values can be given using the syntax MMD=X:Z where X and Z are 2 integers to use for first (e.g. from read_1 or index_1) and second barcode (e.g. from read_2 or index_2) respectively. MMD=null is like MMD=0. Default value: 1. This option can be set to 'null' to clear the default value. MIN_BASE_QUALITY=String Q=String Minimum base quality during barcode matching: bases whose quality is less than this cutoff are always considered as a mismatch. When two distinct barcodes are used for sample matching (dual encoding), two distinct values can be given using the syntax Q=X:Z where X and Z are 2 integers to use for first (e.g. from read_1 or index_1) and second barcode (e.g. from read_2 or index_2) respectively. Q=null is like Q=0. Default value: 10. This option can be set to 'null' to clear the default value. XTRIMLEN=String XT=String Optional extra number of bases to be trimmed right after the barcode (only used if CLIP_BARCODE=true). When running paired-end, two distinct values can be given using the syntax XT=X:Z where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that even when BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode as to end up with reads of the same length (note that this can also be operated using ZT). 
If a unique value is given, e.g. XT=1, while running paired-end the following rule applies: (1) BPOS=READ_1 or BPOS=READ_2, no trim is applied at the read w/o barcode (2) BPOS=BOTH, the value is used for both reads. Note that XT=null is like XT=0. Default value: 0. This option can be set to 'null' to clear the default value. ZTRIMLEN=String ZT=String Optional extra number of bases to be trimmed from the read end i.e. 3' end. When running paired-end, two distinct values can be given here using the syntax ZT=X:Z where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that even when BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode as to end up with reads of the same length (note that this can also be operated using XT). Note that if a single value is passed, the value always applies to both reads in paired-end mode without further consideration. ZT=null is like ZT=0. Default value: 0. This option can be set to 'null' to clear the default value. CLIP_BARCODE=Boolean C=Boolean Clip barcode sequence from read sequence, as well as XTRIMLEN (and ZTRIMLEN) bases if applicable, before writing to output file. If false, reads are written without modification to output file. Applies to both barcodes when BPOS=BOTH. Default value: true. This option can be set to 'null' to clear the default value. Possible values: {true, false} ADD_BARCODE_TO_HEADER=Boolean ADD=Boolean Add barcode at the end of the read header. Applies to both barcodes when BPOS=BOTH. If true, the string ':barcode' is added at the end of the read header with a ':' added only if current read header does not end with ':'. If both reads of the pair have a barcode (i.e. BARCODE_READ_POS == BOTH), then the second read also has its own matched barcode written. Else, the read without a barcode receives the barcode from the barcoded read. 
For example: @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0: becomes: @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:BARCODE When barcodes contain random positions, i.e. 'N' (for example like in the iCLIP protocol), or are UMIs, the added sequence is the sequence clipped from the read and NOT the matched barcode. Default value: true. This option can be set to 'null' to clear the default value. Possible values: {true, false} ENSURE_IDENTICAL_HEADER_NAMES=Boolean SAME_HEADERS=Boolean Makes sure that headers of both reads of a pair are identical, using the following read header pattern (for both reads of a pair): @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 SAMPLEBARCODE_READ1:SAMPLEBARCODE_READ2(:CLIPPED_SEQ_FROMREAD1:CLIPPED_SEQ_FROMREAD2) This option only makes sense in paired end mode and ADD=true. Some (if not all) mappers will indeed complain when the read headers are not identical. When molecular barcodes are present in reads (either as additional barcodes or as degenerate barcodes ie with 'N') and the RCHAR is used, you will end up with (problematic) read headers like this: HISEQ:44:C6KC0ANXX:5:1101:1491:1994:1:N:0:TAGAACAC:TGGAGTAG HISEQ:44:C6KC0ANXX:5:1101:1491:1994:3:N:0:TAGAACAC:CGTTGTAT SAME_HEADERS=true will instead generate the following identical header for both reads: HISEQ:44:C6KC0ANXX:5:1101:1491:1994:TAGAACAC:TGGAGTAG:CGTTGTAT Note that we also clipped the useless '1:N:0' and '3:N:0' as they will also result in generating different headers. Important: this option will force RCHAR=: UNLESS you specify RCHAR=null ; in which case a space will be preserved ie: HISEQ:44:C6KC0ANXX:5:1101:1491:1994 TAGAACAC:TGGAGTAG:CGTTGTAT Default value: true. This option can be set to 'null' to clear the default value. Possible values: {true, false} READ_NAME_REPLACE_CHAR=String RCHAR=String Replace spaces in read name/header using provided character. 
This is particularly handy when you need to retain ADDed barcode in read name/header during mapping (everything after space in read name is usually clipped in BAM files). For example, with RCHAR=':': @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0: becomes @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965:2:N:0:BARCODE Default value: null. QUALITY_FORMAT=FastqQualityFormat V=FastqQualityFormat A value describing how the quality values are encoded in the fastq. Either 'Solexa' for pre-pipeline 1.3 style scores (solexa scaling + 66), 'Illumina' for pipeline 1.3 and above (phred scaling + 64) or 'Standard' for phred scaled scores with a character shift of 33. If this value is not specified (or 'null' is given), the quality format will be detected. Default value: Standard. This option can be set to 'null' to clear the default value. Possible values: {Solexa, Illumina, Standard} KEEP_UNASSIGNED_READ=Boolean UN=Boolean Should un-assigned reads be saved in files or simply ignored. File names are automatically created or can be given using UF1 & UF2 options. Default value: true. This option can be set to 'null' to clear the default value. Possible values: {true, false} BARCODE_DIAG_FILE=String DIAG=String Name for a barcode match reporting file (not generated by default). Either a name (in which case the file will be created in the output dir) or full path. This file will contain a line per read pair with the barcode best matching the read subsequence or 'null' when no match is found according to matching parameters ; and the final selected sample. This file is useful for debugging or further processing in case both ends are barcoded. N.B: this file will have a size of about one of the fastq input files. Default value: null. ]]> </help> <expand macro="citations"/> </tool>