Mercurial > repos > gbcs-embl-heidelberg > je_demultiplex_illu

diff je-demultiplex-illu.xml @ 0:111ba1180318 draft
Initial upload
author: gbcs-embl-heidelberg
date: Wed, 25 Nov 2015 12:36:37 -0500
children: 01fdc6d10660
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/je-demultiplex-illu.xml	Wed Nov 25 12:36:37 2015 -0500
@@ -0,0 +1,428 @@
+<tool id="je_demultiplex_illu" name="Je-Demultiplex-Illu" version="1.0">
+    <description>demultiplexes fastq files using Illumina Index file</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <stdio>
+        <exit_code range="1:" level="fatal" description="Tool exception" />
+    </stdio>
+    <version_command>echo '1.0'</version_command>
+    <command interpreter="bash">
+<![CDATA[
+    je demultiplex-illu
+
+    ## Fastq inputs
+    @single_or_paired_illu_cmd@
+
+    @barcode_option_cmd@
+
+    #if str($INTERNAL_BARCODES_CON.INTERNAL_BARCODES) == 'true':
+        BPOS=${INTERNAL_BARCODES_CON.BPOS}
+        C=${INTERNAL_BARCODES_CON.CLIP_BARCODE}
+        #if str( $INTERNAL_BARCODES_CON.LEN ) != "":
+            BCLEN=$INTERNAL_BARCODES_CON.LEN
+        #end if
+    #else:
+        BPOS=NONE
+        C=false
+    #end if
+
+    @common_options_cmd@
+
+    @demultiplexer_common_output_options_cmd@
+    @demultiplexer_common_outputs_cmd@
+]]>
+    </command>
+    <configfiles>
+        <expand macro="barcode_config_file"/>
+    </configfiles>
+    <inputs>
+        <!-- single/paired - similar to macro 'single_or_paired_general' -->
+        <expand macro="single_or_paired_illu">
+            <expand macro="demultiplex_illu_paired_end_options"/>
+        </expand>
+        <expand macro="barcode_option"/>
+        <conditional name="INTERNAL_BARCODES_CON">
+            <param name="INTERNAL_BARCODES" type="select"
+                label="Do your reads contain Unique Molecular Identifiers(UMIs)">
+                <option value="true">Yes</option>
+                <option value="false" selected="true">No</option>
+            </param>
+            <when value="true">
+                <param name="BPOS" type="select" label="Barcode read position (BPOS)" help="where are the barcodes.
+                    If not using paired-end it does not matter what you specify here.">
+                    <option value="READ_1" selected="true">READ_1 (beginning of read from the first fastq file)</option>
+                    <option value="READ_2">READ_2 (beginning of read from the second fastq file)</option>
+                    <option value="BOTH">BOTH (beginning of both reads)</option>
+                </param>
+                <expand macro="barcode_len_option"/>
+                <expand macro="clip_barcode"/>
+            </when>
+            <when value="false"/>
+        </conditional>
+
+        <expand macro="demultiplexer_common_options"/>
+
+        <expand macro="common_options"/>
+
+        <expand macro="demultiplexer_common_output_options"/>
+
+    </inputs>
+    <outputs>
+        <expand macro="demultiplexer_common_outputs"/>
+    </outputs>
+
+    <tests>
+        <test>
+            <!-- barcode at both ends, non-redundant -->
+            <param name="type" value="paired"/>
+            <param name="input_1" value="illu_file_1_sequence.txt" ftype="fastqsanger"/>
+            <param name="input_2" value="illu_file_2_sequence.txt" ftype="fastqsanger"/>
+            <param name="I1" value="illu_file_1_index.txt" ftype="fastqsanger"/>
+            <param name="I2_AVAILABLE" value="true"/>
+            <param name="I2" value="illu_file_2_index.txt" ftype="fastqsanger"/>
+
+            <param name="INTERNAL_BARCODES" value="true"/>
+            <param name="barcode_list_type_con" value="file"/>
+            <param name="BARCODE_FILE" value="illu_dualindexing.txt" ftype="tabular"/>
+            <param name="LEN" value="8:8"/>
+            <param name="ZT" value="5:6"/>
+            <param name="BPOS" value="BOTH"/>
+            <param name="BM" value="BOTH"/>
+            <param name="BRED" value="false"/>
+            <param name="MM" value="3"/>
+            <param name="MMD" value="2"/>
+            <param name="Q" value="20"/>
+	    <param name="DIAG" value="false"/>
+            <output name="METRICS_FILE_NAME" file="illu_summary_PE.txt" ftype="tabular" lines_diff="4">
+                <discovered_dataset designation="unassigned_1" file="illu_unassigned_1_PE.txt" />
+                <discovered_dataset designation="unassigned_2" file="illu_unassigned_2_PE.txt" />
+                <discovered_dataset designation="emb681m5_GGACTCCTCTCTCTAT_2" file="emb681m5_GGACTCCTCTCTCTAT_2.txt"/>
+                <discovered_dataset designation="emb681m5_GGACTCCTCTCTCTAT_1" file="emb681m5_GGACTCCTCTCTCTAT_1.txt"/>
+                <discovered_dataset designation="emb681m4_TCCTGAGCCTCTCTAT_2" file="emb681m4_TCCTGAGCCTCTCTAT_2.txt"/>
+                <discovered_dataset designation="emb681m4_TCCTGAGCCTCTCTAT_1" file="emb681m4_TCCTGAGCCTCTCTAT_1.txt"/>
+                <discovered_dataset designation="emb681m1_TAAGGCGACTCTCTAT_2" file="emb681m1_TAAGGCGACTCTCTAT_2.txt"/>
+                <discovered_dataset designation="emb681m1_TAAGGCGACTCTCTAT_1" file="emb681m1_TAAGGCGACTCTCTAT_1.txt"/>
+                <discovered_dataset designation="emb6801m2_AGGCAGAATAGATCGC_2" file="emb6801m2_AGGCAGAATAGATCGC_2.txt"/>
+                <discovered_dataset designation="emb6801m2_AGGCAGAATAGATCGC_1" file="emb6801m2_AGGCAGAATAGATCGC_1.txt"/>
+                <discovered_dataset designation="emb6801m1_CGTACTAGTAGATCGC_2" file="emb6801m1_CGTACTAGTAGATCGC_2.txt"/>
+                <discovered_dataset designation="emb6801m1_CGTACTAGTAGATCGC_1" file="emb6801m1_CGTACTAGTAGATCGC_1.txt"/>
+            </output>
+        </test>
+    </tests>
+
+
+  <help>
+<![CDATA[
+**What it does**
+
+Je demultiplex-illu: demultiplex fastq files using Illumina Index files,
+with optional handling of Unique Molecular Identifiers for further use in 'markdupes' module
+Input files are fastq files, and can be in gzip compressed format.
+
+Author: Charles Girardot  (charles.girardot@embl.de).
+
+Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).
+
+------
+
+**Know what you are doing**
+
+.. class:: warningmark
+
+  You will want to read the `documentation`__.
+
+  .. __: http://gbcs.embl.de/portal/Je
+
+------
+
+**Parameter list**
+
+This is an exhaustive list of options::
+
+  FASTQ_FILE1=File
+  F1=File
+
+    Input fastq file (optionally gzipped) for single end data, or first read in paired end
+    data.
+
+    Required.
+
+  FASTQ_FILE2=File
+  F2=File
+
+    Input fastq file (optionally gzipped) for the second read of paired end data.
+
+    Default value: null.
+
+  INDEX_FILE1=File
+  I1=File
+
+    Fastq file for index 1 (barcode) reads, optionally gzipped
+
+    Required.
+
+  INDEX_FILE2=File
+  I2=File
+
+    Fastq file for index 2 (barcode) reads, optionally gzipped.
+    A INDEX_FILE1 MUST be provided when INDEX_FILE2 is given. This situation corresponds to
+    Illumina dual indexing.
+
+    Default value: null.
+
+  BARCODE_FILE=File
+  BF=File
+
+    Barcode file describing sequence list and sample names. Tab-delimited file with 2
+    columns, with the sample in col1 and the corresponding barcode in col2.
+    Simple barcode file format : 2 tab-delimited colums
+    If multiple barcode map to the same sample, either line can be duplicated e.g.
+      sample1  ATAT
+      sample1  GAGG
+      sample2  CCAA
+      sample2  TGTG
+    Or barcodes can be combined using the OR operator '|' i.e. the file above can be
+    re-written like
+      sample1  ATAT|GAGG
+      sample2  CCAA|TGTG
+    Finally, for the special situation of paired-end data in which barcodes differ at both
+    ends (ie BPOS=BOTH BRED=false BM=BOTH , see BRED option description), barcodes for read_1
+    and read_2 can be distinguished using a ':' separator i.e.
+      sample1  ATAT:GAGG
+      sample2  CCAA:TGTG
+    This above syntax means that sample 1 is encoded with ATAT barcode at read_1 AND GAGG
+    barcode at read_2. Note that you can still combine barcodes using | e.g.
+    sample1  ATAT|GAGG:CCAA|TGTG
+    would mean that sample 1 is mapped by the combination of barcode: ATAT OR GAGG at read_1
+    AND CCAA OR TGTG at read_2.
+    Extended barcode file format : 3 (single-end) or 4 (paired-end) tab-delimited colums
+    same as the simple barcode file format but the extra columns contains the file name(s)
+    to use to name output files. A unique extra column is expected for single-end while 2
+    extra columns are expected for paired-end. In case, lines are duplicated (multiple
+    barcodesmapping the same sample), the same file name should be indicated in the third
+    (and fourth) column(s).
+      sample1  ATAT  spl1_1.txt.gz  spl1_2.txt.gz
+      sample1  GAGG  spl1_1.txt.gz  spl1_2.txt.gz
+      sample2  CCAA  spl2_1.txt.gz  spl2_2.txt.gz
+    Or
+      sample1  ATAT|GAGG:CCAA|TGTG  spl1_1.txt.gz  spl1_2.txt.gz
+    Ns in barcode sequence are allowed and are used to flag positions that should be ignored
+    in sample matching
+    i.e. they will be clipped off the read sequence (like in iCLIP protocol).
+
+    Required.
+
+  BARCODE_READ_POS=BarcodePosition
+  BPOS=BarcodePosition
+
+    Indicates the location of additional barcodes present in the read(s). Setting this option
+    implies setting the LEN option.
+    Importantly, these additional barcodes must not encode sample identity information but
+    used for e.g. molecular barcoding (UMIs) or for any purpose other than sample identity encoding.
+
+    Default value: BOTH. This option can be set to 'null' to clear the default value.
+    Possible values: {READ_1, READ_2, BOTH, NONE}
+
+  BCLEN=String
+  LEN=String
+
+    Length of the barcode sequences, optional. Taken from barcode file when not given.
+    In situations where BARCODE_READ_POS == BOTH AND REDUNDANT_BARCODES=false, two distinct
+    length can be provided using the syntax LEN=X:Z where X and Z are 2 integers representing
+    the barcode length for read_1 and read_2 respectively.
+
+    Default value: null
+
+  REDUNDANT_BARCODES=Boolean
+  BRED=Boolean
+
+    This option only applies for paired-end data with *both* INDEX_FILE1 and INDEX_FILE2
+    provided.
+    Indicates if both index barcodes encode redundant information i.e. if both barcodes are
+    supposed to be identical (or resolve to the same sample when a pool of barcodes is used
+    per sample).
+    When BRED=true, the STRICT option guides the sample lookup behavior	When BRED=false,
+    barcodes are combined prior to sample lookup.
+
+    Default value: true. This option can be set to 'null' to clear the default value.
+    Possible values: {true, false}
+
+  STRICT=Boolean
+  S=Boolean
+
+    For paired-end data and when two distinct barcodes/indices are used to encode samples,
+    this option tells if both barcodes should resolve to the same sample.
+    When true and if only one of the two reads has a barcode match, the read pair is
+    'unassigned'.
+    When false and if only one of the two reads has a barcode match, the read pair is
+    assigned to the
+    corresponding sample
+    When reads resolve to different samples, the read pair is always 'unassigned'.
+
+    Default value: false. This option can be set to 'null' to clear the default value.
+    Possible values: {true, false}
+
+  MAX_MISMATCHES=String
+  MM=String
+
+    Maximum mismatches for a barcode to be considered a match. In situations where both
+    barcodes are used for sample matching i.e. BPOS=BOTH BM=BOTH (or 2 INDEX_FILE given), two
+    distinct
+    values can be given here using the syntax MM=X:Z where X and Z are 2 integers to use for
+    read_1 and read_2 respectively.
+    MM=null is like MM=0
+
+    Default value: 1. This option can be set to 'null' to clear the default value.
+
+  MIN_MISMATCH_DELTA=String
+  MMD=String
+
+    Minimum difference between the number of mismatches against the best and the second best
+    barcode. When MMD is not respected, the read remains unassigned.
+    When two distinct barcodes are used for sample matching (dual encoding), two distinct
+    values can be given using the syntax MMD=X:Z where X and Z are 2 integers to use for
+    first (e.g. from read_1 or index_1)
+    MMD=null is like MMD=0
+
+    Default value: 1. This option can be set to 'null' to clear the default value.
+
+  MIN_BASE_QUALITY=String
+  Q=String
+
+    Minimum base quality during barcode matching: bases which quality is less than this
+    cutoff are always considered as a mismatch.When two distinct barcodes are used for sample
+    matching (dual encoding), two distinct values can be given using the syntax Q=X:Z where X
+    and Z are 2 integers to use for first (e.g. from read_1 or index_1) and second barcode
+    (e.g. from read_2 or index_2) respectively.
+    Q=null is like Q=0.
+
+    Default value: 10. This option can be set to 'null' to clear the default value.
+
+  XTRIMLEN=String
+  XT=String
+
+    Optional extra number of base to be trimmed right after the barcode (only used if
+    CLIP_BARCODE=true).
+    When running paired-end, two distinct values can be given using the syntax XT=X:Z where X
+    and Z are 2 integers to use for read_1 and read_2 respectively. Note that even when
+    BPOS=READ_1 or BPOS=READ_2, a X:Y synthax can be given to trim the read w/o barcode as to
+    end up with reads of the same length (note that this can also be operated using ZT). If a
+    unique value is given, e.g. XT=1, while running paired-end the following rule applies:
+      (1) BPOS=READ_1 or BPOS=READ_2, no trim is applied at the read w/o barcode
+      (2) BPOS=BOTH, the value is used for both reads.
+
+    Note that XT=null is like XT=0.
+    Default value: 0. This option can be set to 'null' to clear the default value.
+
+  ZTRIMLEN=String
+  ZT=String
+
+    Optional extra number of bases to be trimmed from the read end i.e. 3' end.
+    When running paired-end, two distinct values can be given here using the syntax ZT=X:Z
+    where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that even
+    when BPOS=READ_1 or BPOS=READ_2, a X:Y synthax can be given to trim the read w/o barcode
+    as to end up with reads of the same length (note that this can also be operated using
+    XT). Note that if a single value is passed, the value always applies to both reads in
+    paired-end mode without further consideration.
+    ZT=null is like ZT=0.
+
+    Default value: 0. This option can be set to 'null' to clear the default value.
+
+  CLIP_BARCODE=Boolean
+  C=Boolean
+
+    Clip barcode sequence from read sequence, as well as XTRIMLEN (and ZTRIMLEN) bases if
+    applicable, before writing to output file.
+    If false, reads are written without modification to output file.
+    Apply to both barcodes when BPOS=BOTH.
+
+    Default value: true. This option can be set to 'null' to clear the default value.
+    Possible values: {true, false}
+
+  ADD_BARCODE_TO_HEADER=Boolean
+  ADD=Boolean
+
+    Add matched barcode at the end of the read header. Applies to both index when INDEX_FILE2
+    is also provided.
+    First the sample encoding barcodes from I1 (and I2 when relevant) are added to the read
+    headers like
+      @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:I1_BARCODE:I2_BARCODE
+    Then, if BPOS!=NONE, the additional barcodes (UMIs) clipped from the read(s) are added
+    to their own header, like
+      @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:I1_BARCODE:I2_BARCODE:CLIPPED_SEQ_FROMREAD
+
+    Default value: true. This option can be set to 'null' to clear the default value.
+    Possible values: {true, false}
+
+  ENSURE_IDENTICAL_HEADER_NAMES=Boolean
+  SAME_HEADERS=Boolean
+
+    Makes sure that headers of both reads of a pair are identical, using the following read
+    header pattern (for both reads of a pair):
+      @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 I1_BARCODE:I2_BARCODE(:CLIPPED_SEQ_FROMREAD1:CLIPPED_SEQ_FROMREAD2)
+    This option only makes sense in paired end mode and ADD=true. Some (if not all) mappers
+    will indeed complain when the read headers are not identical. When molecular barcodes are
+    present in reads and the RCHAR is used, you will end with (problematic) read headers like
+    this:
+      HISEQ:44:C6KC0ANXX:5:1101:1491:1994:1:N:0:TAGAACAC:TGGAGTAG
+      HISEQ:44:C6KC0ANXX:5:1101:1491:1994:3:N:0:TAGAACAC:CGTTGTAT
+    SAME_HEADERS=true will instead genetates the following identical header for both reads:
+      HISEQ:44:C6KC0ANXX:5:1101:1491:1994:TAGAACAC:TGGAGTAG:CGTTGTAT
+    Note that we also clipped the useless '1:N:0' and '3:N:0' has they will also result in
+    generating different headers
+    Important: this option will force RCHAR=: UNLESS you specify RCHAR=null ; in which
+    case a space will be preserved ie:
+      HISEQ:44:C6KC0ANXX:5:1101:1491:1994 TAGAACAC:TGGAGTAG:CGTTGTAT
+
+    Default value: true. This option can be set to 'null' to clear the default value.
+    Possible values: {true, false}
+
+  READ_NAME_REPLACE_CHAR=String
+  RCHAR=String
+
+    Replace spaces in read name/header using provided character. This is particularly handy
+    when you need to retain ADDed barcode in read name/header during mapping (everything
+    after space in read name is usually clipped in BAM files). For example, with RCHAR=':':
+    '@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:'
+    becomes
+    '@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965:2:N:0:BARCODE'
+    Default value: null.
+
+  QUALITY_FORMAT=FastqQualityFormat
+  V=FastqQualityFormat
+
+    A value describing how the quality values are encoded in the fastq.  Either 'Solexa' for
+    pre-pipeline 1.3 style scores (solexa scaling + 66), 'Illumina' for pipeline 1.3 and
+    above (phred scaling + 64) or 'Standard' for phred scaled scores with a character shift
+    of 33.  If this value is not specified (or 'null' is given), the quality format will be
+    detected.
+
+    Default value: Standard. This option can be set to 'null' to clear the default value.
+    Possible values: {Solexa, Illumina, Standard}
+
+  KEEP_UNASSIGNED_READ=Boolean
+  UN=Boolean
+
+    Should un-assigned reads be saved in files or simply ignored. File names are
+    automatically created or can be given using UF1 & UF2 options.
+
+    Default value: true. This option can be set to 'null' to clear the default value.
+    Possible values: {true, false}
+
+  BARCODE_DIAG_FILE=String
+  DIAG=String
+
+    Name for a barcode match reporting file (not generated by default).Either a name (in
+    which case the file will be created in the output dir) or full path. This file will
+    contain a line per read pair with the barcode best matching the read subsequence or
+    'null' when no match is found according to matching parameters ; and the final selected
+    sample. This file is useful for debugging or further processing in case both ends are
+    barcoded.
+    N.B: this file will have a size of about one of the fastq input files.
+
+    Default value: null.
+]]>
+  </help>
+
+</tool>
author	gbcs-embl-heidelberg
date	Wed, 25 Nov 2015 12:36:37 -0500
parents
children	01fdc6d10660