view je-demultiplex-illu.xml @ 5:69c77f9fc064 draft

planemo upload for repository https://git.embl.de/grp-gbcs/Je/tree/master/src/galaxy commit 0eefd837333dae6fbecaf4f55b053268d844eff6
author gbcs-embl-heidelberg
date Wed, 02 Aug 2017 10:59:09 -0400
parents 01fdc6d10660
children 370d9764f670
line wrap: on
line source

<tool id="je_demultiplex_illu" name="Je-Demultiplex-Illu" version="@VERSION_STRING@">
    <description>demultiplexes fastq files using Illumina Index file</description>
    <!-- Shared definitions: the <expand> elements used throughout this wrapper are
         resolved against this import (the only one declared here). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <!-- Any exit status of 1 or above is reported as a fatal tool error. -->
    <stdio>
        <exit_code description="Tool exception" level="fatal" range="1:"/>
    </stdio>
    <expand macro="version_command"/>
    <!-- Builds the "je demultiplex-illu" command line. Most arguments are injected by
         command tokens (presumably defined in the imported macros.xml); only the UMI
         arguments BPOS, C and BCLEN are assembled in this template. -->
    <command>
<![CDATA[
    je demultiplex-illu

    ## Fastq inputs
    @single_or_paired_illu_cmd@

    @barcode_option_cmd@

    ## UMI arguments: taken from the conditional only when the user answered "Yes".
    #if str($INTERNAL_BARCODES_CON.INTERNAL_BARCODES) == 'true':
        BPOS=${INTERNAL_BARCODES_CON.BPOS}
        C=${INTERNAL_BARCODES_CON.CLIP_BARCODE}
        ## LEN is typed in by the user (e.g. 8:8): trim it, omit the argument when it is
        ## blank, and single-quote it so the shell receives exactly one literal argument.
        #set $umi_len = str($INTERNAL_BARCODES_CON.LEN).strip()
        #if $umi_len != "":
            BCLEN='${umi_len}'
        #end if
    #else:
        ## No UMIs declared: no in-read barcode position and no clipping.
        BPOS=NONE
        C=false
    #end if

    @common_options_cmd@

    @demultiplexer_common_output_options_cmd@
    @demultiplexer_common_outputs_cmd@
]]>
    </command>
    <!-- Config file(s) come from the shared barcode_config_file macro; its content is
         not visible in this file. -->
    <configfiles>
        <expand macro="barcode_config_file"/>
    </configfiles>
    <inputs>
        <!-- single/paired - similar to macro 'single_or_paired_general' -->
        <expand macro="single_or_paired_illu">
            <expand macro="demultiplex_illu_paired_end_options"/>
        </expand>
        <expand macro="barcode_option"/>

        <!-- UMI handling. The command template reads BPOS, LEN and CLIP_BARCODE from
             this conditional when the answer is "Yes"; otherwise it passes
             BPOS=NONE C=false. LEN and CLIP_BARCODE are expected to be declared by
             the two macros expanded in the "true" branch. -->
        <conditional name="INTERNAL_BARCODES_CON">
            <param name="INTERNAL_BARCODES" type="select"
                label="Do your reads contain Unique Molecular Identifiers(UMIs)">
                <option value="true">Yes</option>
                <option value="false" selected="true">No</option>
            </param>
            <when value="true">
                <!-- The help text is kept on a single line: a line break inside an
                     attribute value would carry the source indentation into the
                     displayed string. -->
                <param name="BPOS" type="select"
                    label="Barcode read position (BPOS)"
                    help="where are the barcodes. If not using paired-end it does not matter what you specify here.">
                    <option value="READ_1" selected="true">READ_1 (beginning of read from the first fastq file)</option>
                    <option value="READ_2">READ_2 (beginning of read from the second fastq file)</option>
                    <option value="BOTH">BOTH (beginning of both reads)</option>
                </param>
                <expand macro="barcode_len_option"/>
                <expand macro="clip_barcode"/>
            </when>
            <when value="false"/>
        </conditional>

        <expand macro="demultiplexer_common_options"/>

        <expand macro="common_options"/>

        <expand macro="demultiplexer_common_output_options"/>

    </inputs>
    <!-- Output datasets are declared by the shared demultiplexer_common_outputs macro. -->
    <outputs>
        <expand macro="demultiplexer_common_outputs"/>
    </outputs>

    <tests>
        <!-- Paired-end, dual-index scenario: barcode at both ends, non-redundant. -->
        <test>
            <!-- Read and index fastq files -->
            <param name="type" value="paired"/>
            <param name="input_1" value="illu_file_1_sequence.txt" ftype="fastqsanger"/>
            <param name="input_2" value="illu_file_2_sequence.txt" ftype="fastqsanger"/>
            <param name="I1" value="illu_file_1_index.txt" ftype="fastqsanger"/>
            <param name="I2_AVAILABLE" value="true"/>
            <param name="I2" value="illu_file_2_index.txt" ftype="fastqsanger"/>

            <!-- Barcode definition supplied as a file -->
            <param name="barcode_list_type_con" value="file"/>
            <param name="BARCODE_FILE" value="illu_dualindexing.txt" ftype="tabular"/>

            <!-- UMI conditional: UMIs on both reads, 8 bases each -->
            <param name="INTERNAL_BARCODES" value="true"/>
            <param name="BPOS" value="BOTH"/>
            <param name="LEN" value="8:8"/>

            <!-- Matching, trimming and diagnostics settings -->
            <param name="BM" value="BOTH"/>
            <param name="BRED" value="false"/>
            <param name="MM" value="3"/>
            <param name="MMD" value="2"/>
            <param name="Q" value="20"/>
            <param name="ZT" value="5:6"/>
            <param name="DIAG" value="false"/>

            <!-- Expected metrics file plus one discovered dataset per sample and read,
                 and the two files of unassigned reads. -->
            <output name="METRICS_FILE_NAME"
                    file="illu_summary_PE.txt"
                    ftype="tabular"
                    lines_diff="4">
                <discovered_dataset designation="emb6801m1_CGTACTAGTAGATCGC_1" file="emb6801m1_CGTACTAGTAGATCGC_1.txt"/>
                <discovered_dataset designation="emb6801m1_CGTACTAGTAGATCGC_2" file="emb6801m1_CGTACTAGTAGATCGC_2.txt"/>
                <discovered_dataset designation="emb6801m2_AGGCAGAATAGATCGC_1" file="emb6801m2_AGGCAGAATAGATCGC_1.txt"/>
                <discovered_dataset designation="emb6801m2_AGGCAGAATAGATCGC_2" file="emb6801m2_AGGCAGAATAGATCGC_2.txt"/>
                <discovered_dataset designation="emb681m1_TAAGGCGACTCTCTAT_1" file="emb681m1_TAAGGCGACTCTCTAT_1.txt"/>
                <discovered_dataset designation="emb681m1_TAAGGCGACTCTCTAT_2" file="emb681m1_TAAGGCGACTCTCTAT_2.txt"/>
                <discovered_dataset designation="emb681m4_TCCTGAGCCTCTCTAT_1" file="emb681m4_TCCTGAGCCTCTCTAT_1.txt"/>
                <discovered_dataset designation="emb681m4_TCCTGAGCCTCTCTAT_2" file="emb681m4_TCCTGAGCCTCTCTAT_2.txt"/>
                <discovered_dataset designation="emb681m5_GGACTCCTCTCTCTAT_1" file="emb681m5_GGACTCCTCTCTCTAT_1.txt"/>
                <discovered_dataset designation="emb681m5_GGACTCCTCTCTCTAT_2" file="emb681m5_GGACTCCTCTCTCTAT_2.txt"/>
                <discovered_dataset designation="unassigned_1" file="illu_unassigned_1_PE.txt"/>
                <discovered_dataset designation="unassigned_2" file="illu_unassigned_2_PE.txt"/>
            </output>
        </test>
    </tests>

    <help>
<![CDATA[
**What it does**

Je demultiplex-illu: demultiplex fastq files using Illumina Index files,
with optional handling of Unique Molecular Identifiers for further use in the 'markdupes' module.
Input files are fastq files, and can be in gzip compressed format.

Author: Charles Girardot  (charles.girardot@embl.de).

Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).

------

**Know what you are doing**

.. class:: warningmark

  You will want to read the `documentation`__.

  .. __: http://gbcs.embl.de/portal/Je

------

**Parameter list**

This is an exhaustive list of options::

  FASTQ_FILE1=File
  F1=File

    Input fastq file (optionally gzipped) for single end data, or first read in paired end
    data.

    Required.

  FASTQ_FILE2=File
  F2=File

    Input fastq file (optionally gzipped) for the second read of paired end data.

    Default value: null.

  INDEX_FILE1=File
  I1=File

    Fastq file for index 1 (barcode) reads, optionally gzipped

    Required.

  INDEX_FILE2=File
  I2=File

    Fastq file for index 2 (barcode) reads, optionally gzipped.
    An INDEX_FILE1 MUST be provided when INDEX_FILE2 is given. This situation corresponds to
    Illumina dual indexing.

    Default value: null.

  BARCODE_FILE=File
  BF=File

    Barcode file describing sequence list and sample names. Tab-delimited file with 2
    columns, with the sample in col1 and the corresponding barcode in col2.
    Simple barcode file format : 2 tab-delimited columns
    If multiple barcodes map to the same sample, either line can be duplicated e.g.
      sample1  ATAT
      sample1  GAGG
      sample2  CCAA
      sample2  TGTG
    Or barcodes can be combined using the OR operator '|' i.e. the file above can be
    re-written like
      sample1  ATAT|GAGG
      sample2  CCAA|TGTG
    Finally, for the special situation of paired-end data in which barcodes differ at both
    ends (ie BPOS=BOTH BRED=false BM=BOTH , see BRED option description), barcodes for read_1
    and read_2 can be distinguished using a ':' separator i.e.
      sample1  ATAT:GAGG
      sample2  CCAA:TGTG
    This above syntax means that sample 1 is encoded with ATAT barcode at read_1 AND GAGG
    barcode at read_2. Note that you can still combine barcodes using | e.g.
    sample1  ATAT|GAGG:CCAA|TGTG
    would mean that sample 1 is mapped by the combination of barcode: ATAT OR GAGG at read_1
    AND CCAA OR TGTG at read_2.
    Extended barcode file format : 3 (single-end) or 4 (paired-end) tab-delimited columns
    same as the simple barcode file format but the extra columns contain the file name(s)
    to use to name output files. A unique extra column is expected for single-end while 2
    extra columns are expected for paired-end. In case lines are duplicated (multiple
    barcodes mapping the same sample), the same file name should be indicated in the third
    (and fourth) column(s).
      sample1  ATAT  spl1_1.txt.gz  spl1_2.txt.gz
      sample1  GAGG  spl1_1.txt.gz  spl1_2.txt.gz
      sample2  CCAA  spl2_1.txt.gz  spl2_2.txt.gz
    Or
      sample1  ATAT|GAGG:CCAA|TGTG  spl1_1.txt.gz  spl1_2.txt.gz
    Ns in barcode sequence are allowed and are used to flag positions that should be ignored
    in sample matching
    i.e. they will be clipped off the read sequence (like in iCLIP protocol).

    Required.

  BARCODE_READ_POS=BarcodePosition
  BPOS=BarcodePosition

    Indicates the location of additional barcodes present in the read(s). Setting this option
    implies setting the LEN option.
    Importantly, these additional barcodes must not encode sample identity information but
    be used for e.g. molecular barcoding (UMIs) or for any purpose other than sample identity encoding.

    Default value: BOTH. This option can be set to 'null' to clear the default value.
    Possible values: {READ_1, READ_2, BOTH, NONE}

  BCLEN=String
  LEN=String

    Length of the barcode sequences, optional. Taken from barcode file when not given.
    In situations where BARCODE_READ_POS == BOTH AND REDUNDANT_BARCODES=false, two distinct
    length can be provided using the syntax LEN=X:Z where X and Z are 2 integers representing
    the barcode length for read_1 and read_2 respectively.

    Default value: null

  REDUNDANT_BARCODES=Boolean
  BRED=Boolean

    This option only applies for paired-end data with *both* INDEX_FILE1 and INDEX_FILE2
    provided.
    Indicates if both index barcodes encode redundant information i.e. if both barcodes are
    supposed to be identical (or resolve to the same sample when a pool of barcodes is used
    per sample).
    When BRED=true, the STRICT option guides the sample lookup behavior.
    When BRED=false, barcodes are combined prior to sample lookup.

    Default value: true. This option can be set to 'null' to clear the default value.
    Possible values: {true, false}

  STRICT=Boolean
  S=Boolean

    For paired-end data and when two distinct barcodes/indices are used to encode samples,
    this option tells if both barcodes should resolve to the same sample.
    When true and if only one of the two reads has a barcode match, the read pair is
    'unassigned'.
    When false and if only one of the two reads has a barcode match, the read pair is
    assigned to the
    corresponding sample
    When reads resolve to different samples, the read pair is always 'unassigned'.

    Default value: false. This option can be set to 'null' to clear the default value.
    Possible values: {true, false}

  MAX_MISMATCHES=String
  MM=String

    Maximum mismatches for a barcode to be considered a match. In situations where both
    barcodes are used for sample matching i.e. BPOS=BOTH BM=BOTH (or 2 INDEX_FILE given), two
    distinct
    values can be given here using the syntax MM=X:Z where X and Z are 2 integers to use for
    read_1 and read_2 respectively.
    MM=null is like MM=0

    Default value: 1. This option can be set to 'null' to clear the default value.

  MIN_MISMATCH_DELTA=String
  MMD=String

    Minimum difference between the number of mismatches against the best and the second best
    barcode. When MMD is not respected, the read remains unassigned.
    When two distinct barcodes are used for sample matching (dual encoding), two distinct
    values can be given using the syntax MMD=X:Z where X and Z are 2 integers to use for
    first (e.g. from read_1 or index_1) and second barcode (e.g. from read_2 or index_2)
    respectively.
    MMD=null is like MMD=0

    Default value: 1. This option can be set to 'null' to clear the default value.

  MIN_BASE_QUALITY=String
  Q=String

    Minimum base quality during barcode matching: bases whose quality is less than this
    cutoff are always considered as a mismatch. When two distinct barcodes are used for sample
    matching (dual encoding), two distinct values can be given using the syntax Q=X:Z where X
    and Z are 2 integers to use for first (e.g. from read_1 or index_1) and second barcode
    (e.g. from read_2 or index_2) respectively.
    Q=null is like Q=0.

    Default value: 10. This option can be set to 'null' to clear the default value.

  XTRIMLEN=String
  XT=String

    Optional extra number of bases to be trimmed right after the barcode (only used if
    CLIP_BARCODE=true).
    When running paired-end, two distinct values can be given using the syntax XT=X:Z where X
    and Z are 2 integers to use for read_1 and read_2 respectively. Note that even when
    BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode so as to
    end up with reads of the same length (note that this can also be operated using ZT). If a
    unique value is given, e.g. XT=1, while running paired-end the following rule applies:
      (1) BPOS=READ_1 or BPOS=READ_2, no trim is applied at the read w/o barcode
      (2) BPOS=BOTH, the value is used for both reads.

    Note that XT=null is like XT=0.
    Default value: 0. This option can be set to 'null' to clear the default value.

  ZTRIMLEN=String
  ZT=String

    Optional extra number of bases to be trimmed from the read end i.e. 3' end.
    When running paired-end, two distinct values can be given here using the syntax ZT=X:Z
    where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that even
    when BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode
    so as to end up with reads of the same length (note that this can also be operated using
    XT). Note that if a single value is passed, the value always applies to both reads in
    paired-end mode without further consideration.
    ZT=null is like ZT=0.

    Default value: 0. This option can be set to 'null' to clear the default value.

  CLIP_BARCODE=Boolean
  C=Boolean

    Clip barcode sequence from read sequence, as well as XTRIMLEN (and ZTRIMLEN) bases if
    applicable, before writing to output file.
    If false, reads are written without modification to output file.
    Apply to both barcodes when BPOS=BOTH.

    Default value: true. This option can be set to 'null' to clear the default value.
    Possible values: {true, false}

  ADD_BARCODE_TO_HEADER=Boolean
  ADD=Boolean

    Add matched barcode at the end of the read header. Applies to both index when INDEX_FILE2
    is also provided.
    First the sample encoding barcodes from I1 (and I2 when relevant) are added to the read
    headers like
      @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:I1_BARCODE:I2_BARCODE
    Then, if BPOS!=NONE, the additional barcodes (UMIs) clipped from the read(s) are added
    to their own header, like
      @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:I1_BARCODE:I2_BARCODE:CLIPPED_SEQ_FROMREAD

    Default value: true. This option can be set to 'null' to clear the default value.
    Possible values: {true, false}

  ENSURE_IDENTICAL_HEADER_NAMES=Boolean
  SAME_HEADERS=Boolean

    Makes sure that headers of both reads of a pair are identical, using the following read
    header pattern (for both reads of a pair):
      @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 I1_BARCODE:I2_BARCODE(:CLIPPED_SEQ_FROMREAD1:CLIPPED_SEQ_FROMREAD2)
    This option only makes sense in paired end mode and ADD=true. Some (if not all) mappers
    will indeed complain when the read headers are not identical. When molecular barcodes are
    present in reads and the RCHAR is used, you will end with (problematic) read headers like
    this:
      HISEQ:44:C6KC0ANXX:5:1101:1491:1994:1:N:0:TAGAACAC:TGGAGTAG
      HISEQ:44:C6KC0ANXX:5:1101:1491:1994:3:N:0:TAGAACAC:CGTTGTAT
    SAME_HEADERS=true will instead generate the following identical header for both reads:
      HISEQ:44:C6KC0ANXX:5:1101:1491:1994:TAGAACAC:TGGAGTAG:CGTTGTAT
    Note that we also clipped the useless '1:N:0' and '3:N:0' as they would also result in
    generating different headers.
    Important: this option will force RCHAR=: UNLESS you specify RCHAR=null ; in which
    case a space will be preserved ie:
      HISEQ:44:C6KC0ANXX:5:1101:1491:1994 TAGAACAC:TGGAGTAG:CGTTGTAT

    Default value: true. This option can be set to 'null' to clear the default value.
    Possible values: {true, false}

  READ_NAME_REPLACE_CHAR=String
  RCHAR=String

    Replace spaces in read name/header using provided character. This is particularly handy
    when you need to retain ADDed barcode in read name/header during mapping (everything
    after space in read name is usually clipped in BAM files). For example, with RCHAR=':':
    '@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:'
    becomes
    '@D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965:2:N:0:BARCODE'
    Default value: null.

  QUALITY_FORMAT=FastqQualityFormat
  V=FastqQualityFormat

    A value describing how the quality values are encoded in the fastq.  Either 'Solexa' for
    pre-pipeline 1.3 style scores (solexa scaling + 66), 'Illumina' for pipeline 1.3 and
    above (phred scaling + 64) or 'Standard' for phred scaled scores with a character shift
    of 33.  If this value is not specified (or 'null' is given), the quality format will be
    detected.

    Default value: Standard. This option can be set to 'null' to clear the default value.
    Possible values: {Solexa, Illumina, Standard}

  KEEP_UNASSIGNED_READ=Boolean
  UN=Boolean

    Should un-assigned reads be saved in files or simply ignored. File names are
    automatically created or can be given using UF1 & UF2 options.

    Default value: true. This option can be set to 'null' to clear the default value.
    Possible values: {true, false}

  BARCODE_DIAG_FILE=String
  DIAG=String

    Name for a barcode match reporting file (not generated by default). Either a name (in
    which case the file will be created in the output dir) or full path. This file will
    contain a line per read pair with the barcode best matching the read subsequence or
    'null' when no match is found according to matching parameters ; and the final selected
    sample. This file is useful for debugging or further processing in case both ends are
    barcoded.
    N.B: this file will have a size of about one of the fastq input files.

    Default value: null.
]]>
    </help>
    <!-- Citation entries are supplied by the shared citations macro. -->
    <expand macro="citations"/>
</tool>