Mercurial > repos > gbcs-embl-heidelberg > je_demultiplex

<tool id="je_demultiplex" name="Je-Demultiplex" version="@VERSION_STRING@">
    <description>demultiplexes fastq files</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <stdio>
        <!-- An exit code of 1 or higher is reported as a fatal tool error. -->
        <exit_code range="1:"
                   level="fatal"
                   description="Tool exception"/>
    </stdio>
    <!-- Report the version through the shared VERSION_STRING macro token, which also feeds
         the tool's version attribute, so the two values cannot drift apart. -->
    <version_command>echo '@VERSION_STRING@'</version_command>
    <!--
        Command line template for 'je demultiplex'.
        The @...@ placeholders are macro tokens; none of them is defined in this file, so
        they presumably come from the imported macros.xml (TODO confirm).
        Paired-end specific options are emitted only when library.type is not "single".
        The C option is filled from the CLIP_BARCODE input parameter.
        NOTE(review): the 'interpreter' attribute is deprecated in current Galaxy tool XML;
        confirm whether a 'je' script bundled in the tool directory is relied upon before
        changing it.
    -->
    <command interpreter="bash">
<![CDATA[
    je demultiplex

    ## Fastq inputs
    @single_or_paired_cmd@
    #if str( $library.type ) != "single":
        @demultiplex_paired_end_cmd_options@
    #end if

    @barcode_option_cmd@
    @barcode_len_cmd@
    C=$CLIP_BARCODE

    @demultiplexer_common_options_cmd@
    @common_options_cmd@

    @demultiplexer_common_output_options_cmd@
    @demultiplexer_common_outputs_cmd@

]]>
    </command>
    <!-- The config file definition(s) are supplied by the 'barcode_config_file' macro. -->
    <configfiles>
        <expand macro="barcode_config_file"/>
    </configfiles>
    <inputs>
        <!-- single/paired library selection: expands the 'single_or_paired_general' macro.
             The nested expand hands 'demultiplex_paired_end_options' to that macro;
             presumably it is inserted at the macro's yield point (confirm in macros.xml). -->
        <expand macro="single_or_paired_general">
            <expand macro="demultiplex_paired_end_options"/>
        </expand>

        <expand macro="barcode_option"/>
        <expand macro="barcode_len_option"/>
        <expand macro="clip_barcode"/>

        <expand macro="demultiplexer_common_options"/>

        <expand macro="common_options"/>

        <expand macro="demultiplexer_common_output_options"/>

    </inputs>
    <outputs>
        <!-- output datasets are declared by the 'demultiplexer_common_outputs' macro -->
        <expand macro="demultiplexer_common_outputs"/>
    </outputs>

    <tests>
        <test>
            <!-- single-end run: one fastq input, barcodes read from a tabular file -->
            <param name="type" value="single"/>
            <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
            <param name="BARCODE_FILE" value="barcodes_SE.txt" ftype="tabular"/>
            <output name="METRICS_FILE_NAME" file="summary_SE.txt" ftype="tabular" lines_diff="4">
                <discovered_dataset designation="unassigned_1" file="unassigned_1_SE.txt"/>
            </output>
        </test>
        <test>
            <!-- paired-end run: barcodes on both reads, a distinct fwd:rev barcode pair per
                 sample, barcode list typed in as text instead of supplied as a file -->
            <param name="type" value="paired"/>
            <param name="input_1" value="file_1_sequence.txt" ftype="fastqsanger"/>
            <param name="input_2" value="file_2_sequence.txt" ftype="fastqsanger"/>

            <param name="BPOS" value="BOTH"/>
            <param name="BM" value="BOTH"/>
            <param name="BRED" value="false"/>

            <param name="barcode_list_type_con" value="text"/>
            <param name="barcode_text" value="sample1 CACTGT:GTATAG&#10;sample2 ATTCCG:TCCGTC&#10;sample3 GCTACC:TGGTCA&#10;sample4 CGAAAC:CACTGT"/>

            <output name="METRICS_FILE_NAME" file="summary_PE.txt" ftype="tabular" lines_diff="4">
                <!-- reads that matched no sample -->
                <discovered_dataset designation="unassigned_1" file="unassigned_1_PE.txt"/>
                <discovered_dataset designation="unassigned_2" file="unassigned_2_PE.txt"/>
                <!-- one dataset per sample and read -->
                <discovered_dataset designation="sample1_CACTGTGTATAG_1" file="sample1_CACTGTGTATAG_1.txt"/>
                <discovered_dataset designation="sample1_CACTGTGTATAG_2" file="sample1_CACTGTGTATAG_2.txt"/>
                <discovered_dataset designation="sample2_ATTCCGTCCGTC_1" file="sample2_ATTCCGTCCGTC_1.txt"/>
                <discovered_dataset designation="sample2_ATTCCGTCCGTC_2" file="sample2_ATTCCGTCCGTC_2.txt"/>
                <discovered_dataset designation="sample3_GCTACCTGGTCA_1" file="sample3_GCTACCTGGTCA_1.txt"/>
                <discovered_dataset designation="sample3_GCTACCTGGTCA_2" file="sample3_GCTACCTGGTCA_2.txt"/>
                <discovered_dataset designation="sample4_CGAAACCACTGT_1" file="sample4_CGAAACCACTGT_1.txt"/>
                <discovered_dataset designation="sample4_CGAAACCACTGT_2" file="sample4_CGAAACCACTGT_2.txt"/>
            </output>
        </test>
    </tests>


  <help>
<![CDATA[
**What it does**

Je demultiplex: A fastq file demultiplexer with optional handling of Unique Molecular Identifiers for further use
in 'markdupes' module.
Input files are fastq files, and can be in gzip compressed format.

Author: Charles Girardot  (charles.girardot@embl.de).

Wrapper by: Jelle Scholtalbers (jelle.scholtalbers@embl.de).

------

**Know what you are doing**

.. class:: warningmark

  You will want to read the `documentation`__.

  .. __: http://gbcs.embl.de/portal/Je

------

**Parameter list**

This is an exhaustive list of options::

  FASTQ_FILE1=File
  F1=File

    Input fastq file (optionally gzipped) for single end data, or first read in paired end
    data.

    Required.

  FASTQ_FILE2=File
  F2=File

    Input fastq file (optionally gzipped) for the second read of paired end data.

    Default value: null.

  BARCODE_FILE=File
  BF=File

    Barcode file describing sequence list and sample names. Tab-delimited file with 2
    columns, with the sample in col1 and the corresponding barcode in col2.
    Simple barcode file format : 2 tab-delimited columns
    If multiple barcodes map to the same sample, either the line can be duplicated e.g.
      sample1  ATAT
      sample1  GAGG
      sample2  CCAA
      sample2  TGTG
    Or barcodes can be combined using the OR operator '|' i.e. the file above can be
    re-written like
      sample1  ATAT|GAGG
      sample2  CCAA|TGTG
    Finally, for the special situation of paired-end data in which barcodes differ at both
    ends (ie BPOS=BOTH BRED=false BM=BOTH , see BRED option description), barcodes for read_1
    and read_2 can be distinguished using a ':' separator i.e.
      sample1  ATAT:GAGG
      sample2  CCAA:TGTG
    This above syntax means that sample 1 is encoded with ATAT barcode at read_1 AND GAGG
    barcode at read_2. Note that you can still combine barcodes using | e.g.
    sample1  ATAT|GAGG:CCAA|TGTG
    would mean that sample 1 is mapped by the combination of barcode: ATAT OR GAGG at read_1
    AND CCAA OR TGTG at read_2.
    Extended barcode file format : 3 (single-end) or 4 (paired-end) tab-delimited columns
    same as the simple barcode file format but the extra columns contain the file name(s)
    to use to name output files. A unique extra column is expected for single-end while 2
    extra columns are expected for paired-end. In case lines are duplicated (multiple
    barcodes mapping the same sample), the same file name should be indicated in the third
    (and fourth) column(s).
      sample1  ATAT  spl1_1.txt.gz  spl1_2.txt.gz
      sample1  GAGG  spl1_1.txt.gz  spl1_2.txt.gz
      sample2  CCAA  spl2_1.txt.gz  spl2_2.txt.gz
    Or
      sample1  ATAT|GAGG:CCAA|TGTG  spl1_1.txt.gz  spl1_2.txt.gz
    Ns in barcode sequence are allowed and are used to flag positions that should be ignored
    in sample matching
    i.e. they will be clipped off the read sequence (like in iCLIP protocol).

    Required.

  BARCODE_READ_POS=BarcodePosition
  BPOS=BarcodePosition

    For paired-end data, where to expect the barcode(s) :
      READ_1 (beginning of read from FASTQ_FILE1),
      READ_2 (beginning of read from FASTQ_FILE2),
      BOTH (beginning of both reads).
    Automatically set to READ_1 in single end mode.

    Default value: BOTH. This option can be set to 'null' to clear the default value.
    Possible values: {READ_1, READ_2, BOTH, NONE}

  BCLEN=String
  LEN=String

    Length of the barcode sequences, optional. Taken from barcode file when not given.
    In situations where BARCODE_READ_POS == BOTH AND REDUNDANT_BARCODES=false, two distinct
    length can be provided using the syntax LEN=X:Z where X and Z are 2 integers representing
    the barcode length for read_1 and read_2 respectively.

    Default value: null.

  BARCODE_FOR_SAMPLE_MATCHING=BarcodePosition
  BM=BarcodePosition

    Indicates which barcode(s) should be used for sample lookup
    Automatically set to READ_1 in single end mode.
    For paired-end data and when BARCODE_READ_POS == BOTH, which barcode should be used to
    resolve sample:
      use BM=READ_1 (beginning of read from FASTQ_FILE1) if only this read should be used
    for sample matching;
      use BM=READ_2 (beginning of read from FASTQ_FILE2) if only this read should be used
    for sample matching;
      use BM=BOTH (beginning of both reads) if both should be used.

    When BM=BOTH, the behaviour is different based on the value of REDUNDANT_BARCODES :
    If REDUNDANT_BARCODES=true, the two barcodes are considered to map to the same sample
    and 'Je demultiplex' uses the two barcodes according to the STRICT value.
    If REDUNDANT_BARCODES=false, the barcode file should map a couple of barcode to each
    sample (e.g. sample1 => AGAGTG:TTGATA) and 'Je demultiplex' needs both barcodes to find
    the relevant sample. Note that this is the only situation in which all barcode matching
    options (MM, MMD, Q) accept different values for both barcodes in the form X:Z where X
    and Z are 2 integers.

    Default value: BOTH. This option can be set to 'null' to clear the default value.
    Possible values: {READ_1, READ_2, BOTH, NONE}


  REDUNDANT_BARCODES=Boolean
  BRED=Boolean

    This option only applies for paired-end data with BARCODE_READ_POS set to 'BOTH'
    Indicates if both read's barcodes encode redundant information or if barcodes are
    supposed to be identical at both ends (or to resolve to the same sample when a pool of
    barcodes is used per sample).
    When REDUNDANT_BARCODES=false, the 2 barcodes potentially encode
    different information. For example, only one of the barcodes encodes the sample identity
    while
    the second barcode might be a random barcode (UMI) to tell apart PCR artefacts from real
    duplicates.
    Another example is when both barcodes should be used in a combined fashion to resolve the
    sample.
    In the first example, you should use BPOS=BOTH BRED=false BM=READ_1.
    In the second example, you should have BPOS=BOTH BRED=false BM=BOTH.
    Note that with BPOS=BOTH BRED=true BM=BOTH, the behavior would be different as
    'demultiplex' would then check the STRICT option to perform sample resolution.
    Importantly, when BARCODE_READ_POS (BPOS) == BOTH AND REDUNDANT_BARCODES=false, BCLEN,
    barcode matching options (MM, MMD, Q) and read trimming/clipping options (XT, ZT) accept
    different values for both barcodes in the form X:Z where X and Z are 2 integers.

    Default value: true. This option can be set to 'null' to clear the default value.
    Possible values: {true, false}

  STRICT=Boolean
  S=Boolean

    For paired-end data and when two distinct barcodes/indices are used to encode samples,
    this option tells if both barcodes should resolve to the same sample.
    When true and if only one of the two reads has a barcode match, the read pair is
    'unassigned'.
    When false and if only one of the two reads has a barcode match, the read pair is
    assigned to the corresponding sample.
    When reads resolve to different samples, the read pair is always 'unassigned'.

    Default value: false. This option can be set to 'null' to clear the default value.
    Possible values: {true, false}

  MAX_MISMATCHES=String
  MM=String

    Maximum mismatches for a barcode to be considered a match. In situations where both
    barcodes are used for sample matching i.e. BPOS=BOTH BM=BOTH (or 2 INDEX_FILE given), two
    distinct
    values can be given here using the syntax MM=X:Z where X and Z are 2 integers to use for
    read_1 and read_2 respectively.
    MM=null is like MM=0

    Default value: 1. This option can be set to 'null' to clear the default value.

  MIN_MISMATCH_DELTA=String
  MMD=String

    Minimum difference between the number of mismatches against the best and the second best
    barcode. When MMD is not respected, the read remains unassigned.
    When two distinct barcodes are used for sample matching (dual encoding), two distinct
    values can be given using the syntax MMD=X:Z where X and Z are 2 integers to use for
    first (e.g. from read_1 or index_1) and second barcode (e.g. from read_2 or index_2)
    respectively.
    MMD=null is like MMD=0

    Default value: 1. This option can be set to 'null' to clear the default value.

  MIN_BASE_QUALITY=String
  Q=String

    Minimum base quality during barcode matching: bases whose quality is less than this
    cutoff are always considered as a mismatch. When two distinct barcodes are used for sample
    matching (dual encoding), two distinct values can be given using the syntax Q=X:Z where X
    and Z are 2 integers to use for first (e.g. from read_1 or index_1) and second barcode
    (e.g. from read_2 or index_2) respectively.
    Q=null is like Q=0.

    Default value: 10. This option can be set to 'null' to clear the default value.

  XTRIMLEN=String
  XT=String

    Optional extra number of base to be trimmed right after the barcode (only used if
    CLIP_BARCODE=true).
    When running paired-end, two distinct values can be given using the syntax XT=X:Z where X
    and Z are 2 integers to use for read_1 and read_2 respectively. Note that even when
    BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode as to
    end up with reads of the same length (note that this can also be operated using ZT). If a
    unique value is given, e.g. XT=1, while running paired-end the following rule applies:
      (1) BPOS=READ_1 or BPOS=READ_2, no trim is applied at the read w/o barcode
      (2) BPOS=BOTH, the value is used for both reads.

    Note that XT=null is like XT=0.
    Default value: 0. This option can be set to 'null' to clear the default value.

  ZTRIMLEN=String
  ZT=String

    Optional extra number of bases to be trimmed from the read end i.e. 3' end.
    When running paired-end, two distinct values can be given here using the syntax ZT=X:Z
    where X and Z are 2 integers to use for read_1 and read_2 respectively. Note that even
    when BPOS=READ_1 or BPOS=READ_2, an X:Z syntax can be given to trim the read w/o barcode
    as to end up with reads of the same length (note that this can also be operated using
    XT). Note that if a single value is passed, the value always applies to both reads in
    paired-end mode without further consideration.
    ZT=null is like ZT=0.

    Default value: 0. This option can be set to 'null' to clear the default value.

  CLIP_BARCODE=Boolean
  C=Boolean

    Clip barcode sequence from read sequence, as well as XTRIMLEN (and ZTRIMLEN) bases if
    applicable, before writing to output file.
    If false, reads are written without modification to output file.
    Apply to both barcodes when BPOS=BOTH.

    Default value: true. This option can be set to 'null' to clear the default value.
    Possible values: {true, false}

  ADD_BARCODE_TO_HEADER=Boolean
  ADD=Boolean

    Add barcode at the end of the read header. Apply to both barcodes when BPOS=BOTH.
    If true, the string ':barcode' is added at the end of the read header with a ':' added
    only if current read header does not end with ':'.
    If both reads of the pair have a barcode (i.e. BARCODE_READ_POS == BOTH), then the second
    read also has its own matched barcode written. Else, the read without a barcode receives
    the barcode from the barcoded read.
    For example:
      @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:
    becomes:
      @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:BARCODE

    When barcodes contain random positions, i.e. 'N' (for example like in the iCLIP
    protocol), or are UMIs, the added sequence is the sequence clipped from the read and NOT
    the matched barcode.

    Default value: true. This option can be set to 'null' to clear the default value.
    Possible values: {true, false}


  ENSURE_IDENTICAL_HEADER_NAMES=Boolean
  SAME_HEADERS=Boolean

    Makes sure that headers of both reads of a pair are identical, using the following read
    header pattern (for both reads of a pair):
      @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 SAMPLEBARCODE_READ1:SAMPLEBARCODE_READ2(:CLIPPED_SEQ_FROMREAD1:CLIPPED_SEQ_FROMREAD2)
    This option only makes sense in
    paired end mode and ADD=true. Some (if not all) mappers will indeed complain when the
    read headers are not identical. When molecular barcodes are present in reads (either as
    additional barcodes or as degenerate barcodes ie with 'N') and the RCHAR is used, you
    will end with (problematic) read headers like this:
      HISEQ:44:C6KC0ANXX:5:1101:1491:1994:1:N:0:TAGAACAC:TGGAGTAG
      HISEQ:44:C6KC0ANXX:5:1101:1491:1994:3:N:0:TAGAACAC:CGTTGTAT
    SAME_HEADERS=true will instead generate the following identical header for both reads:
      HISEQ:44:C6KC0ANXX:5:1101:1491:1994:TAGAACAC:TGGAGTAG:CGTTGTAT
    Note that we also clipped the useless '1:N:0' and '3:N:0' as they would also result in
    generating different headers.
    Important: this option will force RCHAR=: UNLESS you specify RCHAR=null ; in which
    case a space will be preserved ie:
      HISEQ:44:C6KC0ANXX:5:1101:1491:1994 TAGAACAC:TGGAGTAG:CGTTGTAT

    Default value: true. This option can be set to 'null' to clear the default value.
    Possible values: {true, false}


  READ_NAME_REPLACE_CHAR=String
  RCHAR=String

    Replace spaces in read name/header using provided character. This is particularly handy
    when you need to retain ADDed barcode in read name/header during mapping (everything
    after space in read name is usually clipped in BAM files). For example, with RCHAR=':':
      @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965 2:N:0:
    becomes
      @D3FCO8P1:178:C1WLBACXX:7:1101:1836:1965:2:N:0:BARCODE
    Default value: null.

  QUALITY_FORMAT=FastqQualityFormat
  V=FastqQualityFormat

    A value describing how the quality values are encoded in the fastq.  Either 'Solexa' for
    pre-pipeline 1.3 style scores (solexa scaling + 66), 'Illumina' for pipeline 1.3 and
    above (phred scaling + 64) or 'Standard' for phred scaled scores with a character shift
    of 33.  If this value is not specified (or 'null' is given), the quality format will be
    detected.

    Default value: Standard. This option can be set to 'null' to clear the default value.
    Possible values: {Solexa, Illumina, Standard}

  KEEP_UNASSIGNED_READ=Boolean
  UN=Boolean

    Should un-assigned reads be saved in files or simply ignored. File names are
    automatically created or can be given using UF1 & UF2 options.

    Default value: true. This option can be set to 'null' to clear the default value.
    Possible values: {true, false}

  BARCODE_DIAG_FILE=String
  DIAG=String

    Name for a barcode match reporting file (not generated by default). Either a name (in
    which case the file will be created in the output dir) or full path. This file will
    contain a line per read pair with the barcode best matching the read subsequence or
    'null' when no match is found according to matching parameters ; and the final selected
    sample. This file is useful for debugging or further processing in case both ends are
    barcoded.
    N.B: this file will have a size of about one of the fastq input files.

    Default value: null.
]]>
  </help>

</tool>
author	gbcs-embl-heidelberg
date	Wed, 21 Jun 2017 05:23:13 -0400
parents	424f44e2124e
children	222819c87d90