Mercurial > repos > iuc > umi_tools_count

<tool id="umi_tools_count" name="UMI-tools count" version="@VERSION@.1">
    <description>performs quantification of UMIs from BAM files</description>
    <macros>
        <import>macros.xml</import>
        <!-- Reusable sanitizer for the BAM tag text inputs of this tool.
             Only letters and digits are kept; every other character is
             replaced by the empty invalid_char, i.e. removed. -->
        <xml name="sanitize_tag">
            <sanitizer invalid_char="">
                <valid initial="string.letters,string.digits"/>
            </sanitizer>
        </xml>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
    ## Link the BAM dataset and its index metadata file into the working
    ## directory under fixed names (input.bam / input.bam.bai).
    ln -s '${input_bam}' 'input.bam' &&
    ln -s '${input_bam.metadata.bam_index}' 'input.bam.bai' &&

    umi_tools count
            -I input.bam
            ## Boolean parameters render either their fixed flag or an empty
            ## string. They are deliberately left unquoted: quoting an empty
            ## value would hand umi_tools a stray empty positional argument.
            $paired
            --extract-umi-method='$barcodes.extract_umi_method.value'
            #if str($barcodes.extract_umi_method) == 'read_id':
            --umi-separator='$barcodes.umi_separator.value'
            #else if str($barcodes.extract_umi_method) == 'tag':
            --umi-tag='$barcodes.umi_tag.value'
            ## Only forward the cell tag when one was entered, mirroring the
            ## guards used for the gene tag and the skip regex below.
            #if str($barcodes.cell_tag) != '':
            --cell-tag='$barcodes.cell_tag.value'
            #end if
            #end if
            --method='$method.value'
            --edit-distance-threshold='$edit_distance_threshold'
            --mapping-quality='$advanced.mapping_quality'
            ## Counting is always done per gene; the wrapper hard-codes this.
            --per-gene
            $wide_format_cell_counts
            $advanced.per_contig
            $advanced.per_cell
            #if str($advanced.gene_tag) != "":
            --gene-tag='$advanced.gene_tag.value'
            #end if
            #if str($advanced.skip_tags_regex) != "":
            --skip-tags-regex='$advanced.skip_tags_regex.value'
            #end if
            ## The form default 0 means that no seed was requested. The value is
            ## compared as text: the previous test compared a quoted string with
            ## the integer 0, which is never equal, so the seed was always passed.
            #if str($advanced.random_seed) != '0':
            --random-seed='$advanced.random_seed'
            #end if
            -S '$out_counts'
    ]]></command>
    <inputs>
        <!-- BAM dataset to quantify. -->
        <param name="input_bam"
               type="data"
               format="bam"
               label="Sorted BAM file"
               help="Please use the samtools sort tool to ensure a correct BAM input"/>
        <!-- Renders the paired flag when checked and an empty string otherwise. -->
        <param argument="--paired"
               type="boolean"
               truevalue="--paired"
               falsevalue=""
               checked="false"
               label="Bam is paired-end"
               help="both read pairs will be output. This will also force the use of the template length to determine reads with the same mapping coordinates."/>
        <!-- Chooses how the UMI is located in each read. Every branch exposes
             only the extra inputs that the selected extraction method uses;
             the "umis" branch needs none. -->
        <conditional name="barcodes" >
            <param argument="--extract-umi-method" name="extract_umi_method" type="select" label="Umi Extract Method" help="How are the barcodes encoded in the read?" >
                <option value="read_id" selected="true">Barcodes are contained at the end of the read separated by a delimiter</option>
                <option value="tag" >Barcodes are contained in tags</option>
                <option value="umis" >Barcodes were extracted using umis</option>
            </param>
            <when value="read_id" >
                <param argument="--umi-separator" name="umi_separator" type="text" label="Delimiter between read id and the UMI" value="_" >
                    <sanitizer invalid_char="" >
                        <!-- Any punctuation may serve as the delimiter except the
                             single quote: the command template wraps this value in
                             single quotes, so a quote character would end the shell
                             string early and break (or alter) the command line. -->
                        <valid initial="string.punctuation" >
                            <remove value="&apos;" />
                        </valid>
                    </sanitizer>
                </param>
            </when>
            <when value="tag" >
                <param argument="--umi-tag" name="umi_tag" type="text" label="Tag which contains the UMI" >
                    <expand macro="sanitize_tag" />
                </param>
                <param argument="--cell-tag" name="cell_tag" type="text" label="Tag which contains the cell barcode" >
                    <expand macro="sanitize_tag" />
                </param>
            </when>
            <when value="umis"></when>
        </conditional>
        <!-- Read grouping strategy; the option values are passed verbatim as the
             method argument of umi_tools count. "directional" is preselected. -->
        <param argument="--method" type="select" label="Method to identify group of reads" help="UMIs with the same (or similar) codes can be grouped together. The simplest methods 'unique' and 'percentile' group identical
UMIs, however 'cluster', 'adjacency', and 'directional' can group similar umis with edit distances less than some threshold. Unique: Reads group share the exact same UMI. Percentile: Reads group share the same UMI, and UMIs with
counts &lt; 1% of the median counts for UMIs at the same position are ignored. Cluster: Identify clusters of connected UMIs (based on hamming distance threshold). Adjacency: Same as cluster, but considers only directly adjacent UMIs in the cluster. Directional: Identify clusters of connected UMIs based on hamming distance and UMI counts." >
            <option value="unique" >Unique</option>
            <option value="percentile">Percentile</option>
            <option value="cluster">Cluster</option>
            <option value="adjacency">Adjacency</option>
            <option value="directional" selected="true" >Directional</option>
        </param>
        <!-- Non-negative integer forwarded as the edit distance threshold. -->
        <param argument="--edit-distance-threshold"
               name="edit_distance_threshold"
               type="integer"
               min="0"
               value="1"
               label="Edit distance threshold"/>
        <!-- Checked by default; renders the wide format flag or an empty string. -->
        <param argument="--wide-format-cell-counts"
               name="wide_format_cell_counts"
               type="boolean"
               truevalue="--wide-format-cell-counts"
               falsevalue=""
               checked="true"
               label="Output a matrix of genes and cells, instead of a flat file"/>
        <!-- Less frequently changed options. The command template omits the gene
             tag and the skip regex when they are left empty. -->
        <section name="advanced" title="Extra parameters" >
            <param argument="--mapping-quality" name="mapping_quality" type="integer" min="0" value="0" label="Minimum mapping quality" />
            <!-- Currently hard-coded parameter. Leave here if useful to future wrapper  -->
            <!-- <param argument="-\-per-gene" name="per_gene" type="text" label="Group reads together if they have the same gene" help="Reads will be grouped together if they have the same gene. This is useful if your library
prep generates PCR duplicates with non-identical alignment positions such as CEL-Seq. Note this option is hardcoded to be on with the count command. I.e counting is always performed per-gene. Must be combined with either
-\-gene-tag or -\-per-contig option" /> -->
            <param argument="--gene-tag" name="gene_tag" type="text" label="Deduplicate per gene." help="The gene information is encoded in the bam read tag." value="XT" >
                <expand macro="sanitize_tag" />
            </param>
            <param argument="--skip-tags-regex" name="skip_tags_regex" type="text" label="Skip any reads where the gene matches this tag" value="" >
                <!-- Whitelist of characters allowed in the regular expression; anything
                     else is removed. The value is single-quoted in the command, so
                     the single quote itself must stay excluded from this list. -->
                <sanitizer invalid_char="">
                    <valid initial="string.letters,string.digits">
                        <add value="!="/>
                        <add value="-"/>
                        <add value="_"/>
                        <add value="."/>
                        <add value="?"/>
                        <!-- The pipe is required for alternation; without it a pattern
                             such as ^[__|Unassigned] is silently rewritten. -->
                        <add value="|"/><!-- pipe -->
                        <add value="&lt;"/><!-- left triangle bracket -->
                        <add value="&gt;"/><!-- right triangle bracket -->
                        <add value="&#91;"/> <!-- left square bracket -->
                        <add value="&#93;"/> <!-- right square bracket -->
                        <add value="&#94;"/> <!-- caret -->
                        <add value="&#123;"/> <!-- left curly -->
                        <add value="&#125;"/> <!-- right curly -->
                        <add value="&#40;"/> <!-- left parenthesis -->
                        <add value="&#41;"/> <!-- right parenthesis -->
                    </valid>
                </sanitizer>
            </param>
            <param argument="--per-contig" name="per_contig" type="boolean" truevalue="--per-contig" falsevalue="" checked="false" label="Deduplicate per contig (field 3 in BAM; RNAME)"  help="All reads with the same contig will be considered to have the same alignment position. This is useful if you have aligned to a reference transcriptome with one transcript per gene." />
            <param argument="--per-cell" name="per_cell" type="boolean" truevalue="--per-cell" falsevalue="" checked="true" label="Group reads only if they have the same cell barcode." />
            <!-- The command template treats 0 as the "no seed" value. -->
            <param argument="--random-seed" name="random_seed" type="integer" min="0" value="0" label="Random Seed" />
        </section>
    </inputs>
    <outputs>
        <!-- Tabular dataset that the command writes through the -S option. -->
        <data format="tabular" name="out_counts"/>
    </outputs>
    <tests>
        <!-- count_single_gene_tag: gene tag XF, per-cell grouping off, flat output -->
        <test>
            <param name="input_bam" value="chr19_gene_tags.bam"/>
            <param name="extract_umi_method" value="umis"/>
            <param name="method" value="directional"/>
            <param name="wide_format_cell_counts" value="false"/>
            <param name="gene_tag" value="XF"/>
            <param name="skip_tags_regex" value="^[__|Unassigned]"/>
            <param name="per_cell" value="false"/>
            <param name="random_seed" value="123456789"/>
            <output name="out_counts" value="count_single_gene_tag.tsv"/>
        </test>
        <!-- count_single_cells_gene_tag: as above with per-cell grouping on -->
        <test>
            <param name="input_bam" value="chr19_gene_tags.bam"/>
            <param name="extract_umi_method" value="umis"/>
            <param name="method" value="directional"/>
            <param name="wide_format_cell_counts" value="false"/>
            <param name="gene_tag" value="XF"/>
            <param name="skip_tags_regex" value="^[__|Unassigned]"/>
            <param name="per_cell" value="true"/>
            <param name="random_seed" value="123456789"/>
            <output name="out_counts" value="count_single_cells_gene_tag.tsv"/>
        </test>
        <!-- count_single_cells_wide_gene_tag: per-cell grouping on, wide matrix output -->
        <test>
            <param name="input_bam" value="chr19_gene_tags.bam"/>
            <param name="extract_umi_method" value="umis"/>
            <param name="method" value="directional"/>
            <param name="wide_format_cell_counts" value="true"/>
            <param name="gene_tag" value="XF"/>
            <param name="skip_tags_regex" value="^[__|Unassigned]"/>
            <param name="per_cell" value="true"/>
            <param name="random_seed" value="123456789"/>
            <output name="out_counts" value="count_single_cells_gene_tag_wide.tsv"/>
        </test>
        <!-- count ENSDARG00000019692: form defaults except for the unique method -->
        <test>
            <param name="input_bam" value="fc.ENSDARG00000019692.bam"/>
            <param name="method" value="unique"/>
            <output name="out_counts" value="fc.ENSDARG00000019692.counts"/>
        </test>
    </tests>
    <help><![CDATA[

UMI Tools count - Count reads per gene from BAM using UMIs
----------------------------------------------------------

Purpose
-------

The purpose of this command is to count the number of reads per gene based
on the mapping co-ordinate and the UMI attached to the read.


It is assumed that the FASTQ files were processed with extract_umi.py
before mapping and thus the UMI is the last word of the read name. e.g:

@HISEQ:87:00000000_AATT

where AATT is the UMI sequence.

If you have used an alternative method which does not separate the
read id and UMI with a "_", such as bcl2fastq which uses ":", you can
specify the separator, or if your UMIs are encoded in a tag you can also specify this.

    ]]></help>
    <expand macro="citations" />
</tool>
author	iuc
date	Sat, 14 Jul 2018 06:14:24 -0400
parents	8db56d2f8b72
children	b557acca0b56