view umi-tools_counts.xml @ 11:9570563fb686 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit be699fdf0360f0535f52564e5b59be9b84712b14
author iuc
date Sat, 28 Sep 2024 16:40:55 +0000
parents e654095ab143
children 71ad4a56c40c
line wrap: on
line source

<tool id="umi_tools_count" name="UMI-tools count" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>performs quantification of UMIs from BAM files</description>
    <expand macro="bio_tools"/>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements">
        <!-- Tool-specific packages listed inside the shared requirements macro
             (presumably merged with the macro's own entries; confirm in macros.xml). -->
        <!-- TODO see comment in LINK_SAM_BAM_INPUT -->
        <requirement type="package" version="1.12">samtools</requirement>
        <!-- sed performs the optional header relabeling at the end of the command section. -->
        <requirement type="package" version="4.7">sed</requirement>
    </expand>
    <command detect_errors="exit_code"><![CDATA[
        ## The re module is needed further down to derive a header-safe label
        ## from the name of the input dataset.
        #import re
        @LINK_SAM_BAM_INPUT@

        umi_tools count
            ## Boolean flag: renders as --wide-format-cell-counts when checked and as
            ## nothing when unchecked. It is deliberately left unquoted, because a
            ## quoted empty value would reach umi_tools as an empty positional argument.
            $wide_format_cell_counts
            @BARCODE_OPTIONS@
            @UMI_GROUPING_OPTIONS@
            @SC_OPTIONS@
            @SAMBAM_OPTIONS@
            @ADVANCED_OPTIONS@
            -I '$input_file' -S '$out_counts'
            @LOG@
        ## Optional post-processing: prefix the barcode columns of the header line.
        #if str($cond_extra.prepender) != "none":
            #if str($cond_extra.prepender) == "string":
                ## User-supplied label; its characters are restricted by the sanitizer
                ## declared on the custom_label parameter.
                #set $replacer = str($cond_extra.custom_label)
            #else
                ## Dataset name: drop the text after the last dot, then collapse every
                ## run of non-word characters into a single underscore.
                #set $replacer = re.sub('[^\w\_]+', '_', str($input.element_identifier.rsplit('.',1)[0]))
            #end if
            ## Edit line 1 of the count table in place: every stand-alone run of
            ## A, C, G, T letters gets the label plus an underscore prepended.
            && sed -i -r '1s|\b([ACGT]+)\b|'"$replacer"'_\1|g' '$out_counts'
        #end if
    ]]></command>
    <inputs>
        <!-- Alignments to quantify. The command section also reads this dataset's
             element_identifier when the "Dataset Name" label option is chosen. -->
        <param name="input" type="data" format="sam,bam" label="Reads to count in SAM or BAM format" help="Please use the samtools sort tool to ensure a correct BAM input" />
        <!-- The truevalue is inserted verbatim into the command line when checked;
             unchecked renders as an empty string. -->
        <param argument="--wide-format-cell-counts" name="wide_format_cell_counts" type="boolean" truevalue="--wide-format-cell-counts" falsevalue="" checked="true" label="Output a matrix of genes and cells, instead of a flat file" />
        <expand macro="barcode_options_macro"/>
        <expand macro="umi_grouping_options_macro"/>
        <expand macro="sambam_options_macro"/>
        <expand macro="sc_options_macro"/>
        <expand macro="advanced_options_macro"/>
        <!-- Optional relabeling of the header line of the count table; it is applied
             with sed after counting has finished. -->
        <conditional name="cond_extra">
            <param name="prepender" type="select" label="Prepend a label to all column headers" help="This preserves uniqueness when merging with other files with the same headers. For 'Dataset Name', the text after the last '.' in the name is dropped and every remaining run of characters other than letters, digits or '_' is replaced by a single '_'.">
                <option value="none" selected="true">No modifications</option>
                <option value="string">Custom Label</option>
                <option value="dataset name">Dataset Name</option>
            </param>
            <when value="none"/>
            <when value="string">
                <!-- Only letters, digits, '-', '_' and '.' survive sanitizing; any other
                     character is removed (invalid_char is empty). -->
                <param name="custom_label" type="text" label="Label to Prepend">
                    <sanitizer invalid_char="">
                        <valid initial="string.letters,string.digits">
                            <add value="-"/>
                            <add value="_"/>
                            <add value="."/>
                        </valid>
                    </sanitizer>
                </param>
            </when>
            <when value="dataset name"/>
        </conditional>
        <expand macro="log_input_macro"/>
    </inputs>
    <outputs>
        <!-- Count table that umi_tools writes through its -S option; when a header
             label is requested, its first line is edited in place by sed. -->
        <data name="out_counts" format="tabular" />
        <expand macro="log_output_macro"/>
    </outputs>
    <tests>
        <test><!-- count_single_gene_tag: BAM input, gene tag XF, directional method, per_cell off, flat (non-wide) output -->
            <param name="input" value="chr19_gene_tags.bam" />
            <section name="advanced">
                <param name="random_seed" value="123456789" />
            </section>
            <section name="sc">
                <param name="gene_tag" value="XF" />
                <param name="skip_tags_regex" value="^[__|Unassigned]" />
                <param name="per_cell" value="false" />
            </section>
            <conditional name="bc">
                <param name="extract_umi_method" value="umis" />
            </conditional>
            <section name="umi">
                <param name="method" value="directional" />
            </section>
            <param name="wide_format_cell_counts" value="false" />
            <output name="out_counts" value="count_single_gene_tag.tsv" />
        </test>
        <test><!-- count_single_gene_tag with SAM input: same parameters and same expected file as the BAM test above -->
            <param name="input" value="chr19_gene_tags.sam" />
            <section name="advanced">
                <param name="random_seed" value="123456789" />
            </section>
            <section name="sc">
                <param name="gene_tag" value="XF" />
                <param name="skip_tags_regex" value="^[__|Unassigned]" />
                <param name="per_cell" value="false" />
            </section>
            <conditional name="bc">
                <param name="extract_umi_method" value="umis" />
            </conditional>
            <section name="umi">
                <param name="method" value="directional" />
            </section>
            <param name="wide_format_cell_counts" value="false" />
            <output name="out_counts" value="count_single_gene_tag.tsv" />
        </test>
        <test><!-- count_single_cells_gene_tag: per_cell on, flat (non-wide) output -->
            <param name="input" value="chr19_gene_tags.bam" />
            <section name="advanced">
                <param name="random_seed" value="123456789" />
            </section>
            <section name="sc">
                <param name="gene_tag" value="XF" />
                <param name="skip_tags_regex" value="^[__|Unassigned]" />
                <param name="per_cell" value="true" />
            </section>
            <conditional name="bc">
                <param name="extract_umi_method" value="umis" />
            </conditional>
            <section name="umi">
                <param name="method" value="directional" />
            </section>
            <param name="wide_format_cell_counts" value="false" />
            <output name="out_counts" value="count_single_cells_gene_tag.tsv" />
        </test>
        <test><!-- count_single_cells_wide_gene_tag: per_cell on, wide matrix output -->
            <param name="input" value="chr19_gene_tags.bam" />
            <section name="advanced">
                <param name="random_seed" value="123456789" />
            </section>
            <section name="sc">
                <param name="gene_tag" value="XF" />
                <param name="skip_tags_regex" value="^[__|Unassigned]" />
                <param name="per_cell" value="true" />
            </section>
            <conditional name="bc">
                <param name="extract_umi_method" value="umis" />
            </conditional>
            <section name="umi">
                <param name="method" value="directional" />
            </section>
            <param name="wide_format_cell_counts" value="true" />
            <output name="out_counts" value="count_single_cells_gene_tag_wide.tsv" />
        </test>
        <test><!-- ENSDARG00000019692: gene tag XT, unique method, per_cell on; wide format and header label left at their defaults -->
            <param name="input" value="fc.ENSDARG00000019692.bam" />
            <section name="advanced">
                <param name="random_seed" value="0" />
            </section>
            <section name="sc">
                <param name="gene_tag" value="XT" />
                <param name="per_cell" value="true" />
            </section>
            <section name="umi">
                <param name="method" value="unique" />
            </section>
            <output name="out_counts" value="fc.ENSDARG00000019692.counts" />
        </test>
        <test><!-- ENSDARG00000019692 again, header columns prefixed with the custom label "test" -->
            <param name="input" value="fc.ENSDARG00000019692.bam" />
            <section name="advanced">
                <param name="random_seed" value="0" />
            </section>
            <section name="sc">
                <param name="gene_tag" value="XT" />
                <param name="per_cell" value="true" />
            </section>
            <section name="umi">
                <param name="method" value="unique" />
            </section>
            <conditional name="cond_extra" >
                <param name="prepender" value="string" />
                <param name="custom_label" value="test" />
            </conditional>
            <output name="out_counts" value="fc.ENSDARG00000019692.counts.test" />
        </test>
        <test><!-- ENSDARG00000019692 again, header columns prefixed with a label derived from the input dataset name -->
            <param name="input" value="fc.ENSDARG00000019692.bam" />
            <section name="advanced">
                <param name="random_seed" value="0" />
            </section>
            <section name="sc">
                <param name="gene_tag" value="XT" />
                <param name="per_cell" value="true" />
            </section>
            <section name="umi">
                <param name="method" value="unique" />
            </section>
            <conditional name="cond_extra" >
                <param name="prepender" value="dataset name" />
            </conditional>
            <output name="out_counts" value="fc.ENSDARG00000019692.counts.name" />
        </test>
    </tests>
    <help><![CDATA[

count - Count reads per gene from BAM using UMIs and mapping coordinates
========================================================================

This tool is only designed to work with library preparation
methods where the fragmentation occurs after amplification, as per
most single cell RNA-Seq methods (e.g. 10x, inDrop, Drop-seq, SCRB-seq
and CEL-seq2). Since the precise mapping co-ordinate is no longer
informative for such library preparations, it is simplified to the
gene. This is a reasonable approach providing the number of available
UMIs is sufficiently high and the sequencing depth is sufficiently low
that the probability of two reads from the same gene having the same
UMIs is acceptably low.

If you want to count reads per gene for library preparations which
fragment prior to amplification (e.g. bulk RNA-Seq), please use
``umi_tools dedup`` to remove the duplicate reads as this will use the
full information from the mapping co-ordinate. Then use a read
counting tool such as FeatureCounts or HTSeq to count the reads per
gene.

In the rare case of bulk RNA-Seq using a library preparation method
with fragmentation after amplification, one can still use ``count`` but
note that it has not been tested on bulk RNA-Seq.

This tool deviates from group and dedup in that the ``--per-gene`` option
is hardcoded on.

@BARCODE_HELP@

@UMI_GROUPING_HELP@

    ]]></help>
    <expand macro="citations" />
</tool>