diff umi-tools_counts.xml @ 8:e654095ab143 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author iuc
date Mon, 13 Sep 2021 14:49:44 +0000
parents 276b4111b253
children 71ad4a56c40c
line wrap: on
line diff
--- a/umi-tools_counts.xml	Wed Feb 10 19:29:55 2021 +0000
+++ b/umi-tools_counts.xml	Mon Sep 13 14:49:44 2021 +0000
@@ -1,115 +1,44 @@
-<tool id="umi_tools_count" name="UMI-tools count" version="@VERSION@.1">
+<tool id="umi_tools_count" name="UMI-tools count" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
     <description>performs quantification of UMIs from BAM files</description>
+    <expand macro="bio_tools"/>
     <macros>
         <import>macros.xml</import>
-        <xml name="sanitize_tag" >
-            <sanitizer invalid_char="">
-                <valid initial="string.letters,string.digits" />
-            </sanitizer>
-        </xml>
     </macros>
     <expand macro="requirements">
+        <!-- TODO see comment in LINK_SAM_BAM_INPUT -->
+        <requirement type="package" version="1.12">samtools</requirement>
         <requirement type="package" version="4.7">sed</requirement>
     </expand>
     <command detect_errors="exit_code"><![CDATA[
-#import re
-
-ln -s '${input_bam}' 'input.bam' &&
-ln -s '${input_bam.metadata.bam_index}' 'input.bam.bai' &&
+        #import re
+        @LINK_SAM_BAM_INPUT@        
 
-umi_tools count
-    -I input.bam
-    '$paired'
-    --extract-umi-method='$barcodes.extract_umi_method.value'
-#if str($barcodes.extract_umi_method) == 'read_id':
-    --umi-separator='$barcodes.umi_separator.value'
-#else if str($barcodes.extract_umi_method) == 'tag':
-    --umi-tag='$barcodes.umi_tag.value'
-    --cell-tag='$barcodes.cell_tag.value'
-#end if
-    --method='$method.value'
-    --edit-distance-threshold='$edit_distance_threshold'
-    --mapping-quality='$advanced.mapping_quality'
-    --per-gene
-    '$wide_format_cell_counts'
-    '$advanced.per_contig'
-    '$advanced.per_cell'
-
-#if str($advanced.gene_tag) != "":
-    --gene-tag='$advanced.gene_tag.value'
-#end if
-#if str($advanced.skip_tags_regex) != "":
-    --skip-tags-regex='$advanced.skip_tags_regex.value'
-#end if
-#if '$advanced.random_seed' != 0:
-    --random-seed='$advanced.random_seed'
-#end if
-    -S '$out_counts'
-
-
-#if str($cond_extra.prepender) != "none":
-#set $replacer = re.sub('[^\w\_]+', '_', str($input_bam.element_identifier.rsplit('.',1)[0]))
-    #if str($cond_extra.prepender) == "string":
-#set $replacer = str($cond_extra.custom_label)
-    #end if
-
-&& sed -i -r '1s|\b([ACGT]+)\b|'"$replacer"'_\1|g' '$out_counts'
-#end if
-
+        umi_tools count
+            '$wide_format_cell_counts'
+            @BARCODE_OPTIONS@
+            @UMI_GROUPING_OPTIONS@
+            @SC_OPTIONS@
+            @SAMBAM_OPTIONS@
+            @ADVANCED_OPTIONS@
+            -I '$input_file' -S '$out_counts'
+            @LOG@
+        #if str($cond_extra.prepender) != "none":
+            #if str($cond_extra.prepender) == "string":
+                #set $replacer = str($cond_extra.custom_label)
+            #else
+                #set $replacer = re.sub('[^\w\_]+', '_', str($input.element_identifier.rsplit('.',1)[0]))
+            #end if
+            && sed -i -r '1s|\b([ACGT]+)\b|'"$replacer"'_\1|g' '$out_counts'
+        #end if
     ]]></command>
     <inputs>
-        <param name="input_bam" type="data" format="bam" label="Sorted BAM file" help="Please use the samtools sort tool to ensure a correct BAM input" />
-        <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" checked="false" label="Bam is paired-end" help="both read pairs will be output. This will also force the use of the template length to determine reads with the same mapping coordinates." />
-        <conditional name="barcodes" >
-            <param argument="--extract-umi-method" name="extract_umi_method" type="select" label="Umi Extract Method" help="How are the barcodes encoded in the read?" >
-                <option value="read_id" selected="true">Barcodes are contained at the end of the read seperated by a delimiter</option>
-                <option value="tag" >Barcodes are contained in tags</option>
-                <option value="umis" >Barcodes were extracted using umis</option>
-            </param>
-            <when value="read_id" >
-                <param argument="--umi-separator" name="umi_separator" type="text" label="Delimiter between read id and the UMI" value="_" >
-                    <sanitizer invalid_char="" >
-                        <valid initial="string.punctuation" />
-                    </sanitizer>
-                </param>
-            </when>
-            <when value="tag" >
-                <param argument="--umi-tag" name="umi_tag" type="text" label="Tag which contains the UMI" >
-                    <expand macro="sanitize_tag" />
-                </param>
-                <param argument="--cell-tag" name="cell_tag" type="text" label="Tag which contains the cell barcode" >
-                    <expand macro="sanitize_tag" />
-                </param>
-            </when>
-            <when value="umis"></when>
-        </conditional>
-        <param argument="--method"  type="select" label="Method to identify group of reads" help="UMIs with the same (or similar) codes can be grouped together. The simplest methods 'unique' and 'percentile' group identical
-UMIs, however 'cluster', 'adjacency', and 'directional' can group similar umis with edit distances less than some threshold. Unique: Reads group share the exact same UMI. Percentile: Reads group share the same UMI, and UMIs with
-counts &lt; 1% of the median counts for UMIs at the same position are ignored. Cluster: Identify clusters of connected UMIs (based on hamming distance threshold). Adjacency: Same as cluster, but considers only directly ajacent UMIs in the cluster. Directional: Identify cluster of connected UMIs based on hamming distance and umi." >
-            <option value="unique" >Unique</option>
-            <option value="percentile">Percentile</option>
-            <option value="cluster">Cluster</option>
-            <option value="adjacency">Adjacency</option>
-            <option value="directional" selected="true" >Directional</option>
-        </param>
-        <param argument="--edit-distance-threshold" name="edit_distance_threshold" type="integer" label="Edit distance threshold" min="0" value="1" />
+        <param name="input" type="data" format="sam,bam" label="Reads to deduplicate in SAM or BAM format" help="Please use the samtools sort tool to ensure a correct BAM input" />
         <param argument="--wide-format-cell-counts" name="wide_format_cell_counts" type="boolean" truevalue="--wide-format-cell-counts" falsevalue="" checked="true" label="Output a matrix of genes and cells, instead of a flat file" />
-        <section name="advanced" title="Extra parameters" >
-            <param argument="--mapping-quality" name="mapping_quality" type="integer" min="0" value="0" label="Minimum mapping quality" />
-            <!-- Currently hard-coded parameter. Leave here if useful to future wrapper  -->
-            <!-- <param argument="-\-per-gene" name="per_gene" type="text" label="Group reads together if they have the same gene" help="Reads will be grouped together if they have the same gene. This is useful if your library
-prep generates PCR duplicates with non-identical alignment positions such as CEL-Seq. Note this option is hardcoded to be on with the count command. I.e counting is always performed per-gene. Must be combined with either
--\-gene-tag or -\-per-contig option" /> -->
-            <param argument="--gene-tag" name="gene_tag" type="text" label="Deduplicate per gene." value="XT" help="The gene information is encoded in the bam read tag." >
-                <expand macro="sanitize_tag" />
-            </param>
-            <param argument="--skip-tags-regex" name="skip_tags_regex" type="text" label="Skip any reads where the gene matches this tag" value="" >
-                <expand macro="barcode_sanitizer" />
-            </param>
-            <param argument="--per-contig" name="per_contig" type="boolean" truevalue="--per-contig" falsevalue="" checked="false" label="Deduplicate per contig (field 3 in BAM; RNAME)"  help="All reads with the same contig will be considered to have the same alignment position. This is useful if you have aligned to a reference transcriptome with one transcript per gene." />
-            <param argument="--per-cell" name="per_cell" type="boolean" truevalue="--per-cell" falsevalue="" checked="true" label="Group reads only if they have the same cell barcode." />
-            <param argument="--random-seed" name="random_seed" type="integer" min="0" value="0" label="Random Seed" />
-        </section>
+        <expand macro="barcode_options_macro"/>
+        <expand macro="umi_grouping_options_macro"/>
+        <expand macro="sambam_options_macro"/>
+        <expand macro="sc_options_macro"/>
+        <expand macro="advanced_options_macro"/>
         <conditional name="cond_extra" >
             <param name="prepender" type="select" label="Prepend a label to all column headers" help="This preserves uniqueness when merging with other files with the same headers. Note: filename must not contain a '.' character" >
                 <option value="none" selected="true" >No modifications</option>
@@ -130,52 +59,115 @@
                 </param>
             </when>
         </conditional>
+        <expand macro="log_input_macro"/>
     </inputs>
     <outputs>
         <data name="out_counts" format="tabular" />
+        <expand macro="log_output_macro"/>
     </outputs>
     <tests>
         <test><!--count_single_gene_tag:-->
-            <param name="input_bam" value="chr19_gene_tags.bam" />
-            <param name="random_seed" value="123456789" />
-            <param name="method" value="directional" />
-            <param name="gene_tag" value="XF" />
-            <param name="skip_tags_regex" value="^[__|Unassigned]" />
-            <param name="extract_umi_method" value="umis" />
+            <param name="input" value="chr19_gene_tags.bam" />
+            <section name="advanced">
+                <param name="random_seed" value="123456789" />
+            </section>
+            <section name="sc">
+                <param name="gene_tag" value="XF" />
+                <param name="skip_tags_regex" value="^[__|Unassigned]" />
+                <param name="per_cell" value="false" />
+            </section>
+            <conditional name="bc">
+                <param name="extract_umi_method" value="umis" />
+            </conditional>
+            <section name="umi">
+                <param name="method" value="directional" />
+            </section>
             <param name="wide_format_cell_counts" value="false" />
-            <param name="per_cell" value="false" />
+            <output name="out_counts" value="count_single_gene_tag.tsv" />
+        </test>
+        <test><!--count_single_gene_tag .. with sam input-->
+            <param name="input" value="chr19_gene_tags.sam" />
+            <section name="advanced">
+                <param name="random_seed" value="123456789" />
+            </section>
+            <section name="sc">
+                <param name="gene_tag" value="XF" />
+                <param name="skip_tags_regex" value="^[__|Unassigned]" />
+                <param name="per_cell" value="false" />
+            </section>
+            <conditional name="bc">
+                <param name="extract_umi_method" value="umis" />
+            </conditional>
+            <section name="umi">
+                <param name="method" value="directional" />
+            </section>
+            <param name="wide_format_cell_counts" value="false" />
             <output name="out_counts" value="count_single_gene_tag.tsv" />
         </test>
         <test><!--count_single_cells_gene_tag:-->
-            <param name="input_bam" value="chr19_gene_tags.bam" />
-            <param name="random_seed" value="123456789" />
-            <param name="method" value="directional" />
-            <param name="gene_tag" value="XF" />
-            <param name="skip_tags_regex" value="^[__|Unassigned]" />
-            <param name="per_cell" value="true" />
-            <param name="extract_umi_method" value="umis" />
+            <param name="input" value="chr19_gene_tags.bam" />
+            <section name="advanced">
+                <param name="random_seed" value="123456789" />
+            </section>
+            <section name="sc">
+                <param name="gene_tag" value="XF" />
+                <param name="skip_tags_regex" value="^[__|Unassigned]" />
+                <param name="per_cell" value="true" />
+            </section>
+            <conditional name="bc">
+                <param name="extract_umi_method" value="umis" />
+            </conditional>
+            <section name="umi">
+                <param name="method" value="directional" />
+            </section>
             <param name="wide_format_cell_counts" value="false" />
             <output name="out_counts" value="count_single_cells_gene_tag.tsv" />
         </test>
         <test><!--count_single_cells_wide_gene_tag:-->
-            <param name="input_bam" value="chr19_gene_tags.bam" />
-            <param name="random_seed" value="123456789" />
-            <param name="method" value="directional" />
-            <param name="gene_tag" value="XF" />
-            <param name="skip_tags_regex" value="^[__|Unassigned]" />
-            <param name="per_cell" value="true" />
-            <param name="extract_umi_method" value="umis" />
+            <param name="input" value="chr19_gene_tags.bam" />
+            <section name="advanced">
+                <param name="random_seed" value="123456789" />
+            </section>
+            <section name="sc">
+                <param name="gene_tag" value="XF" />
+                <param name="skip_tags_regex" value="^[__|Unassigned]" />
+                <param name="per_cell" value="true" />
+            </section>
+            <conditional name="bc">
+                <param name="extract_umi_method" value="umis" />
+            </conditional>
+            <section name="umi">
+                <param name="method" value="directional" />
+            </section>
             <param name="wide_format_cell_counts" value="true" />
             <output name="out_counts" value="count_single_cells_gene_tag_wide.tsv" />
         </test>
         <test><!-- count ENSDARG00000019692, with defaults -->
-            <param name="input_bam" value="fc.ENSDARG00000019692.bam" />
-            <param name="method" value="unique" />
+            <param name="input" value="fc.ENSDARG00000019692.bam" />
+            <section name="advanced">
+                <param name="random_seed" value="0" />
+            </section>
+            <section name="sc">
+                <param name="gene_tag" value="XT" />
+                <param name="per_cell" value="true" />
+            </section>
+            <section name="umi">
+                <param name="method" value="unique" />
+            </section>
             <output name="out_counts" value="fc.ENSDARG00000019692.counts" />
         </test>
         <test><!-- count ENSDARG00000019692, relabel string -->
-            <param name="input_bam" value="fc.ENSDARG00000019692.bam" />
-            <param name="method" value="unique" />
+            <param name="input" value="fc.ENSDARG00000019692.bam" />
+            <section name="advanced">
+                <param name="random_seed" value="0" />
+            </section>
+            <section name="sc">
+                <param name="gene_tag" value="XT" />
+                <param name="per_cell" value="true" />
+            </section>
+            <section name="umi">
+                <param name="method" value="unique" />
+            </section>
             <conditional name="cond_extra" >
                 <param name="prepender" value="string" />
                 <param name="custom_label" value="test" />
@@ -183,8 +175,17 @@
             <output name="out_counts" value="fc.ENSDARG00000019692.counts.test" />
         </test>
         <test><!-- count ENSDARG00000019692, relabel filename -->
-            <param name="input_bam" value="fc.ENSDARG00000019692.bam" />
-            <param name="method" value="unique" />
+            <param name="input" value="fc.ENSDARG00000019692.bam" />
+            <section name="advanced">
+                <param name="random_seed" value="0" />
+            </section>
+            <section name="sc">
+                <param name="gene_tag" value="XT" />
+                <param name="per_cell" value="true" />
+            </section>
+            <section name="umi">
+                <param name="method" value="unique" />
+            </section>
             <conditional name="cond_extra" >
                 <param name="prepender" value="dataset name" />
             </conditional>
@@ -193,26 +194,36 @@
     </tests>
     <help><![CDATA[
 
-UMI Tools count - Count reads per gene from BAM using UMIs
-----------------------------------------------------------
+count - Count reads per gene from BAM using UMIs and mapping coordinates
+========================================================================
 
-Purpose
--------
-
-The purpose of this command is to count the number of reads per gene based
-on the mapping co-ordinate and the UMI attached to the read.
-
+This tool is only designed to work with library preparation
+methods where the fragmentation occurs after amplification, as per
+most single cell RNA-Seq methods (e.g. 10x, inDrop, Drop-seq, SCRB-seq
+and CEL-seq2). Since the precise mapping co-ordinate is no longer
+informative for such library preparations, it is simplified to the
+gene. This is a reasonable approach providing the number of available
+UMIs is sufficiently high and the sequencing depth is sufficiently low
+that the probability of two reads from the same gene having the same
+UMIs is acceptably low.
 
-It is assumed that the FASTQ files were processed with extract_umi.py
-before mapping and thus the UMI is the last word of the read name. e.g:
-
-@HISEQ:87:00000000_AATT
+If you want to count reads per gene for library preparations which
+fragment prior to amplification (e.g. bulk RNA-Seq), please use
+``umi_tools dedup`` to remove the duplicate reads as this will use the
+full information from the mapping co-ordinate. Then use a read
+counting tool such as FeatureCounts or HTSeq to count the reads per
+gene.
 
-where AATT is the UMI sequeuence.
+In the rare case of bulk RNA-Seq using a library preparation method
+with fragmentation after amplification, one can still use ``count`` but
+note that it has not been tested on bulk RNA-Seq.
 
-If you have used an alternative method which does not separate the
-read id and UMI with a "_", such as bcl2fastq which uses ":", you can
-specify the separator, or if your UMIs are encoded in a tag you can also specify this.
+This tool deviates from group and dedup in that the ``--per-gene`` option
+is hardcoded on.
+
+@BARCODE_HELP@
+
+@UMI_GROUPING_HELP@
 
     ]]></help>
     <expand macro="citations" />