Mercurial > repos > iuc > umi_tools_count
diff umi-tools_counts.xml @ 8:e654095ab143 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author | iuc |
---|---|
date | Mon, 13 Sep 2021 14:49:44 +0000 |
parents | 276b4111b253 |
children | 71ad4a56c40c |
line wrap: on
line diff
--- a/umi-tools_counts.xml Wed Feb 10 19:29:55 2021 +0000 +++ b/umi-tools_counts.xml Mon Sep 13 14:49:44 2021 +0000 @@ -1,115 +1,44 @@ -<tool id="umi_tools_count" name="UMI-tools count" version="@VERSION@.1"> +<tool id="umi_tools_count" name="UMI-tools count" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> <description>performs quantification of UMIs from BAM files</description> + <expand macro="bio_tools"/> <macros> <import>macros.xml</import> - <xml name="sanitize_tag" > - <sanitizer invalid_char=""> - <valid initial="string.letters,string.digits" /> - </sanitizer> - </xml> </macros> <expand macro="requirements"> + <!-- TODO see comment in LINK_SAM_BAM_INPUT --> + <requirement type="package" version="1.12">samtools</requirement> <requirement type="package" version="4.7">sed</requirement> </expand> <command detect_errors="exit_code"><![CDATA[ -#import re - -ln -s '${input_bam}' 'input.bam' && -ln -s '${input_bam.metadata.bam_index}' 'input.bam.bai' && + #import re + @LINK_SAM_BAM_INPUT@ -umi_tools count - -I input.bam - '$paired' - --extract-umi-method='$barcodes.extract_umi_method.value' -#if str($barcodes.extract_umi_method) == 'read_id': - --umi-separator='$barcodes.umi_separator.value' -#else if str($barcodes.extract_umi_method) == 'tag': - --umi-tag='$barcodes.umi_tag.value' - --cell-tag='$barcodes.cell_tag.value' -#end if - --method='$method.value' - --edit-distance-threshold='$edit_distance_threshold' - --mapping-quality='$advanced.mapping_quality' - --per-gene - '$wide_format_cell_counts' - '$advanced.per_contig' - '$advanced.per_cell' - -#if str($advanced.gene_tag) != "": - --gene-tag='$advanced.gene_tag.value' -#end if -#if str($advanced.skip_tags_regex) != "": - --skip-tags-regex='$advanced.skip_tags_regex.value' -#end if -#if '$advanced.random_seed' != 0: - --random-seed='$advanced.random_seed' -#end if - -S '$out_counts' - - -#if str($cond_extra.prepender) != "none": -#set $replacer = re.sub('[^\w\_]+', '_', 
str($input_bam.element_identifier.rsplit('.',1)[0])) - #if str($cond_extra.prepender) == "string": -#set $replacer = str($cond_extra.custom_label) - #end if - -&& sed -i -r '1s|\b([ACGT]+)\b|'"$replacer"'_\1|g' '$out_counts' -#end if - + umi_tools count + '$wide_format_cell_counts' + @BARCODE_OPTIONS@ + @UMI_GROUPING_OPTIONS@ + @SC_OPTIONS@ + @SAMBAM_OPTIONS@ + @ADVANCED_OPTIONS@ + -I '$input_file' -S '$out_counts' + @LOG@ + #if str($cond_extra.prepender) != "none": + #if str($cond_extra.prepender) == "string": + #set $replacer = str($cond_extra.custom_label) + #else + #set $replacer = re.sub('[^\w\_]+', '_', str($input.element_identifier.rsplit('.',1)[0])) + #end if + && sed -i -r '1s|\b([ACGT]+)\b|'"$replacer"'_\1|g' '$out_counts' + #end if ]]></command> <inputs> - <param name="input_bam" type="data" format="bam" label="Sorted BAM file" help="Please use the samtools sort tool to ensure a correct BAM input" /> - <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" checked="false" label="Bam is paired-end" help="both read pairs will be output. This will also force the use of the template length to determine reads with the same mapping coordinates." /> - <conditional name="barcodes" > - <param argument="--extract-umi-method" name="extract_umi_method" type="select" label="Umi Extract Method" help="How are the barcodes encoded in the read?" 
> - <option value="read_id" selected="true">Barcodes are contained at the end of the read seperated by a delimiter</option> - <option value="tag" >Barcodes are contained in tags</option> - <option value="umis" >Barcodes were extracted using umis</option> - </param> - <when value="read_id" > - <param argument="--umi-separator" name="umi_separator" type="text" label="Delimiter between read id and the UMI" value="_" > - <sanitizer invalid_char="" > - <valid initial="string.punctuation" /> - </sanitizer> - </param> - </when> - <when value="tag" > - <param argument="--umi-tag" name="umi_tag" type="text" label="Tag which contains the UMI" > - <expand macro="sanitize_tag" /> - </param> - <param argument="--cell-tag" name="cell_tag" type="text" label="Tag which contains the cell barcode" > - <expand macro="sanitize_tag" /> - </param> - </when> - <when value="umis"></when> - </conditional> - <param argument="--method" type="select" label="Method to identify group of reads" help="UMIs with the same (or similar) codes can be grouped together. The simplest methods 'unique' and 'percentile' group identical -UMIs, however 'cluster', 'adjacency', and 'directional' can group similar umis with edit distances less than some threshold. Unique: Reads group share the exact same UMI. Percentile: Reads group share the same UMI, and UMIs with -counts < 1% of the median counts for UMIs at the same position are ignored. Cluster: Identify clusters of connected UMIs (based on hamming distance threshold). Adjacency: Same as cluster, but considers only directly ajacent UMIs in the cluster. Directional: Identify cluster of connected UMIs based on hamming distance and umi." 
> - <option value="unique" >Unique</option> - <option value="percentile">Percentile</option> - <option value="cluster">Cluster</option> - <option value="adjacency">Adjacency</option> - <option value="directional" selected="true" >Directional</option> - </param> - <param argument="--edit-distance-threshold" name="edit_distance_threshold" type="integer" label="Edit distance threshold" min="0" value="1" /> + <param name="input" type="data" format="sam,bam" label="Reads to deduplicate in SAM or BAM format" help="Please use the samtools sort tool to ensure a correct BAM input" /> <param argument="--wide-format-cell-counts" name="wide_format_cell_counts" type="boolean" truevalue="--wide-format-cell-counts" falsevalue="" checked="true" label="Output a matrix of genes and cells, instead of a flat file" /> - <section name="advanced" title="Extra parameters" > - <param argument="--mapping-quality" name="mapping_quality" type="integer" min="0" value="0" label="Minimum mapping quality" /> - <!-- Currently hard-coded parameter. Leave here if useful to future wrapper --> - <!-- <param argument="-\-per-gene" name="per_gene" type="text" label="Group reads together if they have the same gene" help="Reads will be grouped together if they have the same gene. This is useful if your library -prep generates PCR duplicates with non-identical alignment positions such as CEL-Seq. Note this option is hardcoded to be on with the count command. I.e counting is always performed per-gene. Must be combined with either --\-gene-tag or -\-per-contig option" /> --> - <param argument="--gene-tag" name="gene_tag" type="text" label="Deduplicate per gene." value="XT" help="The gene information is encoded in the bam read tag." 
> - <expand macro="sanitize_tag" /> - </param> - <param argument="--skip-tags-regex" name="skip_tags_regex" type="text" label="Skip any reads where the gene matches this tag" value="" > - <expand macro="barcode_sanitizer" /> - </param> - <param argument="--per-contig" name="per_contig" type="boolean" truevalue="--per-contig" falsevalue="" checked="false" label="Deduplicate per contig (field 3 in BAM; RNAME)" help="All reads with the same contig will be considered to have the same alignment position. This is useful if you have aligned to a reference transcriptome with one transcript per gene." /> - <param argument="--per-cell" name="per_cell" type="boolean" truevalue="--per-cell" falsevalue="" checked="true" label="Group reads only if they have the same cell barcode." /> - <param argument="--random-seed" name="random_seed" type="integer" min="0" value="0" label="Random Seed" /> - </section> + <expand macro="barcode_options_macro"/> + <expand macro="umi_grouping_options_macro"/> + <expand macro="sambam_options_macro"/> + <expand macro="sc_options_macro"/> + <expand macro="advanced_options_macro"/> <conditional name="cond_extra" > <param name="prepender" type="select" label="Prepend a label to all column headers" help="This preserves uniqueness when merging with other files with the same headers. Note: filename must not contain a '.' 
character" > <option value="none" selected="true" >No modifications</option> @@ -130,52 +59,115 @@ </param> </when> </conditional> + <expand macro="log_input_macro"/> </inputs> <outputs> <data name="out_counts" format="tabular" /> + <expand macro="log_output_macro"/> </outputs> <tests> <test><!--count_single_gene_tag:--> - <param name="input_bam" value="chr19_gene_tags.bam" /> - <param name="random_seed" value="123456789" /> - <param name="method" value="directional" /> - <param name="gene_tag" value="XF" /> - <param name="skip_tags_regex" value="^[__|Unassigned]" /> - <param name="extract_umi_method" value="umis" /> + <param name="input" value="chr19_gene_tags.bam" /> + <section name="advanced"> + <param name="random_seed" value="123456789" /> + </section> + <section name="sc"> + <param name="gene_tag" value="XF" /> + <param name="skip_tags_regex" value="^[__|Unassigned]" /> + <param name="per_cell" value="false" /> + </section> + <conditional name="bc"> + <param name="extract_umi_method" value="umis" /> + </conditional> + <section name="umi"> + <param name="method" value="directional" /> + </section> <param name="wide_format_cell_counts" value="false" /> - <param name="per_cell" value="false" /> + <output name="out_counts" value="count_single_gene_tag.tsv" /> + </test> + <test><!--count_single_gene_tag .. 
with sam input--> + <param name="input" value="chr19_gene_tags.sam" /> + <section name="advanced"> + <param name="random_seed" value="123456789" /> + </section> + <section name="sc"> + <param name="gene_tag" value="XF" /> + <param name="skip_tags_regex" value="^[__|Unassigned]" /> + <param name="per_cell" value="false" /> + </section> + <conditional name="bc"> + <param name="extract_umi_method" value="umis" /> + </conditional> + <section name="umi"> + <param name="method" value="directional" /> + </section> + <param name="wide_format_cell_counts" value="false" /> <output name="out_counts" value="count_single_gene_tag.tsv" /> </test> <test><!--count_single_cells_gene_tag:--> - <param name="input_bam" value="chr19_gene_tags.bam" /> - <param name="random_seed" value="123456789" /> - <param name="method" value="directional" /> - <param name="gene_tag" value="XF" /> - <param name="skip_tags_regex" value="^[__|Unassigned]" /> - <param name="per_cell" value="true" /> - <param name="extract_umi_method" value="umis" /> + <param name="input" value="chr19_gene_tags.bam" /> + <section name="advanced"> + <param name="random_seed" value="123456789" /> + </section> + <section name="sc"> + <param name="gene_tag" value="XF" /> + <param name="skip_tags_regex" value="^[__|Unassigned]" /> + <param name="per_cell" value="true" /> + </section> + <conditional name="bc"> + <param name="extract_umi_method" value="umis" /> + </conditional> + <section name="umi"> + <param name="method" value="directional" /> + </section> <param name="wide_format_cell_counts" value="false" /> <output name="out_counts" value="count_single_cells_gene_tag.tsv" /> </test> <test><!--count_single_cells_wide_gene_tag:--> - <param name="input_bam" value="chr19_gene_tags.bam" /> - <param name="random_seed" value="123456789" /> - <param name="method" value="directional" /> - <param name="gene_tag" value="XF" /> - <param name="skip_tags_regex" value="^[__|Unassigned]" /> - <param name="per_cell" value="true" /> - <param 
name="extract_umi_method" value="umis" /> + <param name="input" value="chr19_gene_tags.bam" /> + <section name="advanced"> + <param name="random_seed" value="123456789" /> + </section> + <section name="sc"> + <param name="gene_tag" value="XF" /> + <param name="skip_tags_regex" value="^[__|Unassigned]" /> + <param name="per_cell" value="true" /> + </section> + <conditional name="bc"> + <param name="extract_umi_method" value="umis" /> + </conditional> + <section name="umi"> + <param name="method" value="directional" /> + </section> <param name="wide_format_cell_counts" value="true" /> <output name="out_counts" value="count_single_cells_gene_tag_wide.tsv" /> </test> <test><!-- count ENSDARG00000019692, with defaults --> - <param name="input_bam" value="fc.ENSDARG00000019692.bam" /> - <param name="method" value="unique" /> + <param name="input" value="fc.ENSDARG00000019692.bam" /> + <section name="advanced"> + <param name="random_seed" value="0" /> + </section> + <section name="sc"> + <param name="gene_tag" value="XT" /> + <param name="per_cell" value="true" /> + </section> + <section name="umi"> + <param name="method" value="unique" /> + </section> <output name="out_counts" value="fc.ENSDARG00000019692.counts" /> </test> <test><!-- count ENSDARG00000019692, relabel string --> - <param name="input_bam" value="fc.ENSDARG00000019692.bam" /> - <param name="method" value="unique" /> + <param name="input" value="fc.ENSDARG00000019692.bam" /> + <section name="advanced"> + <param name="random_seed" value="0" /> + </section> + <section name="sc"> + <param name="gene_tag" value="XT" /> + <param name="per_cell" value="true" /> + </section> + <section name="umi"> + <param name="method" value="unique" /> + </section> <conditional name="cond_extra" > <param name="prepender" value="string" /> <param name="custom_label" value="test" /> @@ -183,8 +175,17 @@ <output name="out_counts" value="fc.ENSDARG00000019692.counts.test" /> </test> <test><!-- count ENSDARG00000019692, relabel 
filename --> - <param name="input_bam" value="fc.ENSDARG00000019692.bam" /> - <param name="method" value="unique" /> + <param name="input" value="fc.ENSDARG00000019692.bam" /> + <section name="advanced"> + <param name="random_seed" value="0" /> + </section> + <section name="sc"> + <param name="gene_tag" value="XT" /> + <param name="per_cell" value="true" /> + </section> + <section name="umi"> + <param name="method" value="unique" /> + </section> <conditional name="cond_extra" > <param name="prepender" value="dataset name" /> </conditional> @@ -193,26 +194,36 @@ </tests> <help><![CDATA[ -UMI Tools count - Count reads per gene from BAM using UMIs ----------------------------------------------------------- +count - Count reads per gene from BAM using UMIs and mapping coordinates +======================================================================== -Purpose -------- - -The purpose of this command is to count the number of reads per gene based -on the mapping co-ordinate and the UMI attached to the read. - +This tool is only designed to work with library preparation +methods where the fragmentation occurs after amplification, as per +most single cell RNA-Seq methods (e.g 10x, inDrop, Drop-seq, SCRB-seq +and CEL-seq2). Since the precise mapping co-ordinate is no longer +informative for such library preparations, it is simplified to the +gene. This is a reasonable approach providing the number of available +UMIs is sufficiently high and the sequencing depth is sufficiently low +that the probability of two reads from the same gene having the same +UMIs is acceptably low. -It is assumed that the FASTQ files were processed with extract_umi.py -before mapping and thus the UMI is the last word of the read name. 
e.g: - -@HISEQ:87:00000000_AATT +If you want to count reads per gene for library preparations which +fragment prior to amplification (e.g bulk RNA-Seq), please use +``umi_tools dedup`` to remove the duplicate reads as this will use the +full information from the mapping co-ordinate. Then use a read +counting tool such as FeatureCounts or HTSeq to count the reads per +gene. -where AATT is the UMI sequeuence. +In the rare case of bulk RNA-Seq using a library preparation method +with fragmentation after amplification, one can still use ``count`` but +note that it has not been tested on bulk RNA-Seq. -If you have used an alternative method which does not separate the -read id and UMI with a "_", such as bcl2fastq which uses ":", you can -specify the separator, or if your UMIs are encoded in a tag you can also specify this. +This tool deviates from group and dedup in that the ``--per-gene`` option +is hardcoded on. + +@BARCODE_HELP@ + +@UMI_GROUPING_HELP@ ]]></help> <expand macro="citations" />