Mercurial > repos > iuc > umi_tools_count
comparison umi-tools_counts.xml @ 8:e654095ab143 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author | iuc |
---|---|
date | Mon, 13 Sep 2021 14:49:44 +0000 |
parents | 276b4111b253 |
children | 71ad4a56c40c |
comparison
equal
deleted
inserted
replaced
7:8250ea3a1501 | 8:e654095ab143 |
---|---|
1 <tool id="umi_tools_count" name="UMI-tools count" version="@VERSION@.1"> | 1 <tool id="umi_tools_count" name="UMI-tools count" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> |
2 <description>performs quantification of UMIs from BAM files</description> | 2 <description>performs quantification of UMIs from BAM files</description> |
3 <expand macro="bio_tools"/> | |
3 <macros> | 4 <macros> |
4 <import>macros.xml</import> | 5 <import>macros.xml</import> |
5 <xml name="sanitize_tag" > | |
6 <sanitizer invalid_char=""> | |
7 <valid initial="string.letters,string.digits" /> | |
8 </sanitizer> | |
9 </xml> | |
10 </macros> | 6 </macros> |
11 <expand macro="requirements"> | 7 <expand macro="requirements"> |
8 <!-- TODO see comment in LINK_SAM_BAM_INPUT --> | |
9 <requirement type="package" version="1.12">samtools</requirement> | |
12 <requirement type="package" version="4.7">sed</requirement> | 10 <requirement type="package" version="4.7">sed</requirement> |
13 </expand> | 11 </expand> |
14 <command detect_errors="exit_code"><![CDATA[ | 12 <command detect_errors="exit_code"><![CDATA[ |
15 #import re | 13 #import re |
16 | 14 @LINK_SAM_BAM_INPUT@ |
17 ln -s '${input_bam}' 'input.bam' && | 15 |
18 ln -s '${input_bam.metadata.bam_index}' 'input.bam.bai' && | 16 umi_tools count |
19 | 17 '$wide_format_cell_counts' |
20 umi_tools count | 18 @BARCODE_OPTIONS@ |
21 -I input.bam | 19 @UMI_GROUPING_OPTIONS@ |
22 '$paired' | 20 @SC_OPTIONS@ |
23 --extract-umi-method='$barcodes.extract_umi_method.value' | 21 @SAMBAM_OPTIONS@ |
24 #if str($barcodes.extract_umi_method) == 'read_id': | 22 @ADVANCED_OPTIONS@ |
25 --umi-separator='$barcodes.umi_separator.value' | 23 -I '$input_file' -S '$out_counts' |
26 #else if str($barcodes.extract_umi_method) == 'tag': | 24 @LOG@ |
27 --umi-tag='$barcodes.umi_tag.value' | 25 #if str($cond_extra.prepender) != "none": |
28 --cell-tag='$barcodes.cell_tag.value' | 26 #if str($cond_extra.prepender) == "string": |
29 #end if | 27 #set $replacer = str($cond_extra.custom_label) |
30 --method='$method.value' | 28 #else |
31 --edit-distance-threshold='$edit_distance_threshold' | 29 #set $replacer = re.sub('[^\w\_]+', '_', str($input.element_identifier.rsplit('.',1)[0])) |
32 --mapping-quality='$advanced.mapping_quality' | 30 #end if |
33 --per-gene | 31 && sed -i -r '1s|\b([ACGT]+)\b|'"$replacer"'_\1|g' '$out_counts' |
34 '$wide_format_cell_counts' | 32 #end if |
35 '$advanced.per_contig' | |
36 '$advanced.per_cell' | |
37 | |
38 #if str($advanced.gene_tag) != "": | |
39 --gene-tag='$advanced.gene_tag.value' | |
40 #end if | |
41 #if str($advanced.skip_tags_regex) != "": | |
42 --skip-tags-regex='$advanced.skip_tags_regex.value' | |
43 #end if | |
44 #if '$advanced.random_seed' != 0: | |
45 --random-seed='$advanced.random_seed' | |
46 #end if | |
47 -S '$out_counts' | |
48 | |
49 | |
50 #if str($cond_extra.prepender) != "none": | |
51 #set $replacer = re.sub('[^\w\_]+', '_', str($input_bam.element_identifier.rsplit('.',1)[0])) | |
52 #if str($cond_extra.prepender) == "string": | |
53 #set $replacer = str($cond_extra.custom_label) | |
54 #end if | |
55 | |
56 && sed -i -r '1s|\b([ACGT]+)\b|'"$replacer"'_\1|g' '$out_counts' | |
57 #end if | |
58 | |
59 ]]></command> | 33 ]]></command> |
60 <inputs> | 34 <inputs> |
61 <param name="input_bam" type="data" format="bam" label="Sorted BAM file" help="Please use the samtools sort tool to ensure a correct BAM input" /> | 35 <param name="input" type="data" format="sam,bam" label="Reads to deduplicate in SAM or BAM format" help="Please use the samtools sort tool to ensure a correct BAM input" /> |
62 <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" checked="false" label="Bam is paired-end" help="both read pairs will be output. This will also force the use of the template length to determine reads with the same mapping coordinates." /> | |
63 <conditional name="barcodes" > | |
64 <param argument="--extract-umi-method" name="extract_umi_method" type="select" label="Umi Extract Method" help="How are the barcodes encoded in the read?" > | |
65 <option value="read_id" selected="true">Barcodes are contained at the end of the read separated by a delimiter</option> | |
66 <option value="tag" >Barcodes are contained in tags</option> | |
67 <option value="umis" >Barcodes were extracted using umis</option> | |
68 </param> | |
69 <when value="read_id" > | |
70 <param argument="--umi-separator" name="umi_separator" type="text" label="Delimiter between read id and the UMI" value="_" > | |
71 <sanitizer invalid_char="" > | |
72 <valid initial="string.punctuation" /> | |
73 </sanitizer> | |
74 </param> | |
75 </when> | |
76 <when value="tag" > | |
77 <param argument="--umi-tag" name="umi_tag" type="text" label="Tag which contains the UMI" > | |
78 <expand macro="sanitize_tag" /> | |
79 </param> | |
80 <param argument="--cell-tag" name="cell_tag" type="text" label="Tag which contains the cell barcode" > | |
81 <expand macro="sanitize_tag" /> | |
82 </param> | |
83 </when> | |
84 <when value="umis"></when> | |
85 </conditional> | |
86 <param argument="--method" type="select" label="Method to identify group of reads" help="UMIs with the same (or similar) codes can be grouped together. The simplest methods 'unique' and 'percentile' group identical | |
87 UMIs, however 'cluster', 'adjacency', and 'directional' can group similar umis with edit distances less than some threshold. Unique: Reads group share the exact same UMI. Percentile: Reads group share the same UMI, and UMIs with | |
88 counts < 1% of the median counts for UMIs at the same position are ignored. Cluster: Identify clusters of connected UMIs (based on hamming distance threshold). Adjacency: Same as cluster, but considers only directly adjacent UMIs in the cluster. Directional: Identify cluster of connected UMIs based on hamming distance and umi." > | |
89 <option value="unique" >Unique</option> | |
90 <option value="percentile">Percentile</option> | |
91 <option value="cluster">Cluster</option> | |
92 <option value="adjacency">Adjacency</option> | |
93 <option value="directional" selected="true" >Directional</option> | |
94 </param> | |
95 <param argument="--edit-distance-threshold" name="edit_distance_threshold" type="integer" label="Edit distance threshold" min="0" value="1" /> | |
96 <param argument="--wide-format-cell-counts" name="wide_format_cell_counts" type="boolean" truevalue="--wide-format-cell-counts" falsevalue="" checked="true" label="Output a matrix of genes and cells, instead of a flat file" /> | 36 <param argument="--wide-format-cell-counts" name="wide_format_cell_counts" type="boolean" truevalue="--wide-format-cell-counts" falsevalue="" checked="true" label="Output a matrix of genes and cells, instead of a flat file" /> |
97 <section name="advanced" title="Extra parameters" > | 37 <expand macro="barcode_options_macro"/> |
98 <param argument="--mapping-quality" name="mapping_quality" type="integer" min="0" value="0" label="Minimum mapping quality" /> | 38 <expand macro="umi_grouping_options_macro"/> |
99 <!-- Currently hard-coded parameter. Leave here if useful to future wrapper --> | 39 <expand macro="sambam_options_macro"/> |
100 <!-- <param argument="-\-per-gene" name="per_gene" type="text" label="Group reads together if they have the same gene" help="Reads will be grouped together if they have the same gene. This is useful if your library | 40 <expand macro="sc_options_macro"/> |
101 prep generates PCR duplicates with non-identical alignment positions such as CEL-Seq. Note this option is hardcoded to be on with the count command. I.e counting is always performed per-gene. Must be combined with either | 41 <expand macro="advanced_options_macro"/> |
102 -\-gene-tag or -\-per-contig option" /> --> | |
103 <param argument="--gene-tag" name="gene_tag" type="text" label="Deduplicate per gene." value="XT" help="The gene information is encoded in the bam read tag." > | |
104 <expand macro="sanitize_tag" /> | |
105 </param> | |
106 <param argument="--skip-tags-regex" name="skip_tags_regex" type="text" label="Skip any reads where the gene matches this tag" value="" > | |
107 <expand macro="barcode_sanitizer" /> | |
108 </param> | |
109 <param argument="--per-contig" name="per_contig" type="boolean" truevalue="--per-contig" falsevalue="" checked="false" label="Deduplicate per contig (field 3 in BAM; RNAME)" help="All reads with the same contig will be considered to have the same alignment position. This is useful if you have aligned to a reference transcriptome with one transcript per gene." /> | |
110 <param argument="--per-cell" name="per_cell" type="boolean" truevalue="--per-cell" falsevalue="" checked="true" label="Group reads only if they have the same cell barcode." /> | |
111 <param argument="--random-seed" name="random_seed" type="integer" min="0" value="0" label="Random Seed" /> | |
112 </section> | |
113 <conditional name="cond_extra" > | 42 <conditional name="cond_extra" > |
114 <param name="prepender" type="select" label="Prepend a label to all column headers" help="This preserves uniqueness when merging with other files with the same headers. Note: filename must not contain a '.' character" > | 43 <param name="prepender" type="select" label="Prepend a label to all column headers" help="This preserves uniqueness when merging with other files with the same headers. Note: filename must not contain a '.' character" > |
115 <option value="none" selected="true" >No modifications</option> | 44 <option value="none" selected="true" >No modifications</option> |
116 <option value="string">Custom Label</option> | 45 <option value="string">Custom Label</option> |
117 <option value="dataset name">Dataset Name</option> | 46 <option value="dataset name">Dataset Name</option> |
128 </valid> | 57 </valid> |
129 </sanitizer> | 58 </sanitizer> |
130 </param> | 59 </param> |
131 </when> | 60 </when> |
132 </conditional> | 61 </conditional> |
62 <expand macro="log_input_macro"/> | |
133 </inputs> | 63 </inputs> |
134 <outputs> | 64 <outputs> |
135 <data name="out_counts" format="tabular" /> | 65 <data name="out_counts" format="tabular" /> |
66 <expand macro="log_output_macro"/> | |
136 </outputs> | 67 </outputs> |
137 <tests> | 68 <tests> |
138 <test><!--count_single_gene_tag:--> | 69 <test><!--count_single_gene_tag:--> |
139 <param name="input_bam" value="chr19_gene_tags.bam" /> | 70 <param name="input" value="chr19_gene_tags.bam" /> |
140 <param name="random_seed" value="123456789" /> | 71 <section name="advanced"> |
141 <param name="method" value="directional" /> | 72 <param name="random_seed" value="123456789" /> |
142 <param name="gene_tag" value="XF" /> | 73 </section> |
143 <param name="skip_tags_regex" value="^[__|Unassigned]" /> | 74 <section name="sc"> |
144 <param name="extract_umi_method" value="umis" /> | 75 <param name="gene_tag" value="XF" /> |
76 <param name="skip_tags_regex" value="^[__|Unassigned]" /> | |
77 <param name="per_cell" value="false" /> | |
78 </section> | |
79 <conditional name="bc"> | |
80 <param name="extract_umi_method" value="umis" /> | |
81 </conditional> | |
82 <section name="umi"> | |
83 <param name="method" value="directional" /> | |
84 </section> | |
145 <param name="wide_format_cell_counts" value="false" /> | 85 <param name="wide_format_cell_counts" value="false" /> |
146 <param name="per_cell" value="false" /> | |
147 <output name="out_counts" value="count_single_gene_tag.tsv" /> | 86 <output name="out_counts" value="count_single_gene_tag.tsv" /> |
148 </test> | 87 </test> |
88 <test><!--count_single_gene_tag .. with sam input--> | |
89 <param name="input" value="chr19_gene_tags.sam" /> | |
90 <section name="advanced"> | |
91 <param name="random_seed" value="123456789" /> | |
92 </section> | |
93 <section name="sc"> | |
94 <param name="gene_tag" value="XF" /> | |
95 <param name="skip_tags_regex" value="^[__|Unassigned]" /> | |
96 <param name="per_cell" value="false" /> | |
97 </section> | |
98 <conditional name="bc"> | |
99 <param name="extract_umi_method" value="umis" /> | |
100 </conditional> | |
101 <section name="umi"> | |
102 <param name="method" value="directional" /> | |
103 </section> | |
104 <param name="wide_format_cell_counts" value="false" /> | |
105 <output name="out_counts" value="count_single_gene_tag.tsv" /> | |
106 </test> | |
149 <test><!--count_single_cells_gene_tag:--> | 107 <test><!--count_single_cells_gene_tag:--> |
150 <param name="input_bam" value="chr19_gene_tags.bam" /> | 108 <param name="input" value="chr19_gene_tags.bam" /> |
151 <param name="random_seed" value="123456789" /> | 109 <section name="advanced"> |
152 <param name="method" value="directional" /> | 110 <param name="random_seed" value="123456789" /> |
153 <param name="gene_tag" value="XF" /> | 111 </section> |
154 <param name="skip_tags_regex" value="^[__|Unassigned]" /> | 112 <section name="sc"> |
155 <param name="per_cell" value="true" /> | 113 <param name="gene_tag" value="XF" /> |
156 <param name="extract_umi_method" value="umis" /> | 114 <param name="skip_tags_regex" value="^[__|Unassigned]" /> |
115 <param name="per_cell" value="true" /> | |
116 </section> | |
117 <conditional name="bc"> | |
118 <param name="extract_umi_method" value="umis" /> | |
119 </conditional> | |
120 <section name="umi"> | |
121 <param name="method" value="directional" /> | |
122 </section> | |
157 <param name="wide_format_cell_counts" value="false" /> | 123 <param name="wide_format_cell_counts" value="false" /> |
158 <output name="out_counts" value="count_single_cells_gene_tag.tsv" /> | 124 <output name="out_counts" value="count_single_cells_gene_tag.tsv" /> |
159 </test> | 125 </test> |
160 <test><!--count_single_cells_wide_gene_tag:--> | 126 <test><!--count_single_cells_wide_gene_tag:--> |
161 <param name="input_bam" value="chr19_gene_tags.bam" /> | 127 <param name="input" value="chr19_gene_tags.bam" /> |
162 <param name="random_seed" value="123456789" /> | 128 <section name="advanced"> |
163 <param name="method" value="directional" /> | 129 <param name="random_seed" value="123456789" /> |
164 <param name="gene_tag" value="XF" /> | 130 </section> |
165 <param name="skip_tags_regex" value="^[__|Unassigned]" /> | 131 <section name="sc"> |
166 <param name="per_cell" value="true" /> | 132 <param name="gene_tag" value="XF" /> |
167 <param name="extract_umi_method" value="umis" /> | 133 <param name="skip_tags_regex" value="^[__|Unassigned]" /> |
134 <param name="per_cell" value="true" /> | |
135 </section> | |
136 <conditional name="bc"> | |
137 <param name="extract_umi_method" value="umis" /> | |
138 </conditional> | |
139 <section name="umi"> | |
140 <param name="method" value="directional" /> | |
141 </section> | |
168 <param name="wide_format_cell_counts" value="true" /> | 142 <param name="wide_format_cell_counts" value="true" /> |
169 <output name="out_counts" value="count_single_cells_gene_tag_wide.tsv" /> | 143 <output name="out_counts" value="count_single_cells_gene_tag_wide.tsv" /> |
170 </test> | 144 </test> |
171 <test><!-- count ENSDARG00000019692, with defaults --> | 145 <test><!-- count ENSDARG00000019692, with defaults --> |
172 <param name="input_bam" value="fc.ENSDARG00000019692.bam" /> | 146 <param name="input" value="fc.ENSDARG00000019692.bam" /> |
173 <param name="method" value="unique" /> | 147 <section name="advanced"> |
148 <param name="random_seed" value="0" /> | |
149 </section> | |
150 <section name="sc"> | |
151 <param name="gene_tag" value="XT" /> | |
152 <param name="per_cell" value="true" /> | |
153 </section> | |
154 <section name="umi"> | |
155 <param name="method" value="unique" /> | |
156 </section> | |
174 <output name="out_counts" value="fc.ENSDARG00000019692.counts" /> | 157 <output name="out_counts" value="fc.ENSDARG00000019692.counts" /> |
175 </test> | 158 </test> |
176 <test><!-- count ENSDARG00000019692, relabel string --> | 159 <test><!-- count ENSDARG00000019692, relabel string --> |
177 <param name="input_bam" value="fc.ENSDARG00000019692.bam" /> | 160 <param name="input" value="fc.ENSDARG00000019692.bam" /> |
178 <param name="method" value="unique" /> | 161 <section name="advanced"> |
162 <param name="random_seed" value="0" /> | |
163 </section> | |
164 <section name="sc"> | |
165 <param name="gene_tag" value="XT" /> | |
166 <param name="per_cell" value="true" /> | |
167 </section> | |
168 <section name="umi"> | |
169 <param name="method" value="unique" /> | |
170 </section> | |
179 <conditional name="cond_extra" > | 171 <conditional name="cond_extra" > |
180 <param name="prepender" value="string" /> | 172 <param name="prepender" value="string" /> |
181 <param name="custom_label" value="test" /> | 173 <param name="custom_label" value="test" /> |
182 </conditional> | 174 </conditional> |
183 <output name="out_counts" value="fc.ENSDARG00000019692.counts.test" /> | 175 <output name="out_counts" value="fc.ENSDARG00000019692.counts.test" /> |
184 </test> | 176 </test> |
185 <test><!-- count ENSDARG00000019692, relabel filename --> | 177 <test><!-- count ENSDARG00000019692, relabel filename --> |
186 <param name="input_bam" value="fc.ENSDARG00000019692.bam" /> | 178 <param name="input" value="fc.ENSDARG00000019692.bam" /> |
187 <param name="method" value="unique" /> | 179 <section name="advanced"> |
180 <param name="random_seed" value="0" /> | |
181 </section> | |
182 <section name="sc"> | |
183 <param name="gene_tag" value="XT" /> | |
184 <param name="per_cell" value="true" /> | |
185 </section> | |
186 <section name="umi"> | |
187 <param name="method" value="unique" /> | |
188 </section> | |
188 <conditional name="cond_extra" > | 189 <conditional name="cond_extra" > |
189 <param name="prepender" value="dataset name" /> | 190 <param name="prepender" value="dataset name" /> |
190 </conditional> | 191 </conditional> |
191 <output name="out_counts" value="fc.ENSDARG00000019692.counts.name" /> | 192 <output name="out_counts" value="fc.ENSDARG00000019692.counts.name" /> |
192 </test> | 193 </test> |
193 </tests> | 194 </tests> |
194 <help><![CDATA[ | 195 <help><![CDATA[ |
195 | 196 |
196 UMI Tools count - Count reads per gene from BAM using UMIs | 197 count - Count reads per gene from BAM using UMIs and mapping coordinates |
197 ---------------------------------------------------------- | 198 ======================================================================== |
198 | 199 |
199 Purpose | 200 This tool is only designed to work with library preparation |
200 ------- | 201 methods where the fragmentation occurs after amplification, as per |
201 | 202 most single cell RNA-Seq methods (e.g 10x, inDrop, Drop-seq, SCRB-seq |
202 The purpose of this command is to count the number of reads per gene based | 203 and CEL-seq2). Since the precise mapping co-ordinate is no longer |
203 on the mapping co-ordinate and the UMI attached to the read. | 204 informative for such library preparations, it is simplified to the |
204 | 205 gene. This is a reasonable approach providing the number of available |
205 | 206 UMIs is sufficiently high and the sequencing depth is sufficiently low |
206 It is assumed that the FASTQ files were processed with extract_umi.py | 207 that the probability of two reads from the same gene having the same |
207 before mapping and thus the UMI is the last word of the read name. e.g: | 208 UMIs is acceptably low. |
208 | 209 |
209 @HISEQ:87:00000000_AATT | 210 If you want to count reads per gene for library preparations which |
210 | 211 fragment prior to amplification (e.g bulk RNA-Seq), please use |
211 where AATT is the UMI sequence. | 212 ``umi_tools dedup`` to remove the duplicate reads as this will use the |
212 | 213 full information from the mapping co-ordinate. Then use a read |
213 If you have used an alternative method which does not separate the | 214 counting tool such as FeatureCounts or HTSeq to count the reads per |
214 read id and UMI with a "_", such as bcl2fastq which uses ":", you can | 215 gene. |
215 specify the separator, or if your UMIs are encoded in a tag you can also specify this. | 216 |
217 In the rare case of bulk RNA-Seq using a library preparation method | |
218 with fragmentation after amplification, one can still use ``count`` but | |
219 note that it has not been tested on bulk RNA-Seq. | |
220 | |
221 This tool deviates from group and dedup in that the ``--per-gene`` option | |
222 is hardcoded on. | |
223 | |
224 @BARCODE_HELP@ | |
225 | |
226 @UMI_GROUPING_HELP@ | |
216 | 227 |
217 ]]></help> | 228 ]]></help> |
218 <expand macro="citations" /> | 229 <expand macro="citations" /> |
219 </tool> | 230 </tool> |