Mercurial > repos > iuc > umi_tools_group
comparison umi-tools_group.xml @ 13:cf25b50eff0a draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author | iuc |
---|---|
date | Mon, 13 Sep 2021 14:50:56 +0000 |
parents | 30c3906fbf43 |
children | 257be15474a7 |
comparison
equal
deleted
inserted
replaced
12:bc082a79d655 | 13:cf25b50eff0a |
---|---|
1 <tool id="umi_tools_group" name="UMI-tools group" version="@VERSION@.0"> | 1 <tool id="umi_tools_group" name="UMI-tools group" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> |
2 <description>Extract UMI from fastq files</description> | 2 <description>Extract UMI from fastq files</description> |
3 <expand macro="bio_tools"/> | |
3 <macros> | 4 <macros> |
4 <import>macros.xml</import> | 5 <import>macros.xml</import> |
5 </macros> | 6 </macros> |
6 <expand macro="requirements"> | 7 <expand macro="requirements"> |
7 <requirement type="package" version="1.9">samtools</requirement> | 8 <requirement type="package" version="1.12">samtools</requirement> |
8 </expand> | 9 </expand> |
9 <command detect_errors="exit_code"><![CDATA[ | 10 <command detect_errors="exit_code"><![CDATA[ |
10 #if $input.is_of_type("sam"): | 11 @LINK_SAM_BAM_INPUT@ |
11 #set $input_file = $input | |
12 #else: | |
13 ln -sf '${input}' 'input.bam' && | |
14 ln -sf '$input.metadata.bam_index' 'input.bam.bai' && | |
15 #set $input_file = 'input.bam' | |
16 #end if | |
17 | 12 |
18 umi_tools group | 13 umi_tools group |
19 --random-seed 0 | |
20 --extract-umi-method $extract_umi_method | |
21 #if str($extract_umi_method) != 'read_id': | |
22 --umi-separator '$umi_separator' --umi-tag '$umi_tag' | |
23 #end if | |
24 --method $method --edit-distance-threshold $edit_distance_threshold | |
25 $paired $spliced_is_unique --soft-clip-threshold $soft_clip_threshold | |
26 $read_length $whole_contig --subset $subset $per_contig $per_gene | |
27 #if $gene_transcript_map: | |
28 --gene-transcript-map '$gene_transcript_map' | |
29 #end if | |
30 #if len(str($gene_tag)) > 0: | |
31 --gene-tag '$gene_tag' | |
32 #end if | |
33 #if $group_output: | 14 #if $group_output: |
34 --group-out '$group_out' | 15 --group-out '$group_out' |
35 #end if | 16 #end if |
36 #if $input.is_of_type("sam"): | |
37 --in-sam | |
38 #end if | |
39 --output-bam | 17 --output-bam |
40 -I '$input_file' -S grouped.bam && | 18 @GROUPDEDUP_OPTIONS@ |
41 samtools sort grouped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM | 19 @BARCODE_OPTIONS@ |
20 @UMI_GROUPING_OPTIONS@ | |
21 @SAMBAM_OPTIONS@ | |
22 @FULLSC_OPTIONS@ | |
23 -I '$input_file' -S grouped.bam | |
24 @ADVANCED_OPTIONS@ | |
25 @LOG@ | |
26 ## TODO using samtools sort is a workaround, for the following error that appears when Galaxy | |
27 ## compares the generated file with the one in test-data | |
28 ## `Converting history BAM to SAM failed: 'samtools returned with error 1: stdout=None, stderr=[main_samview] fail to read the header from "/tmp/tmpd8o61jykdedup_out6.bam".\n'. Will compare BAM files` | |
29 ## may be dropped in the future | |
30 --no-sort-output | |
31 && samtools sort --no-PG grouped.bam -@ \${GALAXY_SLOTS:-1} -T "\${TMPDIR:-.}" -o '$output' -O BAM | |
42 ]]></command> | 32 ]]></command> |
43 <inputs> | 33 <inputs> |
44 <param name="input" type="data" format="sam,bam" label="Reads to group in SAM or BAM format" /> | 34 <param name="input" type="data" format="sam,bam" label="Reads to group in SAM or BAM format" /> |
45 <param name="extract_umi_method" argument="--extract-umi-method" type="select"> | |
46 <option value="read_id" selected="True">Read ID</option> | |
47 <option value="tag">Tag</option> | |
48 </param> | |
49 <param name="group_output" argument="--group-out" type="boolean" truevalue="--group-out" falsevalue="" label="Output a flatfile describing the read groups" /> | 35 <param name="group_output" argument="--group-out" type="boolean" truevalue="--group-out" falsevalue="" label="Output a flatfile describing the read groups" /> |
50 <param name="umi_separator" argument="--umi-separator" type="text" label="Separator between read id and UMI." help="Ignored unless extracting by tag" /> | 36 <expand macro="groupdedup_options_macro"/> |
51 <param name="umi_tag" argument="--umi-tag" type="text" label="Tag which contains UMI." /> | 37 <expand macro="barcode_options_macro"/> |
52 <param argument="--method" type="select" label="Method used to identify PCR duplicates within reads." help="All methods start by identifying the reads with the same mapping position"> | 38 <expand macro="umi_grouping_options_macro"/> |
53 <option value="unique">Reads group share the exact same UMI</option> | 39 <expand macro="sambam_options_macro"/> |
54 <option value="cluster">Identify clusters based on hamming distance</option> | 40 <expand macro="fullsc_options_macro"/> |
55 <option value="directional">Identify clusters based on distance and counts, restrict network expansion by threshold</option> | 41 <expand macro="advanced_options_macro"/> |
56 </param> | 42 <expand macro="log_input_macro"/> |
57 <param name="edit_distance_threshold" argument="--edit-distance-threshold" type="integer" value="1" label="Edit distance threshold" help="For the adjacency and cluster methods the threshold for the edit distance to connect two UMIs in the network can be increased. The default value of 1 works best unless the UMI is very long (>14bp)" /> | |
58 <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" label="BAM is paired end" help="This will also force the use of the template length to determine reads with the same mapping coordinates." /> | |
59 <param name="spliced_is_unique" argument="--spliced-is-unique" type="boolean" truevalue="--spliced-is-unique" falsevalue="" label="Spliced reads are unique" help="Causes two reads that start in the same position on the same strand and having the same UMI to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation to test for splicing)" /> | |
60 <param name="soft_clip_threshold" argument="--soft-clip-threshold" type="integer" value="4" label="Soft clip threshold" help="Mappers that soft clip, will sometimes do so rather than mapping a spliced read if there is only a small overhang over the exon junction. By setting this option, you can treat reads with at least this many bases soft-clipped at the 3' end as spliced." /> | |
61 <param name="read_length" argument="--read-length" type="boolean" truevalue="--read-length" falsevalue="" label="Use the read length as as a criterion when deduping" /> | |
62 <param name="whole_contig" argument="--whole-contig" type="boolean" truevalue="--whole-contig" falsevalue="" label="Consider all alignments to a single contig together" help="This is useful if you have aligned to a transcriptome multi-fasta" /> | |
63 <param argument="--subset" type="float" min="0.0" max="1.0" value="1.0" label="Only consider a random selection of the reads" /> | |
64 <param argument="--chrom" type="boolean" truevalue="--chrom" falsevalue="" label="Only consider a single chromosome" /> | |
65 <param name="per_contig" argument="--per-contig" type="boolean" truevalue="--per-contig" falsevalue="" label="Deduplicate per contig" help="Field 3 in BAM; RNAME. All reads with the same contig will be considered to have the same alignment position. This is useful if your library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq. In this case, you would align to a reference transcriptome with one transcript per gene" /> | |
66 <param name="per_gene" argument="--per-gene" type="boolean" truevalue="--per-gene" falsevalue="" label="Deduplicate per gene" help="As above except with this option you can align to a reference transcriptome with more than one transcript per gene. You need to also provide a map of genes to transcripts. This will also add a metacontig ('MC') tag to the output BAM file." /> | |
67 <param name="gene_transcript_map" argument="--gene-transcript-map" type="data" format="tabular" optional="True" label="Tabular file mapping genes to transripts" /> | |
68 <param name="gene_tag" argument="--gene-tag" type="text" optional="True" label="Deduplicate by this gene tag" help="As --per-gene except here the gene information is encoded in the bam read tag specified so you do not need to supply the mapping file." /> | |
69 </inputs> | 43 </inputs> |
70 <outputs> | 44 <outputs> |
71 <data format="bam" name="output" /> | 45 <data format="bam" name="output" /> |
72 <data format="tabular" name="group_out"> | 46 <data format="tabular" name="group_out"> |
73 <filter>group_out</filter> | 47 <filter>group_output</filter> |
74 </data> | 48 </data> |
49 <expand macro="log_output_macro"/> | |
75 </outputs> | 50 </outputs> |
76 <tests> | 51 <tests> |
77 <test> | 52 <test expect_num_outputs="1"> |
78 <param name="input" value="group_in2.bam" ftype="bam" /> | 53 <param name="input" value="group_in2.sam" ftype="sam" /> |
79 <param name="extract_umi_method" value="read_id" /> | 54 <section name="advanced"> |
80 <param name="paired" value="True" /> | 55 <param name="random_seed" value="0" /> |
81 <param name="method" value="unique" /> | 56 </section> |
82 <output name="output" file="group_out2.bam" ftype="bam" sort="True" /> | 57 <conditional name="bc"> |
58 <param name="extract_umi_method" value="read_id" /> | |
59 </conditional> | |
60 <section name="sambam"> | |
61 <param name="paired" value="true" /> | |
62 </section> | |
63 <section name="umi"> | |
64 <param name="method" value="unique" /> | |
65 </section> | |
66 <output name="output" file="group_out2.bam" ftype="bam" /> | |
83 </test> | 67 </test> |
84 <test> | 68 <test expect_num_outputs="2"> |
85 <param name="input" value="group_in3.bam" ftype="bam" /> | 69 <param name="input" value="group_in3.bam" ftype="bam" /> |
86 <param name="extract_umi_method" value="read_id" /> | 70 <section name="advanced"> |
87 <param name="group_output" value="True" /> | 71 <param name="random_seed" value="0" /> |
88 <param name="method" value="unique" /> | 72 </section> |
73 <conditional name="bc"> | |
74 <param name="extract_umi_method" value="read_id" /> | |
75 </conditional> | |
76 <param name="group_output" value="true" /> | |
77 <section name="umi"> | |
78 <param name="method" value="unique" /> | |
79 </section> | |
89 <output name="group_out" file="group_out3.tab" /> | 80 <output name="group_out" file="group_out3.tab" /> |
90 <output name="output" file="group_out3.bam" ftype="bam" sort="True" /> | 81 <output name="output" file="group_out3.bam" ftype="bam" /> |
91 </test> | 82 </test> |
92 <test> | 83 <test expect_num_outputs="2"> |
93 <param name="input" value="group_in4.bam" ftype="bam" /> | 84 <param name="input" value="group_in4.bam" ftype="bam" /> |
94 <param name="extract_umi_method" value="tag" /> | 85 <section name="advanced"> |
95 <param name="umi_tag" value="BX" /> | 86 <param name="random_seed" value="0" /> |
96 <param name="method" value="unique" /> | 87 </section> |
88 <conditional name="bc"> | |
89 <param name="extract_umi_method" value="tag" /> | |
90 <param name="umi_tag" value="BX" /> | |
91 </conditional> | |
92 <param name="group_output" value="true" /> | |
93 <section name="umi"> | |
94 <param name="method" value="unique" /> | |
95 </section> | |
97 <output name="group_out" file="group_out4.tab" /> | 96 <output name="group_out" file="group_out4.tab" /> |
98 <output name="output" file="group_out4.bam" ftype="bam" sort="True" /> | 97 <output name="output" file="group_out4.bam" ftype="bam" /> |
99 </test> | 98 </test> |
100 <test> | 99 <test expect_num_outputs="1"> |
101 <param name="input" value="group_in5.bam" ftype="bam" /> | 100 <param name="input" value="group_in5.bam" ftype="bam" /> |
102 <param name="extract_umi_method" value="read_id" /> | 101 <section name="advanced"> |
103 <param name="umi_tag" value="BX" /> | 102 <param name="random_seed" value="0" /> |
104 <param name="method" value="cluster" /> | 103 </section> |
105 <output name="output" file="group_out5.bam" ftype="bam" sort="True" /> | 104 <conditional name="bc"> |
105 <param name="extract_umi_method" value="read_id" /> | |
106 </conditional> | |
107 <section name="umi"> | |
108 <param name="method" value="cluster" /> | |
109 </section> | |
110 <output name="output" file="group_out5.bam" ftype="bam"/> | |
106 </test> | 111 </test> |
107 <test> | 112 <test expect_num_outputs="1"> |
108 <param name="input" value="group_in6.bam" ftype="bam" /> | 113 <param name="input" value="group_in6.bam" ftype="bam" /> |
109 <param name="extract_umi_method" value="read_id" /> | 114 <section name="advanced"> |
110 <param name="umi_tag" value="BX" /> | 115 <param name="random_seed" value="0" /> |
111 <param name="method" value="directional" /> | 116 </section> |
112 <output name="output" file="group_out6.bam" ftype="bam" sort="True" /> | 117 <conditional name="bc"> |
118 <param name="extract_umi_method" value="read_id" /> | |
119 </conditional> | |
120 <section name="umi"> | |
121 <param name="method" value="directional" /> | |
122 </section> | |
123 <output name="output" file="group_out6.bam" ftype="bam"/> | |
113 </test> | 124 </test> |
114 </tests> | 125 </tests> |
115 <help><![CDATA[ | 126 <help><![CDATA[ |
116 umi_tools group - Group reads based on their UMI | 127 umi_tools group - Group reads based on their UMI |
117 ================================================ | 128 ================================================ |
118 | 129 |
119 Purpose | 130 Purpose |
120 ------- | 131 ------- |
121 | 132 |
122 The purpose of this command is to identify groups of reads based on | 133 The purpose of this command is to identify groups of reads based on |
123 their genomic coordinate and UMI. It is assumed that the FASTQ files | 134 their genomic coordinate and UMI. |
124 were processed with umi_tools extract before mapping and thus the UMI is | |
125 the last word of the read name. e.g: | |
126 | |
127 @HISEQ:87:00000000_AATT | |
128 | |
129 where AATT is the UMI sequence. | |
130 | |
131 If you have used an alternative method which does not separate the | |
132 read id and UMI with a "_", such as bcl2fastq which uses ":", you can | |
133 specify the separator with the option "--umi-separator=<sep>", | |
134 replacing <sep> with e.g ":". | |
135 | |
136 Alternatively, if your UMIs are encoded in a tag, you can specify this | |
137 by setting the option --extract-umi-method=tag and set the tag name | |
138 with the --umi-tag option. For example, if your UMIs are encoded in | |
139 the 'UM' tag, provide the following options: | |
140 "--extract-umi-method=tag --umi-tag=UM" | |
141 | |
142 By default, reads are considered identical if they have the same start | |
143 coordinate, are on the same strand, and have the same UMI. Optionally, | |
144 splicing status can be considered (see below). | |
145 | |
146 The start position of a read is considered to be the start of its alignment | |
147 minus any soft clipped bases. A read aligned at position 500 with | |
148 cigar 2S98M will be assumed to start at position 498. | |
149 | |
150 Methods | |
151 ------- | |
152 | |
153 group can be run with multiple methods to identify group of reads with | |
154 the same (or similar) UMI(s). All methods start by identifying the | |
155 reads with the same mapping position. | |
156 | |
157 The simplest method, "unique", groups reads with the exact same | |
158 UMI. The network-based methods, "cluster", "adjacency" and | |
159 "directional", build networks where nodes are UMIs and edges connect | |
160 UMIs with an edit distance <= threshold (usually 1). The groups of | |
161 reads are then defined from the network in a method-specific manner. | |
162 | |
163 Note that the "percentile" method used with the dedup command is not | |
164 available with group. This is because this method does not group | |
165 similar UMIs as per the network methods. Instead it applies a | |
166 threshold for inclusion of the UMI in the output and excluded UMIs are | |
167 not assigned to a "true" UMI. | |
168 | |
169 "unique" | |
170 Reads group share the exact same UMI | |
171 | |
172 "cluster" | |
173 Identify clusters of connected UMIs (based on hamming distance | |
174 threshold). Each network is a read group | |
175 | |
176 "directional" | |
177 Identify clusters of connected UMIs (based on hamming distance | |
178 threshold) and umi A counts >= (2* umi B counts) - 1. Each | |
179 network is a read group. | |
180 | 135 |
181 The group command can be used to create two types of outfile: a tagged | 136 The group command can be used to create two types of outfile: a tagged |
182 BAM or a flatfile describing the read groups | 137 BAM or a flatfile describing the read groups |
183 | 138 |
184 To generate the tagged-BAM file, use the option --output-bam and | 139 To generate the tagged-BAM file, use the option --output-bam and |
225 The total number of reads within the group | 180 The total number of reads within the group |
226 | 181 |
227 - unique_id | 182 - unique_id |
228 The unique id for the group | 183 The unique id for the group |
229 | 184 |
185 @BARCODE_HELP@ | |
230 | 186 |
231 Options | 187 @UMI_GROUPING_HELP@ |
232 ------- | |
233 | |
234 --extract-umi-method (choice) | |
235 How are the UMIs encoded in the read? | |
236 | |
237 Options are: | |
238 | |
239 - "read_id" (default) | |
240 UMIs contained at the end of the read separated as | |
241 specified with --umi-separator option | |
242 | |
243 - "tag" | |
244 UMIs contained in a tag, see --umi-tag option | |
245 | |
246 --umi-separator (string) | |
247 Separator between read id and UMI. See --extract-umi-method above | |
248 | |
249 --umi-tag (string) | |
250 Tag which contains UMI. See --extract-umi-method above | |
251 | |
252 --method (choice, string) | |
253 Method used to identify PCR duplicates within reads. All methods | |
254 start by identifying the reads with the same mapping position | |
255 | |
256 Options are: | |
257 | |
258 - "unique" | |
259 Reads group share the exact same UMI | |
260 | |
261 - "cluster" | |
262 Identify clusters of connected UMIs (based on edit distance | |
263 threshold). Each network is a read group | |
264 | |
265 - "directional" | |
266 Identify clusters of connected UMIs (based on edit distance | |
267 threshold) and umi A counts >= (2* umi B counts) - 1. Each | |
268 network is a read group. | |
269 | |
270 --edit-distance-threshold (int) | |
271 For the adjacency and cluster methods the threshold for the | |
272 edit distance to connect two UMIs in the network can be | |
273 increased. The default value of 1 works best unless the UMI is | |
274 very long (>14bp) | |
275 | |
276 --paired | |
277 BAM is paired end - output both read pairs. This will also | |
278 force the use of the template length to determine reads with | |
279 the same mapping coordinates. | |
280 | |
281 --spliced-is-unique | |
282 Causes two reads that start in the same position on the same | |
283 strand and having the same UMI to be considered unique if one is | |
284 spliced and the other is not. (Uses the 'N' cigar operation to test | |
285 for splicing) | |
286 | |
287 --soft-clip-threshold (int) | |
288 Mappers that soft clip, will sometimes do so rather than mapping a | |
289 spliced read if there is only a small overhang over the exon | |
290 junction. By setting this option, you can treat reads with at least | |
291 this many bases soft-clipped at the 3' end as spliced. | |
292 | |
293 --multimapping-detection-method (string, choice) | |
294 If the sam/bam contains tags to identify multimapping reads, you can | |
295 specify for use when selecting the best read at a given loci. | |
296 Supported tags are "NH", "X0" and "XT". If not specified, the read | |
297 with the highest mapping quality will be selected | |
298 | |
299 --read-length | |
300 Use the read length as a criterion when deduping, e.g. for sRNA-Seq | |
301 | |
302 --whole-contig | |
303 Consider all alignments to a single contig together. This is useful if | |
304 you have aligned to a transcriptome multi-fasta | |
305 | |
306 --subset (float, [0-1]) | |
307 Only consider a fraction of the reads, chosen at random. This is useful | |
308 for doing saturation analyses. | |
309 | |
310 --chrom | |
311 Only consider a single chromosome. This is useful for debugging purposes | |
312 | |
313 --per-contig (string) | |
314 Deduplicate per contig (field 3 in BAM; RNAME). | |
315 All reads with the same contig will be | |
316 considered to have the same alignment position. This is useful | |
317 if your library prep generates PCR duplicates with non identical | |
318 alignment positions such as CEL-Seq. In this case, you would | |
319 align to a reference transcriptome with one transcript per gene | |
320 | |
321 --per-gene (string) | |
322 Deduplicate per gene. As above except with this option you can | |
323 align to a reference transcriptome with more than one transcript | |
324 per gene. You need to also provide --gene-transcript-map option. | |
325 This will also add a metacontig ('MC') tag to the reads if used | |
326 in conjunction with --output-bam | |
327 | |
328 --gene-transcript-map (string) | |
329 File mapping genes to transcripts (tab separated), e.g: | |
330 | |
331 gene1 transcript1 | |
332 gene1 transcript2 | |
333 gene2 transcript3 | |
334 | |
335 --gene-tag (string) | |
336 Deduplicate per gene. As per --per-gene except here the gene | |
337 information is encoded in the bam read tag specified so you do | |
338 not need to supply --gene-transcript-map | |
339 | |
340 --group-out (string, filename) | |
341 Output a flatfile describing the read groups | |
342 | |
343 --output-bam (string, filename) | |
344 Output a tagged bam file to stdout or -S <filename> | |
345 | |
346 -i, --in-sam/-o, --out-sam | |
347 By default, inputs are assumed to be in BAM format and output are output | |
348 in BAM format. Use these options to specify the use of SAM format for | |
349 inputs or outputs. | |
350 | |
351 -I (string, filename) input file name | |
352 The input file must be sorted and indexed. | |
353 | |
354 -S (string, filename) output file name | |
355 | |
356 -L (string, filename) log file name | |
357 | |
358 Usage | |
359 ----- | |
360 umi_tools group -I infile.bam --output-bam -S grouped.bam -L group.log -- | |
361 | |
362 ]]></help> | 188 ]]></help> |
363 <expand macro="citations" /> | 189 <expand macro="citations" /> |
364 </tool> | 190 </tool> |