comparison umi-tools_group.xml @ 0:860bc357b678 draft

planemo upload commit c8e46ecad0b1473097517e582ed6c43eb0635b36
author iuc
date Tue, 29 Aug 2017 17:37:21 -0400
parents
children f73f13641bb6
comparison
equal deleted inserted replaced
-1:000000000000 0:860bc357b678
1 <tool id="umi_tools_group" name="UMI-tools group" version="@VERSION@.0">
2 <description>Group reads based on their UMI</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements">
7 <requirement type="package" version="1.5">samtools</requirement>
8 </expand>
9 <command detect_errors="exit_code"><![CDATA[
10 #if $input.is_of_type("sam"):
11 #set $input_file = $input
12 #else:
13 ln -sf '${input}' 'input.bam' &&
14 ln -sf '$input.metadata.bam_index' 'input.bam.bai' &&
15 #set $input_file = 'input.bam'
16 #end if
17 ## --umi-separator only applies to read_id extraction and --umi-tag only to tag extraction.
18 umi_tools group --extract-umi-method $extract_umi_method
19 #if str($extract_umi_method) == 'tag':
20 --umi-tag '$umi_tag'
21 #elif len(str($umi_separator)) > 0:
22 --umi-separator '$umi_separator'
23 #end if
24 --method $method --edit-distance-threshold $edit_distance_threshold $paired $spliced_is_unique
25 --soft-clip-threshold $soft_clip_threshold $read_length $whole_contig --subset $subset $per_contig $per_gene
26 #if $gene_transcript_map:
27 --gene-transcript-map '$gene_transcript_map'
28 #end if
29 #if len(str($gene_tag)) > 0:
30 --gene-tag '$gene_tag'
31 #end if
32 #if $group_output:
33 --group-out '$group_out'
34 #end if
35 #if $input.is_of_type("sam"):
36 --in-sam
37 #end if
38 --output-bam -I '$input_file' -S grouped.bam &&
39 samtools sort grouped.bam -@ \${GALAXY_SLOTS:-1} -o '$output' -O BAM
40 ]]></command>
41 <inputs>
42 <param name="input" type="data" format="sam,bam" label="Reads to group in SAM or BAM format" />
43 <param name="extract_umi_method" argument="--extract-umi-method" type="select" label="How are the UMIs encoded in the read?">
44 <option value="read_id" selected="True">Read ID</option>
45 <option value="tag">Tag</option>
46 </param>
47 <param name="group_output" argument="--group-out" type="boolean" truevalue="--group-out" falsevalue="" label="Output a flatfile describing the read groups" />
48 <param name="umi_separator" argument="--umi-separator" type="text" label="Separator between read id and UMI." help="Only used when extracting the UMI from the read ID. Leave empty to use the umi_tools default separator." />
49 <param name="umi_tag" argument="--umi-tag" type="text" label="Tag which contains UMI." help="Only used when extracting the UMI from a tag." />
50 <param argument="--method" type="select" label="Method used to identify PCR duplicates within reads." help="All methods start by identifying the reads with the same mapping position">
51 <option value="unique">Reads group share the exact same UMI</option>
52 <option value="cluster">Identify clusters based on hamming distance</option>
53 <option value="directional">Identify clusters based on distance and counts</option>
54 </param>
55 <param name="edit_distance_threshold" argument="--edit-distance-threshold" type="integer" value="1" label="Edit distance threshold" help="For the adjacency and cluster methods the threshold for the edit distance to connect two UMIs in the network can be increased. The default value of 1 works best unless the UMI is very long (&gt;14bp)" />
56 <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" label="BAM is paired end" help="This will also force the use of the template length to determine reads with the same mapping coordinates." />
57 <param name="spliced_is_unique" argument="--spliced-is-unique" type="boolean" truevalue="--spliced-is-unique" falsevalue="" label="Spliced reads are unique" help="Causes two reads that start in the same position on the same strand and having the same UMI to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation to test for splicing)" />
58 <param name="soft_clip_threshold" argument="--soft-clip-threshold" type="integer" value="4" label="Soft clip threshold" help="Mappers that soft clip, will sometimes do so rather than mapping a spliced read if there is only a small overhang over the exon junction. By setting this option, you can treat reads with at least this many bases soft-clipped at the 3' end as spliced." />
59 <param name="read_length" argument="--read-length" type="boolean" truevalue="--read-length" falsevalue="" label="Use the read length as a criterion when deduping" />
60 <param name="whole_contig" argument="--whole-contig" type="boolean" truevalue="--whole-contig" falsevalue="" label="Consider all alignments to a single contig together" help="This is useful if you have aligned to a transcriptome multi-fasta" />
61 <param argument="--subset" type="float" min="0" max="1" value="1" label="Only consider a random selection of the reads" />
62 <param argument="--chrom" type="boolean" truevalue="--chrom" falsevalue="" label="Only consider a single chromosome" /> <!-- NOTE(review): not referenced in the command template, so this option currently has no effect -->
63 <param name="per_contig" argument="--per-contig" type="boolean" truevalue="--per-contig" falsevalue="" label="Deduplicate per contig" help="Field 3 in BAM; RNAME. All reads with the same contig will be considered to have the same alignment position. This is useful if your library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq. In this case, you would align to a reference transcriptome with one transcript per gene" />
64 <param name="per_gene" argument="--per-gene" type="boolean" truevalue="--per-gene" falsevalue="" label="Deduplicate per gene" help="As above except with this option you can align to a reference transcriptome with more than one transcript per gene. You need to also provide a map of genes to transcripts. This will also add a metacontig ('MC') tag to the output BAM file." />
65 <param name="gene_transcript_map" argument="--gene-transcript-map" type="data" format="tabular" optional="True" label="Tabular file mapping genes to transcripts" />
66 <param name="gene_tag" argument="--gene-tag" type="text" optional="True" label="Deduplicate by this gene tag" help="As --per-gene except here the gene information is encoded in the bam read tag specified so you do not need to supply the mapping file." />
67 </inputs>
68 <outputs>
69 <data format="bam" name="output" />
70 <data format="tabular" name="group_out"> <!-- flatfile of read groups; only created when the group_output option is enabled -->
71 <filter>group_output</filter>
72 </data>
73 </outputs>
74 <tests>
75 <test>
76 <param name="input" value="group_in1.sam" ftype="sam" />
77 <param name="extract_umi_method" value="read_id" />
78 <param name="method" value="unique" />
79 <output name="output" file="group_out1.bam" />
80 </test>
81 <test>
82 <param name="input" value="group_in2.bam" ftype="bam" />
83 <param name="extract_umi_method" value="read_id" />
84 <param name="paired" value="True" />
85 <param name="method" value="unique" />
86 <output name="output" file="group_out2.bam" />
87 </test>
88 <test>
89 <param name="input" value="group_in3.bam" ftype="bam" />
90 <param name="extract_umi_method" value="read_id" />
91 <param name="group_output" value="True" />
92 <param name="method" value="unique" />
93 <output name="group_out" file="group_out3.tab" />
94 <output name="output" file="group_out3.bam" />
95 </test>
96 <test>
97 <param name="input" value="group_in4.bam" ftype="bam" />
98 <param name="extract_umi_method" value="tag" />
99 <param name="umi_tag" value="BX" /> <param name="group_output" value="True" />
100 <param name="method" value="unique" />
101 <output name="group_out" file="group_out4.tab" />
102 <output name="output" file="group_out4.bam" />
103 </test>
104 <test>
105 <param name="input" value="group_in5.bam" ftype="bam" />
106 <param name="extract_umi_method" value="read_id" />
107 <param name="umi_tag" value="BX" />
108 <param name="method" value="cluster" />
109 <output name="output" file="group_out5.bam" />
110 </test>
111 <test>
112 <param name="input" value="group_in6.bam" ftype="bam" />
113 <param name="extract_umi_method" value="read_id" />
114 <param name="umi_tag" value="BX" />
115 <param name="method" value="directional" />
116 <output name="output" file="group_out6.bam" />
117 </test>
118 </tests>
119 <help><![CDATA[
120 umi_tools group - Group reads based on their UMI
121 ================================================
122
123 Purpose
124 -------
125
126 The purpose of this command is to identify groups of reads based on
127 their genomic coordinate and UMI. It is assumed that the FASTQ files
128 were processed with umi_tools extract before mapping and thus the UMI is
129 the last word of the read name. e.g:
130
131 @HISEQ:87:00000000_AATT
132
133 where AATT is the UMI sequence.
134
135 If you have used an alternative method which does not separate the
136 read id and UMI with a "_", such as bcl2fastq which uses ":", you can
137 specify the separator with the option "--umi-separator=<sep>",
138 replacing <sep> with e.g ":".
139
140 Alternatively, if your UMIs are encoded in a tag, you can specify this
141 by setting the option --extract-umi-method=tag and set the tag name
142 with the --umi-tag option. For example, if your UMIs are encoded in
143 the 'UM' tag, provide the following options:
144 "--extract-umi-method=tag --umi-tag=UM"
145
146 By default, reads are considered identical if they have the same start
147 coordinate, are on the same strand, and have the same UMI. Optionally,
148 splicing status can be considered (see below).
149
150 The start position of a read is considered to be the start of its alignment
151 minus any soft clipped bases. A read aligned at position 500 with
152 cigar 2S98M will be assumed to start at position 498.
153
154 Methods
155 -------
156
157 group can be run with multiple methods to identify group of reads with
158 the same (or similar) UMI(s). All methods start by identifying the
159 reads with the same mapping position.
160
161 The simplest method, "unique", groups reads with the exact same
162 UMI. The network-based methods, "cluster", "adjacency" and
163 "directional", build networks where nodes are UMIs and edges connect
164 UMIs with an edit distance <= threshold (usually 1). The groups of
165 reads are then defined from the network in a method-specific manner.
166
167 Note that the "percentile" method used with the dedup command is not
168 available with group. This is because this method does not group
169 similar UMIs as per the network methods. Instead it applies a
170 threshold for inclusion of the UMI in the output and excluded UMIs are
171 not assigned to a "true" UMI.
172
173 "unique"
174 Reads group share the exact same UMI
175
176 "cluster"
177 Identify clusters of connected UMIs (based on hamming distance
178 threshold). Each network is a read group
179
180 "directional"
181 Identify clusters of connected UMIs (based on hamming distance
182 threshold) and umi A counts >= (2* umi B counts) - 1. Each
183 network is a read group.
184
185 The group command can be used to create two types of outfile: a tagged
186 BAM or a flatfile describing the read groups
187
188 To generate the tagged-BAM file, use the option --output-bam and
189 provide a filename with the -S option. Alternatively, if you do not
190 provide a filename, the bam file will be outputted to the stdout. If
191 you have provided the --log/-L option to send the logging output
192 elsewhere, you can pipe the output from the group command directly to
193 e.g samtools sort like so:
194
195 ``umi_tools group -I inf.bam --group-out=grouped.tsv --output-bam --log=group.log --paired | samtools sort - -o grouped_sorted.bam``
196
197 The tagged-BAM file will have two tags per read:
198
199 - UG = Unique_id.
200 0-indexed unique id number for each group of reads with the same genomic position and UMI or UMIs inferred to be from the same true UMI + errors
201
202 - BX = Final UMI.
203 The inferred true UMI for the group
204
205 To generate the flatfile describing the read groups, include the
206 --group-out=<filename> option. The columns of the read groups file are
207 below. The first five columns relate to the read. The final 3 columns
208 relate to the group.
209
210 - read_id
211 read identifier
212
213 - contig
214 alignment contig
215
216 - position
217 Alignment position. Note that this position is not the start position of the read in the BAM file but the start of the read taking into account the read strand and cigar
218
219 - umi
220 The read UMI
221
222 - umi_count
223 The number of times this UMI is observed for reads at the same position
224
225 - final_umi
226 The inferred true UMI for the group
227
228 - final_umi_count
229 The total number of reads within the group
230
231 - unique_id
232 The unique id for the group
233
234
235 Options
236 -------
237
238 --extract-umi-method (choice)
239 How are the UMIs encoded in the read?
240
241 Options are:
242
243 - "read_id" (default)
244 UMIs contained at the end of the read separated as
245 specified with --umi-separator option
246
247 - "tag"
248 UMIs contained in a tag, see --umi-tag option
249
250 --umi-separator (string)
251 Separator between read id and UMI. See --extract-umi-method above
252
253 --umi-tag (string)
254 Tag which contains UMI. See --extract-umi-method above
255
256 --method (choice, string)
257 Method used to identify PCR duplicates within reads. All methods
258 start by identifying the reads with the same mapping position
259
260 Options are:
261
262 - "unique"
263 Reads group share the exact same UMI
264
265 - "cluster"
266 Identify clusters of connected UMIs (based on edit distance
267 threshold). Each network is a read group
268
269 - "directional"
270 Identify clusters of connected UMIs (based on edit distance
271 threshold) and umi A counts >= (2* umi B counts) - 1. Each
272 network is a read group.
273
274 --edit-distance-threshold (int)
275 For the adjacency and cluster methods the threshold for the
276 edit distance to connect two UMIs in the network can be
277 increased. The default value of 1 works best unless the UMI is
278 very long (>14bp)
279
280 --paired
281 BAM is paired end - output both read pairs. This will also
282 force the use of the template length to determine reads with
283 the same mapping coordinates.
284
285 --spliced-is-unique
286 Causes two reads that start in the same position on the same
287 strand and having the same UMI to be considered unique if one is
288 spliced and the other is not. (Uses the 'N' cigar operation to test
289 for splicing)
290
291 --soft-clip-threshold (int)
292 Mappers that soft clip, will sometimes do so rather than mapping a
293 spliced read if there is only a small overhang over the exon
294 junction. By setting this option, you can treat reads with at least
295 this many bases soft-clipped at the 3' end as spliced.
296
297 --multimapping-detection-method (string, choice)
298 If the sam/bam contains tags to identify multimapping reads, you can
299 specify for use when selecting the best read at a given loci.
300 Supported tags are "NH", "X0" and "XT". If not specified, the read
301 with the highest mapping quality will be selected
302
303 --read-length
304 Use the read length as a criterion when deduping, e.g. for sRNA-Seq
305
306 --whole-contig
307 Consider all alignments to a single contig together. This is useful if
308 you have aligned to a transcriptome multi-fasta
309
310 --subset (float, [0-1])
311 Only consider a fraction of the reads, chosen at random. This is useful
312 for doing saturation analyses.
313
314 --chrom
315 Only consider a single chromosome. This is useful for debugging purposes
316
317 --per-contig (string)
318 Deduplicate per contig (field 3 in BAM; RNAME).
319 All reads with the same contig will be
320 considered to have the same alignment position. This is useful
321 if your library prep generates PCR duplicates with non identical
322 alignment positions such as CEL-Seq. In this case, you would
323 align to a reference transcriptome with one transcript per gene
324
325 --per-gene (string)
326 Deduplicate per gene. As above except with this option you can
327 align to a reference transcriptome with more than one transcript
328 per gene. You need to also provide --gene-transcript-map option.
329 This will also add a metacontig ('MC') tag to the reads if used
330 in conjunction with --output-bam
331
332 --gene-transcript-map (string)
333 File mapping genes to transcripts (tab separated), e.g:
334
335 gene1 transcript1
336 gene1 transcript2
337 gene2 transcript3
338
339 --gene-tag (string)
340 Deduplicate per gene. As per --per-gene except here the gene
341 information is encoded in the bam read tag specified so you do
342 not need to supply --gene-transcript-map
343
344 --group-out (string, filename)
345 Output a flatfile describing the read groups
346
347 --output-bam (string, filename)
348 Output a tagged bam file to stdout or -S <filename>
349
350 -i, --in-sam/-o, --out-sam
351 By default, inputs are assumed to be in BAM format and output are output
352 in BAM format. Use these options to specify the use of SAM format for
353 inputs or outputs.
354
355 -I (string, filename) input file name
356 The input file must be sorted and indexed.
357
358 -S (string, filename) output file name
359
360 -L (string, filename) log file name
361
362 Usage
363 -----
364 umi_tools group -I infile.bam --output-bam -S grouped.bam -L group.log --
365
366 ]]></help>
367 <expand macro="citations" />
368 </tool>