Mercurial > repos > iuc > umi_tools_extract
comparison macros.xml @ 15:27ac32a22ad2 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/umi_tools commit bf6a3aa532e8f9d122da4c1e39f3e256ae587b79"
author | iuc |
---|---|
date | Mon, 13 Sep 2021 14:52:06 +0000 |
parents | d5ff68d2d5ff |
children | 7accf7407811 |
comparison
equal
deleted
inserted
replaced
14:9fa7803d1c51 | 15:27ac32a22ad2 |
---|---|
1 <?xml version="1.0"?> | 1 <?xml version="1.0"?> |
2 <macros> | 2 <macros> |
3 | |
4 <!-- macros applying to all umi_tools --> | |
5 | |
6 <token name="@TOOL_VERSION@">1.1.2</token> | |
7 <token name="@VERSION_SUFFIX@">0</token> | |
8 <token name="@PROFILE@">21.01</token> | |
9 <xml name="requirements"> | |
10 <requirements> | |
11 <requirement type="package" version="@TOOL_VERSION@">umi_tools</requirement> | |
12 <yield /> | |
13 </requirements> | |
14 </xml> | |
15 <xml name="citations"> | |
16 <citations> | |
17 <citation type="doi">10.1101/gr.209601.116</citation> | |
18 <citation type="bibtex"> | |
19 @misc{githubUMI-tools, | |
20 title = {UMI-tools}, | |
21 publisher = {GitHub}, | |
22 journal = {GitHub repository}, | |
23 url = {https://github.com/CGATOxford/UMI-tools}, | |
24 } | |
25 </citation> | |
26 </citations> | |
27 </xml> | |
28 <xml name="advanced_options_macro"> | |
29 <section name="advanced" title="Extra parameters" expanded="false"> | |
30 <param argument="--random-seed" type="integer" min="0" optional="true" label="Random Seed" /> | |
31 </section> | |
32 </xml> | |
33 <token name="@ADVANCED_OPTIONS@"><![CDATA[ | |
34 #if str($advanced.random_seed) != '' | |
35 --random-seed='$advanced.random_seed' | |
36 #end if | |
37 ]]></token> | |
38 | |
39 <!-- macros for extract and whitelist--> | |
40 | |
3 <macro name="barcode_sanitizer" > | 41 <macro name="barcode_sanitizer" > |
4 <sanitizer invalid_char=""> | 42 <sanitizer invalid_char=""> |
5 <valid initial="string.letters,string.digits"> | 43 <valid initial="string.letters,string.digits"> |
6 <add value="*" /><!-- asterisk --> | 44 <add value="*" /><!-- asterisk --> |
7 <add value="," /><!-- comma --> | 45 <add value="," /><!-- comma --> |
21 <add value="-"/> | 59 <add value="-"/> |
22 <add value="!"/> | 60 <add value="!"/> |
23 </valid> | 61 </valid> |
24 </sanitizer> | 62 </sanitizer> |
25 </macro> | 63 </macro> |
26 <macro name="barcode2_conditional" > | 64 <xml name="sanitize_tag" > |
27 <conditional name="barcode"> | 65 <sanitizer invalid_char=""> |
28 <param name="barcode_select" argument="--split-barcode" type="select" label="Barcode on both reads?"> | 66 <valid initial="string.letters,string.digits" /> |
29 <option value="first_read_only">Barcode on first read only</option> | 67 </sanitizer> |
30 <option value="both_reads">Barcode on both reads</option> | 68 </xml> |
31 </param> | 69 <macro name="barcode1_macro" > |
32 <when value="first_read_only"/> | 70 <param argument="--bc-pattern" type="text" label="Barcode pattern for first read" |
33 <when value="both_reads"> | 71 help="Use this option to specify the format of the UMI/barcode. Use Ns to |
34 <param name="bc_pattern2" argument="--bc-pattern2" type="text" value="" label="Barcode pattern for second read" | 72 represent the random positions and Xs to indicate the bc positions. |
35 help="Use this option to specify the format of the UMI/barcode for | 73 Bases with Ns will be extracted and added to the read name. Remaining |
36 the second read pair if required." > | 74 bases, marked with an X will be reattached to the read"> |
37 <expand macro="barcode_sanitizer" /> | 75 <validator type="empty_field" /> |
38 </param> | 76 <expand macro="barcode_sanitizer" /> |
39 </when> | 77 </param> |
40 </conditional> | |
41 </macro> | 78 </macro> |
79 <macro name="barcode2_macro" > | |
80 <param argument="--bc-pattern2" type="text" value="" label="Barcode pattern for second read" | |
81 help="Use this option to specify the format of the UMI/barcode for | |
82 the second read pair if required" > | |
83 <expand macro="barcode_sanitizer" /> | |
84 </param> | |
85 </macro> | |
86 <!-- not just fastq because this would allow also fastqcsanger --> | |
87 <token name="@FASTQ_FORMATS@">fastqsanger,fastqsanger.gz,fastqillumina,fastqillumina.gz,fastqsolexa,fastqsolexa.gz</token> | |
88 <xml name="bio_tools"> | |
89 <xrefs> | |
90 <xref type="bio.tools">umi-tools</xref> | |
91 </xrefs> | |
92 </xml> | |
42 <xml name="input_types"> | 93 <xml name="input_types"> |
43 <conditional name="input_type"> | 94 <conditional name="input_type_cond"> |
44 <param name="type" type="select" label="Library type"> | 95 <param name="input_type" type="select" label="Library type"> |
45 <option value="single">Single-end</option> | 96 <option value="single">Single-end</option> |
46 <option value="paired">Paired-end</option> | 97 <option value="paired">Paired-end</option> |
47 <option value="paired_collection">Paired-end Dataset Collection</option> | 98 <option value="paired_collection">Paired-end Dataset Collection</option> |
48 </param> | 99 </param> |
49 <when value="single"> | 100 <when value="single"> |
50 <param name="input_single" type="data" format="fastq,fastq.gz" label="Reads in FASTQ format" /> | 101 <param name="input_read1" type="data" format="@FASTQ_FORMATS@" label="Reads in FASTQ format" /> |
102 <expand macro="barcode1_macro"/> | |
51 </when> | 103 </when> |
52 <when value="paired"> | 104 <when value="paired"> |
53 <param name="input_read1" type="data" format="fastq,fastq.gz" label="Reads in FASTQ format" /> | 105 <param name="input_read1" type="data" format="@FASTQ_FORMATS@" label="Reads in FASTQ format" /> |
54 <param name="input_read2" type="data" format="fastq,fastq.gz" label="Reads in FASTQ format" /> | 106 <param name="input_read2" type="data" format="@FASTQ_FORMATS@" label="Reads in FASTQ format" /> |
55 <expand macro="barcode2_conditional" /> | 107 <expand macro="barcode1_macro"/> |
108 <expand macro="barcode2_macro"/> | |
109 <yield/> | |
56 </when> | 110 </when> |
57 <when value="paired_collection"> | 111 <when value="paired_collection"> |
58 <param name="input_readpair" type="data_collection" collection_type="paired" format="fastq,fastq.gz" label="Reads in FASTQ format" /> | 112 <param name="input_readpair" type="data_collection" collection_type="paired" format="@FASTQ_FORMATS@" label="Reads in FASTQ format" /> |
59 <expand macro="barcode2_conditional" /> | 113 <expand macro="barcode1_macro"/> |
114 <expand macro="barcode2_macro"/> | |
115 <yield/> | |
60 </when> | 116 </when> |
61 </conditional> | 117 </conditional> |
62 </xml> | 118 </xml> |
63 <xml name="citations"> | |
64 <citations> | |
65 <citation type="doi">10.1101/gr.209601.116</citation> | |
66 <citation type="bibtex"> | |
67 @misc{githubUMI-tools, | |
68 title = {UMI-tools}, | |
69 publisher = {GitHub}, | |
70 journal = {GitHub repository}, | |
71 url = {https://github.com/CGATOxford/UMI-tools}, | |
72 } | |
73 </citation> | |
74 </citations> | |
75 </xml> | |
76 <xml name="requirements"> | |
77 <requirements> | |
78 <requirement type="package" version="@VERSION@">umi_tools</requirement> | |
79 <yield /> | |
80 </requirements> | |
81 </xml> | |
82 <token name="@VERSION@">0.5.5</token> | |
83 <token name="@COMMAND_LINK@"><![CDATA[ | 119 <token name="@COMMAND_LINK@"><![CDATA[ |
84 #set $gz = False | 120 #set $gz = False |
85 #if $input_type.type == 'single': | 121 #if $input_type_cond.input_type == 'single': |
86 #if $input_type.input_single.is_of_type("fastq.gz", "fastqsanger.gz"): | 122 #if $input_type_cond.input_read1.is_of_type("fastq.gz", "fastqsanger.gz"): |
87 ln -s '$input_type.input_single' input_single.gz && | 123 ln -s '$input_type_cond.input_read1' input_single.gz && |
88 #set $gz = True | 124 #set $gz = True |
89 #else | 125 #else |
90 ln -s '$input_type.input_single' input_single.txt && | 126 ln -s '$input_type_cond.input_read1' input_single.txt && |
91 #end if | 127 #end if |
92 #elif $input_type.type == 'paired': | 128 #elif $input_type_cond.input_type == 'paired': |
93 #if $input_type.input_read1.is_of_type("fastq.gz", "fastqsanger.gz"): | 129 #if $input_type_cond.input_read1.is_of_type("fastq.gz", "fastqsanger.gz"): |
94 ln -s '$input_type.input_read1' input_read1.gz && | 130 ln -s '$input_type_cond.input_read1' input_read1.gz && |
95 ln -s '$input_type.input_read2' input_read2.gz && | 131 ln -s '$input_type_cond.input_read2' input_read2.gz && |
96 #set $gz = True | 132 #set $gz = True |
97 #else | 133 #else |
98 ln -s '$input_type.input_read1' input_read1.txt && | 134 ln -s '$input_type_cond.input_read1' input_read1.txt && |
99 ln -s '$input_type.input_read2' input_read2.txt && | 135 ln -s '$input_type_cond.input_read2' input_read2.txt && |
100 #end if | 136 #end if |
101 #else ## paired_collection | 137 #else ## paired_collection |
102 #if $input_type.input_readpair.forward.is_of_type("fastq.gz", "fastqsanger.gz"): | 138 #if $input_type_cond.input_readpair.forward.is_of_type("fastq.gz", "fastqsanger.gz"): |
103 ln -s '$input_type.input_readpair.forward' input_read1.gz && | 139 ln -s '$input_type_cond.input_readpair.forward' input_read1.gz && |
104 ln -s '$input_type.input_readpair.reverse' input_read2.gz && | 140 ln -s '$input_type_cond.input_readpair.reverse' input_read2.gz && |
105 #set $gz = True | 141 #set $gz = True |
106 #else | 142 #else |
107 ln -s '$input_type.input_readpair.forward' input_read1.txt && | 143 ln -s '$input_type_cond.input_readpair.forward' input_read1.txt && |
108 ln -s '$input_type.input_readpair.reverse' input_read2.txt && | 144 ln -s '$input_type_cond.input_readpair.reverse' input_read2.txt && |
109 #end if | 145 #end if |
110 #end if | 146 #end if |
111 ]]></token> | 147 ]]></token> |
148 | |
149 <!-- macros for count, dedup, and group --> | |
150 | |
151 <token name="@LINK_SAM_BAM_INPUT@"><![CDATA[ | |
152 #if $input.is_of_type("sam"): | |
153 ## TODO dedup has problems with SAM input in some cases | |
154 ## https://github.com/CGATOxford/UMI-tools/issues/483 | |
155 ## so convert it to sorted BAM for now | |
156 ## #set $input_file = $input | |
157 samtools sort --no-PG '$input' > 'input.bam' && | |
158 samtools index -b 'input.bam' && | |
159 #set $input_file = 'input.bam' | |
160 #else: | |
161 ln -sf '${input}' 'input.bam' && | |
162 ln -sf '$input.metadata.bam_index' 'input.bam.bai' && | |
163 #set $input_file = 'input.bam' | |
164 #end if | |
165 ]]></token> | |
166 <token name="@SET_INPUT_TYPE@"><![CDATA[ | |
167 ## TODO see comment in LINK_SAM_BAM_INPUT | |
168 ## #if $input.is_of_type("sam"): | |
169 ## --in-sam | |
170 ## #end if | |
171 ]]></token> | |
172 | |
173 <xml name="fastq_barcode_extraction_options_macro"> | |
174 <conditional name="extract_method_cond"> | |
175 <param argument="--extract-method" type="select" label="Barcode Extraction Method" | |
176 help="If bracketed expressions are used in the above barcode pattern, then set this to 'regex'. Otherwise leave as 'string'" > | |
177 <option value="string" selected="true" /> | |
178 <option value="regex" /> | |
179 </param> | |
180 <when value="string"> | |
181 <param argument="--3prime" name="prime3" type="boolean" label="Is barcode on 3' end of the read?" | |
182 truevalue="--3prime" falsevalue="" | |
183 help="By default the barcode is assumed to be on the 5' end of the read, but | |
184 use this option to specify that it is on the 3' end instead. | |
185 This option only works with ``--extract-method=string`` | |
186 since 3' encoding can be specified explicitly with a regex, e.g | |
187 ``.*(?P<umi_1>.{5})$``" /> | |
188 </when> | |
189 <when value="regex"> | |
190 <param name="filtered_out_bool" type="boolean" label="Write out reads not matching regex pattern"/> | |
191 </when> | |
192 </conditional> | |
193 <param argument="--ignore-read-pair-suffixes" type="boolean" truevalue="--ignore-read-pair-suffixes" falsevalue="" label="Ignore '\1' and '\2' read name suffixes"/> | |
194 </xml> | |
195 <token name="@FASTQ_BARCODE_EXTRACTION_OPTIONS@"><![CDATA[ | |
196 ## fastq barcode extraction options: | |
197 --extract-method='$extract_method_cond.extract_method' | |
198 --bc-pattern='$input_type_cond.bc_pattern' | |
199 #if $input_type_cond.input_type != 'single' and $input_type_cond.bc_pattern2 != '' | |
200 --bc-pattern2='$input_type_cond.bc_pattern2' | |
201 #end if | |
202 #if $extract_method_cond.extract_method == 'string' | |
203 $extract_method_cond.prime3 | |
204 #else if $extract_method_cond.filtered_out_bool | |
205 #if $input_type_cond.input_type == 'single': | |
206 --filtered-out='$filtered_out' | |
207 #else if $input_type_cond.input_type == 'paired': | |
208 --filtered-out='$filtered_out' | |
209 --filtered-out2='$filtered_out_paired' | |
210 #else | |
211 --filtered-out='$filtered_out_paired_collection.forward' | |
212 --filtered-out2='$filtered_out_paired_collection.reverse' | |
213 #end if | |
214 #end if | |
215 $ignore_read_pair_suffixes | |
216 ]]></token> | |
217 <token name="@FASTQ_BARCODE_EXTRACTION_HELP@"><![CDATA[ | |
218 There are two methods enabled to extract the umi barcode (+/- | |
219 cell barcode). For both methods, the patterns should be provided | |
220 using the ``--bc-pattern`` and ``--bc-pattern2`` options. | |
221 | |
222 - ``string`` | |
223 This should be used where the barcodes are always in the same | |
224 place in the read. | |
225 | |
226 - N = UMI position (required) | |
227 - C = cell barcode position (optional) | |
228 - X = sample position (optional) | |
229 | |
230 Bases with Ns and Cs will be extracted and added to the read | |
231 name. The corresponding sequence qualities will be removed from | |
232 the read. Bases with an X will be reattached to the read. | |
233 | |
234 E.g. If the pattern is `NNNNCC`, | |
235 Then the read:: | |
236 | |
237 @HISEQ:87:00000000 read1 | |
238 AAGGTTGCTGATTGGATGGGCTAG | |
239 + | |
240 DA1AEBFGGCG01DFH00B1FF0B | |
241 | |
242 will become:: | |
243 | |
244 @HISEQ:87:00000000_TT_AAGG read1 | |
245 GCTGATTGGATGGGCTAG | |
246 + | |
247 FGGCG01DFH00B1FF0B | |
248 | |
249 where 'TT' is the cell barcode and 'AAGG' is the UMI. | |
250 | |
251 - ``regex`` | |
252 This method allows for more flexible barcode extraction and | |
253 should be used where the cell barcodes are variable in | |
254 length. Alternatively, the regex option can also be used to | |
255 filter out reads which do not contain an expected adapter | |
256 sequence. UMI-tools uses the regex module rather than the more | |
257 standard re module since the former also enables fuzzy matching | |
258 | |
259 The regex must contain groups to define how the barcodes are | |
260 encoded in the read. The expected groups in the regex are: | |
261 | |
262 umi_n = UMI positions, where n can be any value (required) | |
263 cell_n = cell barcode positions, where n can be any value (optional) | |
264 discard_n = positions to discard, where n can be any value (optional) | |
265 | |
266 UMI positions and cell barcode positions will be extracted and | |
267 added to the read name. The corresponding sequence qualities | |
268 will be removed from the read. | |
269 | |
270 Discard bases and the corresponding quality scores will be | |
271 removed from the read. All bases matched by other groups or | |
272 components of the regex will be reattached to the read sequence | |
273 | |
274 For example, the following regex can be used to extract reads | |
275 from the Klein et al inDrop data:: | |
276 | |
277 (?P<cell_1>.{8,12})(?P<discard_1>GAGTGATTGCTTGTGACGCCTT)(?P<cell_2>.{8})(?P<umi_1>.{6})T{3}.* | |
278 | |
279 Where only reads with a 3' T-tail and `GAGTGATTGCTTGTGACGCCTT` in | |
280 the correct position to yield two cell barcodes of 8-12 and 8bp | |
281 respectively, and a 6bp UMI will be retained. | |
282 | |
283 You can also specify fuzzy matching to allow errors. For example if | |
284 the discard group above was specified as below this would enable | |
285 matches with up to 2 errors in the discard_1 group. | |
286 | |
287 :: | |
288 | |
289 (?P<discard_1>GAGTGATTGCTTGTGACGCCTT){s<=2} | |
290 | |
291 Note that all UMIs must be the same length for downstream | |
292 processing with dedup, group or count commands]]></token> | |
293 | |
294 <xml name="barcode_options_macro"> | |
295 <conditional name="bc" > | |
296 <param argument="--extract-umi-method" type="select" label="Umi Extract Method" help="How are the barcodes encoded in the read?" > | |
297 <option value="read_id" selected="true">Barcodes are contained at the end of the read separated by a delimiter</option> | |
298 <option value="tag" >Barcodes are contained in tags</option> | |
299 <option value="umis" >Barcodes were extracted using umis</option> | |
300 </param> | |
301 <when value="read_id" > | |
302 <param argument="--umi-separator" type="text" label="Delimiter between read id and the UMI" value="_" > | |
303 <sanitizer invalid_char="" > | |
304 <valid initial="string.punctuation" /> | |
305 </sanitizer> | |
306 </param> | |
307 </when> | |
308 <when value="tag" > | |
309 <param argument="--umi-tag" type="text" label="Tag which contains the UMI" value="RX" > | |
310 <expand macro="sanitize_tag" /> | |
311 </param> | |
312 <param argument="--umi-tag-split" type="text" label="Separate the UMI in tag by SPLIT" help="and take the first element"/> | |
313 <param argument="--umi-tag-delimiter" type="text" label="Separate the UMI in tag by DELIMITER" help="and concatenate the elements"/> | |
314 <param argument="--cell-tag" type="text" label="Tag which contains the cell barcode" > | |
315 <expand macro="sanitize_tag" /> | |
316 </param> | |
317 <param argument="--cell-tag-split" type="text" label="Separate the cell barcode in tag by SPLIT" help="and take the first element"/> | |
318 <param argument="--cell-tag-delimiter" type="text" label="Separate the cell barcode in tag by DELIMITER" help="and concatenate the elements"/> | |
319 </when> | |
320 <when value="umis"/> | |
321 </conditional> | |
322 </xml> | |
323 <token name="@BARCODE_OPTIONS@"><![CDATA[ | |
324 --extract-umi-method $bc.extract_umi_method | |
325 #if str($bc.extract_umi_method) == 'read_id': | |
326 --umi-separator '$bc.umi_separator' | |
327 #else if str($bc.extract_umi_method) == 'tag': | |
328 --umi-tag '$bc.umi_tag' | |
329 #if $bc.umi_tag_split != '' | |
330 --umi-tag-split '$bc.umi_tag_split' | |
331 #end if | |
332 #if $bc.umi_tag_delimiter != '' | |
333 --umi-tag-delimiter '$bc.umi_tag_delimiter' | |
334 #end if | |
335 --cell-tag '$bc.cell_tag' | |
336 #if $bc.cell_tag_split != '' | |
337 --cell-tag-split '$bc.cell_tag_split' | |
338 #end if | |
339 #if $bc.cell_tag_delimiter != '' | |
340 --cell-tag-delimiter '$bc.cell_tag_delimiter' | |
341 #end if | |
342 #end if | |
343 ]]></token> | |
344 <token name="@BARCODE_HELP@"><![CDATA[ | |
345 Extracting barcodes | |
346 ------------------- | |
347 | |
348 It is assumed that the FASTQ files were processed with ``umi_tools | |
349 extract`` before mapping and thus the UMI is the last word of the read | |
350 name. e.g: | |
351 | |
352 @HISEQ:87:00000000_AATT | |
353 | |
354 where ``AATT`` is the UMI sequence. | |
355 | |
356 If you have used an alternative method which does not separate the | |
357 read id and UMI with a "_", such as bcl2fastq which uses ":", you can | |
358 specify the separator with the option ``--umi-separator=<sep>``, | |
359 replacing <sep> with e.g ":". | |
360 | |
361 Alternatively, if your UMIs are encoded in a tag, you can specify this | |
362 by setting the option --extract-umi-method=tag and set the tag name | |
363 with the --umi-tag option. For example, if your UMIs are encoded in | |
364 the 'UM' tag, provide the following options: | |
365 ``--extract-umi-method=tag`` ``--umi-tag=UM`` | |
366 | |
367 Finally, if you have used umis to extract the UMI +/- cell barcode, | |
368 you can specify ``--extract-umi-method=umis`` | |
369 | |
370 The start position of a read is considered to be the start of its alignment | |
371 minus any soft clipped bases. A read aligned at position 500 with | |
372 cigar 2S98M will be assumed to start at position 498.]]></token> | |
373 | |
374 | |
375 <xml name="umi_grouping_options_macro"> | |
376 <section name="umi" title="UMI grouping options"> | |
377 <param argument="--method" type="select" label="Method used to identify PCR duplicates within reads" help="All methods start by identifying the reads with the same mapping position"> | |
378 <option value="unique">Reads group share the exact same UMI</option> | |
379 <option value="percentile">Reads group share the exact same UMI. UMIs with counts less than 1% of the median counts for UMIs at the same position are ignored</option> | |
380 <option value="cluster">Identify clusters based on hamming distance</option> | |
381 <option value="adjacency">Identify clusters based on hamming distance and resolve networks by using the node counts</option> | |
382 <option value="directional">Identify clusters based on distance and counts, restrict network expansion by threshold</option> | |
383 </param> | |
384 <param argument="--edit-distance-threshold" type="integer" value="1" label="Edit distance threshold" help="For the adjacency and cluster methods the threshold for the edit distance to connect two UMIs in the network can be increased. The default value of 1 works best unless the UMI is very long (>14bp)" /> | |
385 <param argument="--spliced-is-unique" type="boolean" truevalue="--spliced-is-unique" falsevalue="" label="Spliced reads are unique" help="Causes two reads that start in the same position on the same strand and having the same UMI to be considered unique if one is spliced and the other is not. (Uses the 'N' cigar operation to test for splicing)" /> | |
386 <param argument="--soft-clip-threshold" type="integer" value="4" label="Soft clip threshold" help="Mappers that soft clip, will sometimes do so rather than mapping a spliced read if there is only a small overhang over the exon junction. By setting this option, you can treat reads with at least this many bases soft-clipped at the 3' end as spliced" /> | |
387 <param argument="--read-length" type="boolean" truevalue="--read-length" falsevalue="" label="Use the read length as a criterion when deduping" /> | |
388 </section> | |
389 </xml> | |
390 <token name="@UMI_GROUPING_OPTIONS@"><![CDATA[ | |
391 --method $umi.method | |
392 --edit-distance-threshold $umi.edit_distance_threshold | |
393 $umi.spliced_is_unique | |
394 --soft-clip-threshold $umi.soft_clip_threshold | |
395 $umi.read_length | |
396 ]]></token> | |
397 <token name="@UMI_GROUPING_HELP@"><![CDATA[ | |
398 UMI grouping options | |
399 -------------------- | |
400 | |
401 Grouping Method | |
402 ............... | |
403 | |
404 What method to use to identify group of reads with the same (or | |
405 similar) UMI(s)? | |
406 | |
407 All methods start by identifying the reads with the same mapping position. | |
408 | |
409 The simplest methods, unique and percentile, group reads with | |
410 the exact same UMI. The network-based methods, cluster, adjacency and | |
411 directional, build networks where nodes are UMIs and edges connect UMIs | |
412 with an edit distance <= threshold (usually 1). The groups of reads | |
413 are then defined from the network in a method-specific manner. For all | |
414 the network-based methods, each read group is equivalent to one read | |
415 count for the gene. | |
416 | |
417 - unique | |
418 Reads group share the exact same UMI | |
419 | |
420 - percentile | |
421 Reads group share the exact same UMI. UMIs with counts < 1% of the | |
422 median counts for UMIs at the same position are ignored. | |
423 | |
424 - cluster | |
425 Identify clusters of connected UMIs (based on hamming distance | |
426 threshold). Each network is a read group | |
427 | |
428 - adjacency | |
429 Cluster UMIs as above. For each cluster, select the node (UMI) | |
430 with the highest counts. Visit all nodes one edge away. If all | |
431 nodes have been visited, stop. Otherwise, repeat with remaining | |
432 nodes until all nodes have been visited. Each step | |
433 defines a read group. | |
434 | |
435 - directional (default) | |
436 Identify clusters of connected UMIs (based on hamming distance | |
437 threshold) and umi A counts >= (2* umi B counts) - 1. Each | |
438 network is a read group. | |
439 | |
440 ]]></token> | |
441 | |
442 <xml name="sambam_options_macro"> | |
443 <section name="sambam" title="SAM/BAM options"> | |
444 <param argument="--mapping-quality" type="integer" value="0" label="Minimum mapping quality for a read to be retained"/> | |
445 <param argument="--unmapped-reads" type="select" label="How to handle unmapped reads"> | |
446 <option value="discard">discard</option> | |
447 <option value="use">use</option> | |
448 <option value="correct">correct</option> | |
449 </param> | |
450 <param argument="--chimeric-pairs" type="select" optional="true" label="How to handle chimeric read pairs (default: use)"> | |
451 <option value="discard">discard</option> | |
452 <option value="use">use</option> | |
453 <option value="correct">correct</option> | |
454 </param> | |
455 <param argument="--unpaired-reads" type="select" optional="true" label="How to handle unpaired reads (default: use)"> | |
456 <option value="discard">discard</option> | |
457 <option value="use">use</option> | |
458 <option value="correct">correct</option> | |
459 </param> | |
460 <param argument="--ignore-umi" type="boolean" truevalue="--ignore-umi" falsevalue="" label="Ignore UMI and dedup only on position"/> | |
461 <param argument="--ignore-tlen" type="boolean" truevalue="--ignore-tlen" falsevalue="" label="Dedup paired end reads based solely on read1" help="whether or not the template length is the same"/> | |
462 <param argument="--chrom" type="text" value="" label="Consider only chromosome" help="If a value is given only a single chromosome with the given name is considered"/> | |
463 <param argument="--subset" type="float" min="0.0" max="1.0" value="1.0" label="Only consider a random selection of the reads" /> | |
464 <!--in-sam is set automatically--> | |
465 <param argument="--paired" type="boolean" truevalue="--paired" falsevalue="" label="BAM is paired end" help="This will also force the use of the template length to determine reads with the same mapping coordinates" /> | |
466 </section> | |
467 </xml> | |
468 <token name="@SAMBAM_OPTIONS@"><![CDATA[ | |
469 --mapping-quality $sambam.mapping_quality | |
470 --unmapped-reads $sambam.unmapped_reads | |
471 #if $sambam.chimeric_pairs | |
472 --chimeric-pairs $sambam.chimeric_pairs | |
473 #end if | |
474 #if $sambam.unpaired_reads | |
475 --unpaired-reads $sambam.unpaired_reads | |
476 #end if | |
477 $sambam.ignore_umi | |
478 $sambam.ignore_tlen | |
479 #if str($sambam.chrom) != '' | |
480 --chrom '$sambam.chrom' | |
481 #end if | |
482 --subset $sambam.subset | |
483 $sambam.paired | |
484 @SET_INPUT_TYPE@ | |
485 ]]></token> | |
486 | |
487 <!-- per-gene is hard coded in count https://github.com/CGATOxford/UMI-tools/blob/c3ead0792ad590822ca72239ef01b8e559802da9/umi_tools/count.py#L92 | |
488 hence we need a specialized macro here | |
489 TODO count used XF as default for gene-tag now I set it explicitly for the tests but we could as well parametrize the macro and set tool specific defaults | |
490 --> | |
491 | |
492 <xml name="fullsc_options_macro"> | |
493 <expand macro="sc_options_macro"> | |
494 <param argument="--per-gene" type="boolean" truevalue="--per-gene" falsevalue="" label="Deduplicate per gene" | |
495 help="Must combine with either --gene-tag or --per-contig. As for --per-contig except with this option you can align to a reference transcriptome with more than one transcript per gene. You need to also provide a map of genes to transcripts. This will also add a metacontig ('MC') tag to the output BAM file" /> | |
496 </expand> | |
497 </xml> | |
498 <token name="@FULLSC_OPTIONS@"><![CDATA[ | |
499 $sc.per_gene | |
500 @SC_OPTIONS@ | |
501 ]]></token> | |
502 | |
503 <xml name="sc_options_macro"> | |
504 <section name="sc" title="Single-cell RNA-Seq options"> | |
505 <yield/> | |
506 <param argument="--gene-tag" type="text" optional="true" label="Deduplicate by this gene tag" help="As --per-gene except here the gene information is encoded in the bam read tag specified so you do not need to supply the mapping file"> | |
507 <expand macro="sanitize_tag" /> | |
508 </param> | |
509 <param argument="--assigned-status-tag" type="text" optional="true" label="Bam tag describing whether read is assigned to a gene" help="By default, this is set as the same tag as --gene-tag"> | |
510 <expand macro="sanitize_tag" /> | |
511 </param> | |
512 <param argument="--skip-tags-regex" name="skip_tags_regex" type="text" label="Skip any reads where the gene matches this tag" value="" > | |
513 <expand macro="barcode_sanitizer" /> | |
514 </param> | |
515 <param argument="--per-contig" type="boolean" truevalue="--per-contig" falsevalue="" label="Deduplicate per contig" help="Field 3 in BAM; RNAME. All reads with the same contig will be considered to have the same alignment position. This is useful if your library prep generates PCR duplicates with non identical alignment positions such as CEL-Seq. In this case, you would align to a reference transcriptome with one transcript per gene" /> | |
516 <param argument="--gene-transcript-map" type="data" format="tabular" optional="true" label="Tabular file mapping genes to transcripts" /> | |
517 <param argument="--per-cell" name="per_cell" type="boolean" truevalue="--per-cell" falsevalue="" label="Group reads only if they have the same cell barcode" /> | |
518 </section> | |
519 </xml> | |
520 <token name="@SC_OPTIONS@"><![CDATA[ | |
521 #if str($sc.gene_tag) != "": | |
522 --gene-tag '$sc.gene_tag' | |
523 #end if | |
524 #if str($sc.assigned_status_tag) != "": | |
525 --assigned-status-tag '$sc.assigned_status_tag' | |
526 #end if | |
527 #if str($sc.skip_tags_regex) != "": | |
528 --skip-tags-regex '$sc.skip_tags_regex' | |
529 #end if | |
530 $sc.per_contig | |
531 #if $sc.gene_transcript_map: | |
532 --gene-transcript-map '$sc.gene_transcript_map' | |
533 #end if | |
534 $sc.per_cell | |
535 ]]></token> | |
536 | |
537 <xml name="groupdedup_options_macro"> | |
538 <section name="gd" title="group/dedup specific options"> | |
539 <param argument="--buffer-whole-contig" type="boolean" truevalue="--buffer-whole-contig" falsevalue="" label="Read whole contig before outputting bundles" help="Guarantees that no reads are missed, but increases memory usage" /> | |
540 <!-- TODO this option is hidden on the CLI. Should we expose it? --> | |
541 <param argument="--whole-contig" type="boolean" truevalue="--whole-contig" falsevalue="" label="Consider all alignments to a single contig together" help="This is useful if you have aligned to a transcriptome multi-fasta" /> | |
542 <param argument="--multimapping-detection-method" type="select" optional="true" label="BAM Tag indicating multimapping " help="Some aligners identify multimapping using bam tags. Setting this option to NH, X0 or XT will use these tags when selecting the best read amongst reads with the same position and umi"> | |
543 <option value="NH">NH</option> | |
544 <option value="X0">X0</option> | |
545 <option value="XT">XT</option> | |
546 </param> | |
547 </section> | |
548 </xml> | |
549 <token name="@GROUPDEDUP_OPTIONS@"><![CDATA[ | |
550 $gd.buffer_whole_contig | |
551 $gd.whole_contig | |
552 $gd.multimapping_detection_method | |
553 ]]></token> | |
554 | |
555 <xml name="log_input_macro"> | |
556 <param argument="--log" type="boolean" label="Output log?" truevalue="--log" falsevalue="" help="Choose if you want to generate a text file containing logging information" /> | |
557 </xml> | |
558 <xml name="log_output_macro"> | |
559 <data name="out_log" format="txt" label="${tool.name} on ${on_string}: logfile" > | |
560 <filter>log</filter> | |
561 </data> | |
562 </xml> | |
563 <token name="@LOG@"><![CDATA[ | |
564 #if $log: | |
565 --log='$out_log' | |
566 #end if | |
567 --log2stderr | |
568 ]]></token> | |
569 | |
112 </macros> | 570 </macros> |