Mercurial > repos > iuc > qualimap_bamqc
comparison qualimap_bamqc.xml @ 0:ac607906f10a draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/qualimap commit b4d43001cc0caa14d760c347fa1c416929f769b2"
author | iuc |
---|---|
date | Thu, 10 Oct 2019 17:42:04 -0400 |
parents | |
children | 4a89c6f84425 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ac607906f10a |
---|---|
1 <tool id="qualimap_bamqc" name="QualiMap BamQC" version="@VERSION@"> | |
2 <macros> | |
3 <import>qualimap_macros.xml</import> | |
4 </macros> | |
5 <expand macro="requirements" /> | |
6 <expand macro="version_command" /> | |
7 <command detect_errors="exit_code"><![CDATA[ | |
8 #import os | |
9 @SET_JAVA_OPTS@ && | |
10 | |
11 ## Set the default file names and paths where we expect tool output | |
12 ## to end up. | |
13 ## Note that most of these get overwritten further down if the user is | |
14 ## interested in regions *outside* those defined in a custom regions | |
15 ## file. | |
16 #set $out_dir = 'results' | |
17 #set $report_name = 'qualimapReport' | |
18 #set $summary_report = 'genome_results.txt' | |
19 #set $coverage_file = os.path.join($out_dir, 'coverage.txt') | |
20 ## This is the only file path that qualimap does not calculate | |
21 ## from $out_dir. | |
22 #set $per_base_coverage_target = $coverage_file | |
23 ## Assemble the Qualimap call; the Cheetah directives below add options conditionally. All paths are single-quoted for the shell. | |
24 qualimap bamqc | |
25 -bam '$input1' -outdir '$out_dir' -outformat html | |
26 --collect-overlap-pairs | |
27 #if str($stats_regions.region_select) == 'custom_regions': | |
28 -gff '${stats_regions.regions}' | |
29 #if $stats_regions.outside_stats: | |
30 #set $report_name = 'qualimapReportOutsideRegions' | |
31 #set $summary_report = 'outside_results.txt' | |
32 #set $coverage_file = os.path.join( | |
33 $out_dir, 'outside_coverage.txt' | |
34 ) | |
35 #if $per_base_coverage: | |
36 #set $per_base_coverage_target = '/dev/null' | |
37 #end if | |
38 ${stats_regions.outside_stats} | |
39 #end if | |
40 #end if | |
41 #if $per_base_coverage: | |
42 $per_base_coverage $per_base_coverage_target | |
43 #end if | |
44 -nw ${plot_specific.n_bins} | |
45 ${plot_specific.paint_chromosome_limits} | |
46 #if $plot_specific.genome_gc_distr: | |
47 --genome-gc-distr ${plot_specific.genome_gc_distr} | |
48 #end if | |
49 -hm ${plot_specific.homopolymer_size} | |
50 ## Duplicate handling: a single selected checkbox is passed through as mode 0 or 1; both selected arrive as '0,1' and are mapped to mode 2. | |
51 #if $duplicate_skipping: | |
52 --skip-duplicated | |
53 #if str($duplicate_skipping) == '0,1': | |
54 --skip-dup-mode 2 | |
55 #else: | |
56 --skip-dup-mode ${duplicate_skipping} | |
57 #end if | |
58 #end if | |
59 -nt \${GALAXY_SLOTS:-1} && | |
60 ## Move the per-base coverage file (if requested) to its Galaxy dataset, then post-process the report (MASSAGE_OUTPUT token; presumably it consumes $report_name and $summary_report, confirm in qualimap_macros.xml). | |
61 #if $per_base_coverage: | |
62 mv '$coverage_file' '$output_per_base_coverage' && | |
63 #end if | |
64 @MASSAGE_OUTPUT@ | |
65 ]]></command> | |
66 <inputs> | |
67 <param argument="-bam" name="input1" type="data" format="bam" | |
68 label="Mapped reads input dataset" /> | |
69 <conditional name="stats_regions"> <!-- whole-genome vs. region-restricted statistics; the regions dataset and the invert switch exist only in the custom_regions branch --> | |
70 <param name="region_select" type="select" label="Reference genome regions to calculate mapping statistics for"> | |
71 <option value="all">All (whole genome)</option> | |
72 <option value="custom_regions">Select regions</option> | |
73 </param> | |
74 <when value="all" /> <!-- no additional parameters in whole-genome mode --> | |
75 <when value="custom_regions"> | |
76 <param argument="-gff" name="regions" type="data" format="gff,gtf,bed" | |
77 label="Dataset specifying regions" /> | |
78 <param argument="-os" name="outside_stats" type="boolean" truevalue="--outside-stats" falsevalue="" checked="false" | |
79 label="Invert regions" | |
80 help="If selected, report read statistics *outside* the regions in the regions file." /> | |
81 </when> | |
82 </conditional> | |
83 <param argument="-oc" name="per_base_coverage" type="boolean" truevalue="--output-genome-coverage" falsevalue="" checked="false" | |
84 label="Generate per-base coverage output" | |
85 help="Produce additional tabular output listing the coverage at every site (omitting only zero-coverage positions) in the selected regions of the genome. Caution: Will generate a huge dataset for anything but small input genomes or restricted regions!" /> | |
86 <param argument="--skip-dup-mode" name="duplicate_skipping" type="select" display="checkboxes" multiple="true" optional="true" | |
87 label="Skip duplicate reads"> <!-- multi-select: the command template passes a single choice through as the mode and maps the combined value '0,1' to mode 2; with nothing selected no duplicate options are emitted --> | |
88 <option value="0" selected="true">Reads flagged as duplicates in input</option> | |
89 <option value="1">Duplicates detected by Qualimap</option> | |
90 </param> | |
91 <section name="plot_specific" title="Settings affecting specific plots" expanded="false"> <!-- options that only influence individual plots of the report; collapsed by default --> | |
92 <param argument="-nw" name="n_bins" type="integer" value="400" | |
93 label="Number of bins to use in across-reference plots" | |
94 help="Affected plots: Coverage, Mapping Quality and Insert Size across reference, Mapped reads GC-content distribution; the value determines the resolution of the affected plots. Note: The lower the value, the higher the memory usage of the tool!" /> | |
95 <param argument="-c" name="paint_chromosome_limits" type="boolean" truevalue="--paint-chromosome-limits" falsevalue="" checked="true" | |
96 label="Draw chromosome limits" | |
97 help="Affected plots: Coverage, Mapping Quality and Insert Size across reference; in across-reference plots, indicate chromosome boundaries with dotted lines and labels" /> | |
98 <param argument="-gd" name="genome_gc_distr" type="select" optional="true" | |
99 label="Plot expected GC-content distribution of the following reference genome" | |
100 help="Affected plot: Mapped reads GC-content distribution; include a precalculated GC-content distribution for the selected (Qualimap-supported) reference genome in the plot"> | |
101 <option value="hg19">Human genome (hg19)</option> | |
102 <option value="mm9">Mouse genome (mm9)</option> | |
103 <option value="mm10">Mouse genome (mm10)</option> | |
104 </param> <!-- optional: when left empty, the command template omits the genome GC distribution option altogether --> | |
105 <param argument="-hm" name="homopolymer_size" type="integer" value="3" min="2" | |
106 label="Homopolymer size" | |
107 help="Affected plot: Homopolymer indels; sets the minimal number of consecutive bases that define a homopolymer" /> | |
108 </section> | |
109 </inputs> | |
110 <outputs> | |
111 <data name="output_html" format="html" | |
112 label="${tool.name} report on ${on_string}" /> | |
113 <data name="output_per_base_coverage" format="tsv" | |
114 label="${tool.name} per-base coverage on ${on_string}"> | |
115 <filter>per_base_coverage</filter> <!-- dataset is only created when the per_base_coverage input is enabled --> | |
116 </data> | |
117 <collection name="raw_data" type="list" | |
118 label="Raw data for ${tool.name} on ${on_string}"> <!-- fixed set of ten elements, each collected from a file below the results/ working directory --> | |
119 <data name="genome_results" format="txt" from_work_dir="results/summary_report.txt" /> <!-- NOTE(review): summary_report.txt is not written by the command lines visible in this file; presumably the MASSAGE_OUTPUT token renames the Qualimap summary to this name, confirm in qualimap_macros.xml --> | |
120 <data name="coverage_across_reference" format="tsv" from_work_dir="results/coverage_across_reference.txt" /> | |
121 <data name="coverage_histogram" format="tsv" from_work_dir="results/coverage_histogram.txt" /> | |
122 <data name="genome_fraction_coverage" format="tsv" from_work_dir="results/genome_fraction_coverage.txt" /> | |
123 <data name="duplication_rate_histogram" format="tsv" from_work_dir="results/duplication_rate_histogram.txt" /> | |
124 <data name="mapped_reads_clipping_profile" format="tsv" from_work_dir="results/mapped_reads_clipping_profile.txt" /> | |
125 <data name="mapped_reads_gc-content_distribution" format="tsv" from_work_dir="results/mapped_reads_gc-content_distribution.txt" /> | |
126 <data name="mapped_reads_nucleotide_content" format="tsv" from_work_dir="results/mapped_reads_nucleotide_content.txt" /> | |
127 <data name="mapping_quality_across_reference" format="tsv" from_work_dir="results/mapping_quality_across_reference.txt" /> | |
128 <data name="mapping_quality_histogram" format="tsv" from_work_dir="results/mapping_quality_histogram.txt" /> | |
129 </collection> | |
130 </outputs> | |
131 <tests> | |
132 <test expect_num_outputs="12"> <!-- only the BAM input is set; every other parameter keeps its default --> | |
133 <param name="input1" value="test_mapped_reads.bam"/> | |
134 <output name="output_html" ftype="html"> | |
135 <assert_contents> | |
136 <has_text text="Qualimap report: BAM QC" /> | |
137 </assert_contents> | |
138 </output> | |
139 <output_collection name="raw_data" type="list"> | |
140 <element name="genome_results" file="genome_results_default.txt" ftype="txt" compare="diff" lines_diff="2" /> | |
141 </output_collection> | |
142 </test> | |
143 <test expect_num_outputs="13"> <!-- defaults plus per-base coverage, which adds one output dataset --> | |
144 <param name="input1" value="test_mapped_reads.bam" /> | |
145 <param name="per_base_coverage" value="true" /> | |
146 <output name="output_html" ftype="html"> | |
147 <assert_contents> | |
148 <has_text text="Qualimap report: BAM QC" /> | |
149 </assert_contents> | |
150 </output> | |
151 <output name="output_per_base_coverage" file="per_base_coverage_default.txt" ftype="tsv" /> | |
152 <output_collection name="raw_data" type="list"> | |
153 <element name="genome_results" file="genome_results_default.txt" ftype="txt" compare="diff" lines_diff="2" /> | |
154 </output_collection> | |
155 </test> | |
156 <test expect_num_outputs="12"> <!-- statistics restricted to the regions in features.gtf, no per-base coverage --> | |
157 <param name="input1" value="test_mapped_reads.bam"/> | |
158 <conditional name="stats_regions"> | |
159 <param name="region_select" value="custom_regions" /> | |
160 <param name="regions" value="features.gtf" /> | |
161 </conditional> | |
162 <output name="output_html" ftype="html"> | |
163 <assert_contents> | |
164 <has_text text="Qualimap report: BAM QC" /> | |
165 </assert_contents> | |
166 </output> | |
167 <output_collection name="raw_data" type="list"> | |
168 <element name="genome_results" file="genome_results_inside_features.txt" ftype="txt" compare="diff" lines_diff="2" /> | |
169 </output_collection> | |
170 </test> | |
171 <test expect_num_outputs="13"> <!-- statistics restricted to the regions in features.gtf, with per-base coverage --> | |
172 <param name="input1" value="test_mapped_reads.bam" /> | |
173 <conditional name="stats_regions"> | |
174 <param name="region_select" value="custom_regions" /> | |
175 <param name="regions" value="features.gtf" /> | |
176 </conditional> | |
177 <param name="per_base_coverage" value="true" /> | |
178 <output name="output_html" ftype="html"> | |
179 <assert_contents> | |
180 <has_text text="Qualimap report: BAM QC" /> | |
181 </assert_contents> | |
182 </output> | |
183 <output name="output_per_base_coverage" file="per_base_coverage_inside_features.txt" ftype="tsv" /> | |
184 <output_collection name="raw_data" type="list"> | |
185 <element name="genome_results" file="genome_results_inside_features.txt" ftype="txt" compare="diff" lines_diff="2" /> | |
186 </output_collection> | |
187 </test> | |
188 <test expect_num_outputs="13"> <!-- inverted regions (outside_stats enabled) with per-base coverage; exercises the outside-regions file names set in the command template --> | |
189 <param name="input1" value="test_mapped_reads.bam" /> | |
190 <conditional name="stats_regions"> | |
191 <param name="region_select" value="custom_regions" /> | |
192 <param name="regions" value="features.gtf" /> | |
193 <param name="outside_stats" value="true" /> | |
194 </conditional> | |
195 <param name="per_base_coverage" value="true" /> | |
196 <output name="output_html" ftype="html"> | |
197 <assert_contents> | |
198 <has_text text="Qualimap report: BAM QC" /> | |
199 </assert_contents> | |
200 </output> | |
201 <output name="output_per_base_coverage" file="per_base_coverage_outside_features.txt" ftype="tsv" /> | |
202 <output_collection name="raw_data" type="list"> | |
203 <element name="genome_results" file="genome_results_outside_features.txt" ftype="txt" compare="diff" lines_diff="2" /> | |
204 </output_collection> | |
205 </test> | |
206 </tests> | |
207 <help><![CDATA[ | |
208 **What it does** | |
209 | |
210 **Qualimap BAM QC** lets you evaluate the quality of aligned reads data in BAM | |
211 format. The tool summarizes basic statistics of the alignment (number of reads, | |
212 coverage, GC-content, etc.) and produces a number of useful graphs for their | |
213 interpretation. | |
214 | |
215 The analysis can be performed with any kind of sequencing data, such as | |
216 whole-genome sequencing, exome sequencing, RNA-seq or ChIP-seq data. | |
217 | |
218 In addition, it is possible to provide an annotation file so the results are | |
219 computed for the reads mapping inside (and optionally outside) of the | |
220 corresponding genomic regions, which can be especially useful for evaluating | |
221 target-enrichment sequencing studies. | |
222 | |
223 Input | |
224 ===== | |
225 | |
226 *Mapped reads input dataset* | |
227 | |
228 The dataset holding the mapped reads to carry out the analysis with. | |
229 | |
230 *Dataset specifying regions* | |
231 | |
232 If you decide to calculate mapping statistics for selected regions of the | |
233 reference genome (instead of for the whole genome), you need to specify the | |
234 regions through this additional dataset in gtf, gff or bed format. | |
235 | |
236 .. class:: infomark | |
237 | |
238 A typical problem when working with regions (and genome annotation data, in general) is potential inconsistency between the chromosome names used in the mapped reads input versus those used to define the regions. In the case of the human genome, for example, UCSC data has chromosomes starting with a 'chr' prefix, which is lacking from Ensembl data. This simple form of the problem is handled by Qualimap: if chromosome names in the regions input have a 'chr' prefix, Qualimap will add that prefix to the mapped reads chromosome names as needed. For more complex cases you will have to adjust your inputs manually. | |
239 | |
240 | |
241 Parameters | |
242 ---------- | |
243 | |
244 *Reference genome regions to calculate mapping statistics for* | |
245 | |
246 Choose whether you would like to have mapping statistics reported across | |
247 | |
248 - the entire reference genome | |
249 (as specified in the header of the mapped reads input) | |
250 | |
251 - specific regions of the reference | |
252 | |
253 In the second case, you need to select a *Dataset specifying regions* (see | |
254 above). Using the *Invert regions* switch you can then indicate whether you | |
255 want to select or exclude the regions in this dataset. | |
256 | |
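257 *Generate per-base coverage output*: produces an additional tabular dataset listing the coverage at every site with non-zero coverage in the selected regions of the genome (see *Per-base coverage* under *Outputs*). | |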
258 | |
259 *Skip duplicate reads* | |
260 | |
261 The tool lets you skip alignments of duplicate reads from the analysis. | |
262 Depending on whether you select none, either one, or both of the available | |
263 options, you can decide to: | |
264 | |
265 - not correct for duplicate reads at all (*e.g.* because you have removed them | |
266 at an earlier step with some dedicated tool) | |
267 - identify and flag duplicate reads with a dedicated tool (like ``Picard | |
268 MarkDuplicates`` or ``samtools markdup``), then have Qualimap ignore the | |
269 duplicate-flagged reads (recommended, most flexible option since other tools | |
270 can be told to ignore the same reads) | |
271 - have Qualimap identify potential duplicates by itself and ignore them | |
272 - combine external and Qualimap-internal duplicate detection for extra | |
273 stringency | |
274 | |
275 Independent of your selection, the HTML report will always list (in the | |
276 `Globals` section of the `Summary`) the number of duplicated reads estimated by | |
277 Qualimap. If you choose to skip duplicates, you will also be informed about the | |
278 number of skipped reads in that same section and, if you instruct Qualimap to | |
279 look for the duplicate flag on reads, the number of reads flagged as duplicates | |
280 will also be reported here. | |
281 | |
282 **Section: Settings affecting specific plots** | |
283 | |
284 Parameters in this section only affect some (or even only one) of the plots | |
285 contained in the HTML report (and the corresponding part of the *Raw Data* | |
286 output collection). | |
287 | |
288 For most of these options, the parameter help above should be descriptive | |
289 enough. Just a few more words on two of them: | |
290 | |
291 *Number of bins to use in across-reference plots* | |
292 | |
293 This value is used for computing the various graphs that plot information | |
294 across the reference. Basically, the reference genome gets split into the given | |
295 number of bins, and reads falling in the same bin are aggregated in the | |
296 statistics of that bin. | |
297 | |
298 Thus, the higher the number of bins, the higher the resolution of the plots, | |
299 but more bins also require longer time for their statistics to be computed. | |
300 Fewer bins, on the other hand, mean more reads will have to be aggregated per | |
301 bin and this comes with higher memory requirements. Hence, if the tool fails | |
302 with an ``Out Of Memory`` error, you may want to rerun it with a higher bin | |
303 number. | |
304 | |
305 *Plot expected GC-content distribution of the following reference genome* | |
306 | |
307 The choice of reference genomes with pre-calculated GC distributions is built | |
308 into Qualimap. | |
309 | |
310 Future releases of Qualimap may include more choices, but the current version | |
311 is limited to those offered here. | |
312 | |
313 | |
314 Outputs | |
315 ======= | |
316 | |
317 HTML Report | |
318 ----------- | |
319 | |
320 **Summary Section** | |
321 | |
322 *Globals* | |
323 | |
324 This section contains information about the total number of reads, number of mapped reads, paired-end mapping performance, read length distribution, | |
325 number of clipped reads and duplication rate (estimated from the start positions of read alignments). | |
326 | |
327 *ACGT Content* | |
328 | |
329 Nucleotide content and GC percentage in the mapped reads. | |
330 | |
331 *Coverage* | |
332 | |
333 Mean and standard deviation of the coverage depth. | |
334 | |
335 *Mapping quality* | |
336 | |
337 Mean mapping quality of the mapped reads. | |
338 | |
339 *Insert size* | |
340 | |
341 Mean, standard deviation and percentiles of the insert size distribution if applicable. The features are computed based on the TLEN field of the SAM file. | |
342 | |
343 *Mismatches and indels* | |
344 | |
345 This section reports the general alignment error rate (computed as the ratio of the total collected edit distance to the number of mapped bases), the total number of mismatches and the total number of indels (computed from the CIGAR values). Additionally, the fraction of homopolymer indels among all indels is provided. Note that the error rate and mismatch metrics are based on optional fields of a SAM record (NM for edit distance, MD for mismatches). These features are not reported if the fields are missing in the SAM file. | |
346 | |
347 *Chromosome stats* | |
348 | |
349 Number of mapped bases, mean and standard deviation of the coverage depth for each chromosome as defined by the header of the SAM file. | |
350 | |
351 For region-based analysis the information is given inside of regions, including some additional information like, for example, number of correct strand reads. | |
352 | |
353 | |
354 **Plots** | |
355 | |
356 *Coverage Across Reference* | |
357 | |
358 This plot consists of two figures. | |
359 The upper figure provides the coverage distribution (red line) and coverage | |
360 deviation across the reference sequence. | |
361 The lower figure shows GC content across reference (black line) together with | |
362 its average value (red dotted line). | |
363 | |
364 *Coverage Histogram* | |
365 | |
366 Histogram of the number of genomic locations having a given coverage rate. | |
367 The bins of the x-axis are conveniently scaled by aggregating some coverage | |
368 values in order to produce a representative histogram also in presence of the | |
369 usual NGS peaks of coverage. | |
370 | |
371 *Coverage Histogram (0-50X)* | |
372 | |
373 Similar to the previous plot, but in this graph genome locations with a | |
374 coverage greater than 50X are grouped into the last bin. | |
375 By doing so a higher resolution of the most common values for the coverage rate | |
376 is obtained. | |
377 | |
378 *Genome Fraction Coverage* | |
379 | |
380 Provides a visual way of knowing how much reference has been sequenced to at | |
381 least a given coverage rate. | |
382 This graph should be interpreted as in this example: | |
383 If one aims for a coverage rate of at least 25X (x-axis), how much of the | |
384 reference (y-axis) will be considered? | |
385 | |
386 *Duplication Rate Histogram* | |
387 | |
388 This plot shows the distribution of duplicated reads. | |
389 Due to several factors (*e.g.* amount of starting material, sample preparation, | |
390 *etc.*) it is possible that the same fragments are sequenced several times. | |
391 For some experiments where enrichment is used (*e.g.* ChIP-seq ) this is | |
392 expected to some degree. | |
393 For most experiments, however, a high duplication level of the reads indicates | |
394 some unwanted bias. | |
395 | |
396 *Mapped Reads Nucleotide Content* | |
397 | |
398 This plot shows the nucleotide content per position of the mapped reads. | |
399 | |
400 *Mapped Reads GC Content Distribution* | |
401 | |
402 This graph shows the distribution of GC-content per mapped read. | |
403 If compared with a precomputed genome distribution, this plot allows you to check | |
404 if there is a shift in the GC content. | |
405 | |
406 *Mapped Reads Clipping Profile* | |
407 | |
408 Represents the percentage of clipped bases across the reads. | |
409 Technically, the clipping is detected via SAM format CIGAR codes ‘H’ | |
410 (hard clipping) and ‘S’ (soft clipping). | |
411 In addition, the total number of clipped reads can be found in the report | |
412 `Summary` section. | |
413 | |
414 This plot is not shown if no clipped reads are found. | |
415 | |
416 *Homopolymer Indels* | |
417 | |
418 This bar plot shows the number of indels that are located within A, C, G and T | |
419 homopolymers, respectively, as well as the number of indels that are not within | |
420 any homopolymer. Large numbers of homopolymer indels may indicate a problem in | |
421 the sequencing process. | |
422 Technically, Qualimap identifies indels from the CIGAR code of the aligned | |
423 reads. Indel statistics can also be found in a dedicated section of the report | |
424 `Summary`. | |
425 | |
426 This graph is not shown if the sample doesn’t contain any indels. | |
427 | |
428 *Mapping Quality Across Reference* | |
429 | |
430 This plot provides the mapping quality distribution across the reference. | |
431 To construct the plot, the mean mapping quality is computed for each bin. | |
432 | |
433 *Mapping Quality Histogram* | |
434 | |
435 Histogram of the number of genomic locations having a given mapping quality. | |
436 To construct the histogram the mean mapping quality is computed at each genome | |
437 position with non-zero coverage and collected. | |
438 According to the SAM/BAM format specifications, the range for the mapping | |
439 quality score is [0-255]. | |
440 | |
441 *Insert Size Across Reference* | |
442 | |
443 This plot provides the insert size distribution across the reference. | |
444 Technically, the insert size of each pair of aligned reads is collected from | |
445 the SAM alignment field `TLEN`. Only positive values are taken into account. | |
446 To construct the plot, the mean insert size is computed for each bin. | |
447 | |
448 *Insert Size Histogram* | |
449 | |
450 Histogram of insert size distribution. | |
451 | |
452 | |
453 Raw Data | |
454 -------- | |
455 | |
456 This is a *Collection* of 10 individual datasets. | |
457 | |
458 The *genome_results* dataset provides a plain-text summary of key statistics, | |
459 most of which can also be found in the *Summary* section of the *HTML Report*. | |
460 | |
461 The remaining 9 datasets hold the tabular raw data underlying the plots of the corresponding names in the *HTML Report*. | |
462 | |
463 | |
464 Per-base coverage | |
465 ----------------- | |
466 | |
467 Optional. This is a tabular dataset listing the coverage of every base in the | |
468 reference genome unless that coverage is zero. Since its content is | |
469 uncompressed text, this dataset can easily become huge, and it is recommended | |
470 that you generate this dataset only for very small genomes or very limited | |
471 regions of larger genomes. | |
472 ]]> </help> | |
473 <expand macro="citations"/> | |
474 </tool> |