diff qualimap_rnaseq.xml @ 0:613e6446ea5d draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/qualimap commit b4d43001cc0caa14d760c347fa1c416929f769b2"
author iuc
date Thu, 10 Oct 2019 17:41:10 -0400
parents
children ce0da6c9f49e
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/qualimap_rnaseq.xml	Thu Oct 10 17:41:10 2019 -0400
@@ -0,0 +1,383 @@
+<tool id="qualimap_rnaseq" name="QualiMap RNA-Seq QC" version="@VERSION@">
+    <macros> <!-- shared definitions are pulled in from the macro file imported below -->
+        <import>qualimap_macros.xml</import>
+    </macros>
+    <expand macro="requirements" /> <!-- macro not defined in this file; presumably provided by the imported macro file -->
+    <expand macro="version_command" /> <!-- likewise defined outside this file -->
+    <command detect_errors="exit_code"><![CDATA[
+        @SET_JAVA_OPTS@ &&
+        qualimap rnaseq
+        -bam '${seq_info.input}'
+        -gtf '$features'
+        ${seq_info.treat_as_pe}
+        ${seq_info.sorted}
+        ${counts_out.report_counts}
+        --sequencing-protocol ${read_filtering.library_type}
+        --algorithm ${read_filtering.treat_multimappers}
+        -outdir results -outformat html &&
+        ## These two names are not read anywhere in this file; presumably the macro token at the end of the command uses them.
+        #set $report_name = 'qualimapReport'
+        #set $summary_report = 'rnaseq_qc_results.txt'
+        #if str($counts_out.report_counts):
+            ## A dataset name may contain a single quote, which would end the quoted printf argument below, so neutralize it.
+            #set $ccol_name = (str($counts_out.ccol_name).strip() or str($seq_info.input.name).replace(' ', '_')).replace("'", '_')
+            printf '#GeneID\t%s\n' '$ccol_name' > '$output_counts' &&
+            cat results/counts.txt >> '$output_counts' &&
+        #end if
+        @MASSAGE_OUTPUT@
+    ]]></command>
+    <inputs>
+        <conditional name="seq_info"> <!-- counting mode decides the accepted BAM flavour and the hidden sort flag -->
+            <param name="treat_as_pe" argument="-pe" type="select"
+                   label="Counting mode"
+                   help="You will usually want to choose 'Count fragments' for paired-end data. For single-end data, choose 'Count reads'. See tool help below.">
+                <option value="">Count reads</option>
+                <option value="--paired">Count fragments</option>
+            </param>
+            <when value="">
+                <param name="input" argument="-bam" type="data" format="bam"
+                       label="Mapped reads input dataset" />
+                <param name="sorted" type="hidden" value="" />
+            </when>
+            <when value="--paired">
+                <param name="input" argument="-bam" type="data" format="qname_sorted.bam"
+                       label="Mapped reads input dataset" />
+                <param name="sorted" type="hidden" value="--sorted" />
+            </when>
+        </conditional>
+        <param name="features" argument="-gtf" type="data" format="gtf"
+               label="Genome annotation data" />
+        <conditional name="counts_out"> <!-- the selected option value is placed on the command line as is -->
+            <param name="report_counts" argument="-oc" type="select"
+                   label="Keep the per-gene counts data?"
+                   help="The resulting dataset can, for example, serve as input to QualiMap Counts QC for further assessment.">
+                <option value="">No, just report statistics</option>
+                <option value="-oc counts.txt">Yes, generate separate counts output</option>
+            </param>
+            <when value="" />
+            <when value="-oc counts.txt">
+                <param name="ccol_name" type="text"
+                       label="Name to use for the counts column"
+                       help="Consider using the name of the analyzed sample here. Default: Name of the mapped reads input dataset in the history" />
+            </when>
+        </conditional>
+        <section name="read_filtering" title="Read selection for counting" expanded="true">
+            <param name="library_type" argument="-p" type="select" display="radio"
+                   label="Strandedness">
+                <option value="non-strand-specific">Count reads/fragments independent of strandedness</option>
+                <option value="strand-specific-forward">Count only reads/fragments expected in forward-stranded data</option>
+                <option value="strand-specific-reverse">Count only reads/fragments expected in reverse-stranded data</option>
+            </param>
+            <param name="treat_multimappers" argument="-a" type="select" display="radio"
+                   label="Multimapping reads">
+                <option value="uniquely-mapped-reads">Count uniquely mapped reads only</option>
+                <option value="proportional">Count also multimapping reads</option>
+            </param>
+        </section>
+    </inputs>
+    <outputs>
+        <data name="output_html" format="html"
+              label="${tool.name} report on ${on_string}" />
+        <data name="output_counts" format="tsv"
+              label="${tool.name} counts on ${on_string}">
+            <filter>str(counts_out['report_counts'])</filter> <!-- kept only when the counts option has a non-empty value -->
+        </data>
+        <collection name="raw_data" type="list"
+                    label="Raw data for ${tool.name} on ${on_string}">
+            <data name="rnaseq_qc_results" format="txt" from_work_dir="results/summary_report.txt" />
+            <data name="coverage_profile_along_genes_high" format="tsv" from_work_dir="results/coverage_profile_along_genes_high.txt" />
+            <data name="coverage_profile_along_genes_low" format="tsv" from_work_dir="results/coverage_profile_along_genes_low.txt" />
+            <data name="coverage_profile_along_genes_total" format="tsv" from_work_dir="results/coverage_profile_along_genes_total.txt" />
+        </collection>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="6"> <!-- read counting; counts_out and read_filtering are not set in this test -->
+            <conditional name="seq_info">
+                <param name="treat_as_pe" value="" />
+                <param name="input" value="test_mapped_reads.bam" />
+            </conditional>
+            <param name="features" value="features.gtf" />
+            <output name="output_html" ftype="html">
+                <assert_contents>
+                    <has_text text="Qualimap report: RNA Seq QC" />
+                </assert_contents>
+            </output>
+            <output_collection name="raw_data" type="list"> <!-- only the summary element is compared, allowing 4 differing lines -->
+                <element name="rnaseq_qc_results" file="rnaseq_qc_results_default.txt" ftype="txt" compare="diff" lines_diff="4" />
+            </output_collection>
+        </test>
+        <test expect_num_outputs="7"> <!-- fragment counting with a counts dataset requested, hence one more output than above -->
+            <conditional name="seq_info">
+                <param name="treat_as_pe" value="--paired" />
+                <param name="input" value="test_mapped_reads.bam" />
+            </conditional>
+            <param name="features" value="features.gtf" />
+            <conditional name="counts_out">
+                <param name="report_counts" value="-oc counts.txt" />
+                <param name="ccol_name" value="try_this" /> <!-- custom header for the counts column -->
+            </conditional>
+            <section name="read_filtering">
+                <param name="library_type" value="strand-specific-forward" />
+                <param name="treat_multimappers" value="proportional" />
+            </section>
+            <output name="output_html" ftype="html">
+                <assert_contents>
+                    <has_text text="Qualimap report: RNA Seq QC" />
+                </assert_contents>
+            </output>
+            <output name="output_counts" file="rnaseq_qc_counts_custom.txt" ftype="tsv" />
+            <output_collection name="raw_data" type="list">
+                <element name="rnaseq_qc_results" file="rnaseq_qc_results_custom.txt" ftype="txt" compare="diff" lines_diff="4" />
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+**QualiMap RNA-Seq QC** reports quality control metrics and bias estimations
+which are specific for whole transcriptome sequencing, including reads genomic
+origin, junction analysis, transcript coverage and 5’-3’ bias computation.
+As such, the tool complements the more general analysis with QualiMap BamQC,
+and its (optional) gene counts output can be analyzed further with QualiMap
+Counts QC.
+
+
+Input
+=====
+
+*Mapped reads input dataset*
+
+The dataset holding the mapped reads to carry out the analysis with. Typically,
+this will have been produced by a splicing-aware aligner like *HISAT2* or *RNA
+STAR*.
+
+*Genome annotation data*
+
+A GTF dataset of genomic features that mapped reads should be counted for.
+
+
+Parameters
+----------
+
+*Counting mode*
+
+Determines whether reads should be counted individually, or whether multiple
+reads originating from the same sequencing template (*i.e.*, the read and its
+mate in paired-end sequencing) should be counted as one.
+
+You will usually want to choose ``Count fragments`` for paired-end data. For
+single-end data, choose ``Count reads``.
+
+*Keep the per-gene counts data?*
+
+Controls whether the optional Counts output dataset should be produced, or not.
+
+If you choose to produce this dataset, you can use:
+
+*Name to use for the counts column* to specify the name of the second column in
+that output.
+
+Using, for example, the name of the analyzed sample here can help you keep
+track of your data, especially when joining several counts datasets into a
+count matrix later on. In addition, *QualiMap Counts QC* will reuse the
+names of counts columns as sample names.
+
+**Read selection for counting** section
+
+*Strandedness*
+
+Choose here the option that fits the strand-specificity of your sequencing
+library.
+
+The Galaxy Training Material has an excellent discussion of sequencing
+data strandedness included in the
+`Reference-based RNA-Seq data analysis <https://galaxyproject.github.io/training-material/topics/transcriptomics/tutorials/ref-based/tutorial.html#count-the-number-of-reads-per-annotated-gene>`__
+tutorial.
+
+*Multimapping reads*
+
+Choose here how to treat reads that are mapped ambiguously to several genome locations.
+
+- *Count uniquely mapped reads only* excludes multi-mapping reads
+
+- *Count also multimapping reads* activates *proportional* counting of
+  multi-mapping reads.
+
+  In this mode, each read is weighted according to the number of mapped
+  locations. For example, a read mapped to 4 different locations will add 0.25
+  to the "counts" of each of the locations it maps to. The final calculated
+  counts per feature will be converted to integer numbers.
+
+Note: Detection of multi-mapping reads by the tool relies on the ``NH`` tag of
+reads in the BAM input, so make sure the aligner used to produce the dataset is
+configured to write this tag.
+
+
+Outputs
+=======
+
+HTML Report
+-----------
+
+**Summary Section**
+
+*Reads alignment*
+
+Summarizes the mapping characteristics of the reads in the input:
+
+- total number of mapped reads
+
+  reported as left/right read mates in case of paired-end reads; excludes
+  secondary alignments
+
+  If you accidentally selected ``Count fragments`` as the *Counting mode* for
+  single-end data, these and the following count of *Number of aligned pairs*
+  will be zero.
+
+- total number of alignments
+
+  reports all alignment records found, including secondary alignments
+
+- number of secondary alignments
+
+- number of non-unique alignments
+
+  reports the number of alignment records with an ``NH`` tag greater than one;
+  corresponds to the number of alignments that will have been skipped during
+  counting when *Count uniquely mapped reads only* is selected
+
+- number of reads aligned to genes
+
+- number of ambiguous alignments
+
+  This is the number of mapped reads that span multiple annotated genes.
+  Such reads are always skipped during counting.
+
+- no feature assigned
+
+  reports the number of alignments that are not overlapping any annotated
+  feature; these may represent alignments to introns or intergenic regions, or,
+  if the number is really high, may indicate a problem with your genome
+  annotations
+
+- not aligned
+
+  number of reads not mapped by the aligner (but included in the BAM input)
+
+- strand specificity estimation (fwd/rev)
+
+  computed if *Count reads/fragments independent of strandedness* is selected;
+  estimate of the proportion of alignments in line with forward- and reverse-
+  strand-specificity of the sequencing library
+
+  Balanced proportions (*i.e.* ~ 0.5 forward- and ~ 0.5 reverse-strand support)
+  can be interpreted as likely non-strand-specificity of the sequencing library,
+  while a strand-specific library would manifest itself in a large fraction of
+  reads supporting that specific strand-specificity.
+
+*Reads genomic origin*
+
+Lists how many alignments (absolute number/fraction) fall into
+
+- exonic,
+- intronic,
+- intergenic
+
+regions, or are at least
+
+- overlapping an exon.
+
+*Transcript coverage profile*
+
+The profile provides ratios between mean coverage of 5’ regions, 3’ regions and whole transcripts.
+
+- 5’ bias
+
+  the ratio of coverage median of 5’ regions (defined as the first 100 nts) to whole transcripts
+
+- 3’ bias
+
+  the ratio of coverage median of 3’ regions (defined as the last 100 nts) to whole transcripts
+
+- 5’-3’ bias
+
+  the ratio of 5’ bias to 3’ bias.
+
+*Junction analysis*
+
+Lists the total number of reads with splice junctions and the relative
+frequency of the (up to) 10 most frequent junction sequences.
+
+
+**Plots**
+
+*Reads Genomic Origin*
+
+A pie chart showing how many read alignments fall into exonic, intronic and
+intergenic regions.
+
+*Coverage Profile Along Genes (Total)*
+
+This plot shows the mean coverage profile of all genes with non-zero
+overall coverage.
+
+*Coverage Profile Along Genes (Low)*
+
+The plot shows the mean coverage profile of the 500 genes with the lowest, but non-zero, overall coverage.
+
+*Coverage Profile Along Genes (High)*
+
+The plot shows the mean coverage profile of the 500 genes with the highest
+overall coverage.
+
+*Coverage Histogram (0-50x)*
+
+Coverage of genes from 0 to 50x. Genes with >50x coverage are added to the 50x
+bin.
+
+*Junction Analysis*
+
+This pie chart shows an analysis of the splice junctions observed in the
+alignments. It consists of three categories:
+
+- Known
+
+  observed splice junctions both sides of which are in line with the genome
+  annotation data
+
+- Partly known
+
+  observed splice junctions for which only one junction side can be deduced
+  from the genome annotation data
+
+- Novel
+
+  observed splice junctions not predicted on either side by the genome
+  annotation data
+
+
+Raw data
+--------
+
+This is a *Collection* of 4 individual datasets.
+
+Of these, the *rnaseq_qc_results* dataset provides a plain-text version of the
+*HTML report* *Summary* section.
+
+The other 3 datasets hold the tabular raw data underlying the three coverage
+profile plots in the *HTML Report*.
+
+
+Counts data
+-----------
+
+Optional. This is a 2-column tabular dataset of read or fragment counts
+(depending on the chosen *Counting mode*) per annotated gene. The first column
+lists the gene identifiers found in the *Genome annotation data*, the second
+the associated counts.
+
+This dataset represents valid (single-sample) input for the QualiMap Counts QC
+tool.
+    ]]></help>
+    <expand macro="citations" />
+</tool>