Mercurial > repos > iuc > stringtie
diff stringtie.xml @ 15:dd4df992d93d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/stringtie commit a834a41c94d184df80e45ffa2339723826a075b1
author | iuc |
---|---|
date | Tue, 24 Jul 2018 10:23:37 -0400 |
parents | eafd5dc95228 |
children | eba36e001f45 |
line wrap: on
line diff
--- a/stringtie.xml Fri May 04 08:37:37 2018 -0400 +++ b/stringtie.xml Tue Jul 24 10:23:37 2018 -0400 @@ -1,4 +1,4 @@ -<tool id="stringtie" name="StringTie" version="1.3.3.2"> +<tool id="stringtie" name="StringTie" version="@TOOL_VERSION@"> <description>transcript assembly and quantification</description> <macros> <import>macros.xml</import> @@ -95,12 +95,11 @@ ## Replace commas with tabs && sed -i.bak -e "s/,/\${TAB}/g" -e "s/\${CR}//g" gene_counts.csv transcript_counts.csv - #if $guide.special_outputs.keep_header: - && - head -n 1 gene_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$gene_counts' - && - head -n 1 transcript_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$transcript_counts' - #end if + ## Output header + && + head -n 1 gene_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$gene_counts' + && + head -n 1 transcript_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$transcript_counts' ## Sort count files on the first column && tail -n +2 gene_counts.csv | sort -t"\${TAB}" -k1,1 >> '$gene_counts' @@ -145,7 +144,7 @@ <conditional name="special_outputs"> <param name="special_outputs_select" type="select" label="Output files for differential expression?" help="Select to output additional files that can be used with Ballgown or DESeq2/edgeR. See Help section below for more information"> <option value="ballgown">Ballgown</option> - <option value="deseq2">DESeq2/edgeR</option> + <option value="deseq2">DESeq2/edgeR/limma-voom</option> <option value="no" selected="True">No additional output</option> </param> <when value="ballgown" /> @@ -162,7 +161,6 @@ <valid initial="string.letters,string.digits"></valid> </sanitizer> </param> - <param name="keep_header" type="boolean" checked="true" label="Output header line?" help="Keep the header line for edgeR, remove it for DESeq2" /> </when> <when value="no" /> </conditional> @@ -230,13 +228,13 @@ <!--Ensure default GTF output works --> <test expect_num_outputs="1"> <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> - <output name="output_gtf" file="stringtie_out1.gtf" ftype="gtf" lines_diff="2" /> + <output name="output_gtf" file="stringtie_out1.gtf" ftype="gtf" lines_diff="4" /> </test> <!--Ensure fraction option works --> <test expect_num_outputs="1"> <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> <param name="fraction" value="0.17" /> - <output name="output_gtf" file="stringtie_out2.gtf" ftype="gtf" lines_diff="2" /> + <output name="output_gtf" file="stringtie_out2.gtf" ftype="gtf" lines_diff="4" /> </test> <!--Ensure guide option works --> <test expect_num_outputs="1"> @@ -244,7 +242,7 @@ <param name="use_guide" value="yes" /> <param name="guide_gff_select" value="history" /> <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> - <output name="output_gtf" file="stringtie_out3.gtf" ftype="gtf" lines_diff="2" /> + <output name="output_gtf" file="stringtie_out3.gtf" ftype="gtf" lines_diff="4" /> </test> <!--Ensure guide with fraction works --> <test expect_num_outputs="1"> @@ -253,7 +251,7 @@ <param name="guide_gff_select" value="history" /> <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> <param name="fraction" value="0.17" /> - <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="2" /> + <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="4" /> </test> <!--Ensure coverage and output for Ballgown works --> <test expect_num_outputs="7"> @@ -268,7 +266,7 @@ <output name="transcript_expression" file="./ballgown/t_data.ctab" ftype="tabular" /> <output name="exon_transcript_mapping" file="./ballgown/e2t.ctab" ftype="tabular" /> <output name="intron_transcript_mapping" file="./ballgown/i2t.ctab" ftype="tabular" /> - <output name="output_gtf" file="stringtie_out5.gtf" ftype="gtf" lines_diff="2" /> + <output name="output_gtf" file="stringtie_out5.gtf" ftype="gtf" lines_diff="4" /> <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" /> </test> <!--Ensure output for edgeR works --> @@ -284,24 +282,7 @@ <output name="gene_counts" file="gene_counts_edger.tsv" ftype="tabular" /> <output name="transcript_counts" file="transcript_counts_edger.tsv" ftype="tabular" /> <output name="legend" file="legend.tsv" ftype="tabular" /> - <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="2" /> - <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" /> - </test> - <!--Ensure output for DESeq2 works --> - <test expect_num_outputs="5"> - <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> - <param name="use_guide" value="yes" /> - <param name="special_outputs_select" value="deseq2" /> - <param name="keep_header" value="False" /> - <param name="input_estimation" value="True" /> - <param name="guide_gff_select" value="history" /> - <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> - <param name="coverage_file" value="True" /> - <param name="clustering" value="True" /> - <output name="gene_counts" file="gene_counts_deseq2.tsv" ftype="tabular" /> - <output name="transcript_counts" file="transcript_counts_deseq2.tsv" ftype="tabular" /> - <output name="legend" file="legend.tsv" ftype="tabular" /> - <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="2" /> + <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="4" /> <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" /> </test> <!--Ensure gene abundances output works --> @@ -312,7 +293,7 @@ <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> <param name="fraction" value="0.17" /> <param name="abundance_estimation" value="True" /> - <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="2" /> + <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="4" /> <output name="gene_abundance_estimation" file="stringtie_out7.gtf" ftype="gtf" lines_diff="2" /> </test> <!--Ensure another fraction value works --> @@ -322,7 +303,7 @@ <param name="guide_gff_select" value="history" /> <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> <param name="fraction" value="0.15" /> - <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="2" /> + <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="4" /> </test> <!--Ensure built-in GTFs work --> <test expect_num_outputs="1"> @@ -330,7 +311,7 @@ <param name="use_guide" value="yes" /> <param name="guide_gff_select" value="cached" /> <param name="fraction" value="0.15" /> - <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="2" /> + <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="4" /> </test> </tests> <help><![CDATA[ @@ -339,7 +320,7 @@ **What it does** -StringTie_ is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts. It uses a novel network flow algorithm as well as an optional *de novo* assembly step to assemble and quantitate full-length transcripts representing multiple splice variants for each gene locus. Its input can include not only the alignments of raw reads used by other transcript assemblers, but also alignments of longer sequences that have been assembled from those reads. In order to identify differentially expressed genes between experiments, StringTie's output can be processed by specialized software like Ballgown_, Cuffdiff_ or other programs (DESeq2_, edgeR_, etc.). +StringTie_ is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts. It uses a novel network flow algorithm as well as an optional *de novo* assembly step to assemble and quantitate full-length transcripts representing multiple splice variants for each gene locus. Its input can include not only the alignments of raw reads used by other transcript assemblers, but also alignments of longer sequences that have been assembled from those reads. In order to identify differentially expressed genes between experiments, StringTie's output can be processed by specialized software like Ballgown_, Cuffdiff_ or other programs (DESeq2_, edgeR_, limma_ etc.). ----- @@ -370,7 +351,7 @@ If a reference GTF/GFF3 file is used as a guide, StringTie can also output: * a GTF file containing all **fully-covered reference transcripts** in the provided reference file that are covered end-to-end by reads - * Files (tables) for **Ballgown** and/or **DESeq2/edgeR**, which can use them to estimate differential expression + * Files (tables) for **Ballgown** and/or **DESeq2/edgeR/limma-voom**, which can use them to estimate differential expression **StringTie's primary GTF output** @@ -451,12 +432,12 @@ **Ballgown Input Table Files** -An option to output files for Ballgown can be selected under **Output additional files** above. If selected, StringTie will return Ballgown input table files containing coverage data for the reference transcripts given with the -G option. These tables have these specific names: (1) e2t.ctab, (2) e_data.ctab, (3) i2t.ctab, (4) i_data.ctab, and (5) t_data.ctab. A detailed description of each of these five required inputs to Ballgown can be found at `this link`. With this option StringTie can be used as a direct replacement of the tablemaker program included with the Ballgown distribution. +An option to output files for Ballgown can be selected under **Output files for differential expression?** above. If selected, StringTie will return Ballgown input table files containing coverage data for the reference transcripts given with the -G option. These tables have these specific names: (1) e2t.ctab, (2) e_data.ctab, (3) i2t.ctab, (4) i_data.ctab, and (5) t_data.ctab. A detailed description of each of these five required inputs to Ballgown can be found at `this link`. With this option StringTie can be used as a direct replacement of the tablemaker program included with the Ballgown distribution. -**DESeq2/edgeR Input Table Files** +**DESeq2/edgeR/limma-voom Input Table Files** -DESeq2_ and edgeR_ are two popular Bioconductor_ packages for analyzing differential expression, which take as input a matrix of read counts mapped to particular genomic features (e.g., genes). This read count information can be extracted directly from the files generated by StringTie (run with the -e parameter) by selecting DESeq2/edgeR under **Output additional files** above. This uses the StringTie helper script ``prepDE.py`` to convert the GTF output from StringTie into two tab-delimited (TSV) files, containing the count matrices for genes and transcripts, using the coverage values found in the output of StringTie -e. +DESeq2_, edgeR_ and limma_ are three popular Bioconductor_ packages for analyzing differential expression, which take as input a matrix of read counts mapped to particular genomic features (e.g., genes). This read count information can be extracted directly from the files generated by StringTie (run with the -e parameter) by selecting DESeq2/edgeR/limma-voom under **Output files for differential expression?** above. This uses the StringTie helper script ``prepDE.py`` to convert the GTF output from StringTie into two tab-delimited (TSV) files, containing the count matrices for genes and transcripts, using the coverage values found in the output of StringTie -e. ----- @@ -467,7 +448,7 @@ *Differential expression analysis:* -Together with HISAT and Ballgown (or DESeq2/edgeR), StringTie can be used for estimating differential expression across multiple RNA-Seq samples and generating plots and differential expression tables as described in our `protocol paper`_ and shown in a diagram in the `StringTie manual here`_. +Together with HISAT and Ballgown (or DESeq2/edgeR/limma-voom), StringTie can be used for estimating differential expression across multiple RNA-Seq samples and generating plots and differential expression tables as described in our `protocol paper`_ and shown in a diagram in the `StringTie manual here`_. Our recommended workflow includes the following steps: @@ -477,9 +458,9 @@ 3. Run the separate **StringTie merge** tool in order to generate a non-redundant set of transcripts observed in all the RNA-Seq samples assembled previously. ``StringTie merge`` takes as input a list of all the assembled transcripts files (in GTF format) previously obtained for each sample, as well as a reference annotation file (-G option) if available. - 4. For each RNA-Seq sample, run this StringTie tool selecting to output files for Ballgown (or DESeq2/edgeR), which will generate tables of transcript and gene estimated abundances (count files). The option -e (*Use Reference transcripts only*) is not required but is recommended for this run in order to produce more accurate abundance estimations of the input transcripts. Each StringTie run in this step will take as input the sorted read alignments (BAM file) obtained in step 1 for the corresponding sample and the -G option with the merged transcripts (GTF file) generated by ``stringtie merge`` in step 3. Please note that this is the only case where the -G option is not used with a reference annotation, but with the global, merged set of transcripts as observed across all samples. (This step is the equivalent of the *Tablemaker* step described in the original Ballgown pipeline.) + 4. For each RNA-Seq sample, run this StringTie tool selecting to output files for Ballgown (or DESeq2/edgeR/limma-voom), which will generate tables of transcript and gene estimated abundances (count files). The option -e (*Use Reference transcripts only*) is not required but is recommended for this run in order to produce more accurate abundance estimations of the input transcripts. Each StringTie run in this step will take as input the sorted read alignments (BAM file) obtained in step 1 for the corresponding sample and the -G option with the merged transcripts (GTF file) generated by ``stringtie merge`` in step 3. Please note that this is the only case where the -G option is not used with a reference annotation, but with the global, merged set of transcripts as observed across all samples. (This step is the equivalent of the *Tablemaker* step described in the original Ballgown pipeline.) - 5. Ballgown (or DESeq2/edgeR) can now be used to load the coverage tables generated in the previous step and perform various statistical analyses for differential expression, generate plots etc. + 5. Ballgown (or DESeq2/edgeR/limma-voom) can now be used to load the coverage tables generated in the previous step and perform various statistical analyses for differential expression, generate plots etc. An alternate, faster differential expression analysis workflow can be pursued if there is no interest in novel isoforms (i.e. assembled transcripts present in the samples but missing from the reference annotation), or if only a well known set of transcripts of interest are targeted by the analysis. This simplified protocol has only 3 steps (depicted in the `StringTie manual here`_) as it bypasses the individual assembly of each RNA-Seq sample and the "transcript merge" step. This simplified workflow attempts to directly estimate and analyze the expression of a known set of transcripts as given in the reference annotation file. @@ -488,6 +469,7 @@ .. _Cuffdiff: http://cole-trapnell-lab.github.io/cufflinks/cuffdiff/ .. _DESeq2: https://bioconductor.org/packages/release/bioc/html/DESeq2.html .. _edgeR: https://bioconductor.org/packages/release/bioc/html/edgeR.html +.. _limma: https://bioconductor.org/packages/release/bioc/html/limma.html .. _Bioconductor: https://www.bioconductor.org/ .. _SAM: http://samtools.github.io/hts-specs/SAMv1.pdf .. _HISAT2: http://ccb.jhu.edu/software/hisat2