Mercurial > repos > iuc > stringtie
comparison stringtie.xml @ 15:dd4df992d93d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/stringtie commit a834a41c94d184df80e45ffa2339723826a075b1
author | iuc |
---|---|
date | Tue, 24 Jul 2018 10:23:37 -0400 |
parents | eafd5dc95228 |
children | eba36e001f45 |
comparison
equal
deleted
inserted
replaced
14:eafd5dc95228 | 15:dd4df992d93d |
---|---|
1 <tool id="stringtie" name="StringTie" version="1.3.3.2"> | 1 <tool id="stringtie" name="StringTie" version="@TOOL_VERSION@"> |
2 <description>transcript assembly and quantification</description> | 2 <description>transcript assembly and quantification</description> |
3 <macros> | 3 <macros> |
4 <import>macros.xml</import> | 4 <import>macros.xml</import> |
5 </macros> | 5 </macros> |
6 <expand macro="requirements" /> | 6 <expand macro="requirements" /> |
93 #end if | 93 #end if |
94 | 94 |
95 ## Replace commas with tabs | 95 ## Replace commas with tabs |
96 && | 96 && |
97 sed -i.bak -e "s/,/\${TAB}/g" -e "s/\${CR}//g" gene_counts.csv transcript_counts.csv | 97 sed -i.bak -e "s/,/\${TAB}/g" -e "s/\${CR}//g" gene_counts.csv transcript_counts.csv |
98 #if $guide.special_outputs.keep_header: | 98 ## Output header |
99 && | 99 && |
100 head -n 1 gene_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$gene_counts' | 100 head -n 1 gene_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$gene_counts' |
101 && | 101 && |
102 head -n 1 transcript_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$transcript_counts' | 102 head -n 1 transcript_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$transcript_counts' |
103 #end if | |
104 ## Sort count files on the first column | 103 ## Sort count files on the first column |
105 && | 104 && |
106 tail -n +2 gene_counts.csv | sort -t"\${TAB}" -k1,1 >> '$gene_counts' | 105 tail -n +2 gene_counts.csv | sort -t"\${TAB}" -k1,1 >> '$gene_counts' |
107 && | 106 && |
108 tail -n +2 transcript_counts.csv | sort -t"\${TAB}" -k1,1 >> '$transcript_counts' | 107 tail -n +2 transcript_counts.csv | sort -t"\${TAB}" -k1,1 >> '$transcript_counts' |
143 </conditional> | 142 </conditional> |
144 <param name="input_estimation" argument="-e" type="boolean" truevalue="-e" falsevalue="" checked="False" label="Use Reference transcripts only?" help="Limit the processing of read alignments to only estimate and output the assembled transcripts matching the reference transcripts given with the -G option. With this option, read bundles with no reference transcripts (novel transcripts) will be entirely skipped, which may provide a considerable speed boost when the given set of reference transcripts is limited to a set of target genes, for example. Default: No"/> | 143 <param name="input_estimation" argument="-e" type="boolean" truevalue="-e" falsevalue="" checked="False" label="Use Reference transcripts only?" help="Limit the processing of read alignments to only estimate and output the assembled transcripts matching the reference transcripts given with the -G option. With this option, read bundles with no reference transcripts (novel transcripts) will be entirely skipped, which may provide a considerable speed boost when the given set of reference transcripts is limited to a set of target genes, for example. Default: No"/> |
145 <conditional name="special_outputs"> | 144 <conditional name="special_outputs"> |
146 <param name="special_outputs_select" type="select" label="Output files for differential expression?" help="Select to output additional files that can be used with Ballgown or DESeq2/edgeR. See Help section below for more information"> | 145 <param name="special_outputs_select" type="select" label="Output files for differential expression?" help="Select to output additional files that can be used with Ballgown or DESeq2/edgeR. See Help section below for more information"> |
147 <option value="ballgown">Ballgown</option> | 146 <option value="ballgown">Ballgown</option> |
148 <option value="deseq2">DESeq2/edgeR</option> | 147 <option value="deseq2">DESeq2/edgeR/limma-voom</option> |
149 <option value="no" selected="True">No additional output</option> | 148 <option value="no" selected="True">No additional output</option> |
150 </param> | 149 </param> |
151 <when value="ballgown" /> | 150 <when value="ballgown" /> |
152 <when value="deseq2"> | 151 <when value="deseq2"> |
153 <param name="read_length" argument="--length" type="integer" min="0" value="75" label="Specify the average read length" help="Default: 75" /> | 152 <param name="read_length" argument="--length" type="integer" min="0" value="75" label="Specify the average read length" help="Default: 75" /> |
160 <param argument="--key" type="text" label="Prefix for clustering" help="If clustering, what prefix to use for geneIDs assigned by this script. Only letters and numbers will be retained in this field. Default: prepG"> | 159 <param argument="--key" type="text" label="Prefix for clustering" help="If clustering, what prefix to use for geneIDs assigned by this script. Only letters and numbers will be retained in this field. Default: prepG"> |
161 <sanitizer> | 160 <sanitizer> |
162 <valid initial="string.letters,string.digits"></valid> | 161 <valid initial="string.letters,string.digits"></valid> |
163 </sanitizer> | 162 </sanitizer> |
164 </param> | 163 </param> |
165 <param name="keep_header" type="boolean" checked="true" label="Output header line?" help="Keep the header line for edgeR, remove it for DESeq2" /> | |
166 </when> | 164 </when> |
167 <when value="no" /> | 165 <when value="no" /> |
168 </conditional> | 166 </conditional> |
169 <param name="coverage_file" argument="-C" type="boolean" truevalue="-C" falsevalue="" checked="False" label="Output coverage file?" help="If StringTie is run with this option (requires -G), it returns a file with all the transcripts in the reference annotation that are fully covered, end to end, by reads. The output format is a GTF file as described below. Each line of the GTF is corresponds to a gene or transcript in the reference annotation. Default: No"/> | 167 <param name="coverage_file" argument="-C" type="boolean" truevalue="-C" falsevalue="" checked="False" label="Output coverage file?" help="If StringTie is run with this option (requires -G), it returns a file with all the transcripts in the reference annotation that are fully covered, end to end, by reads. The output format is a GTF file as described below. Each line of the GTF is corresponds to a gene or transcript in the reference annotation. Default: No"/> |
170 </when> | 168 </when> |
228 </outputs> | 226 </outputs> |
229 <tests> | 227 <tests> |
230 <!--Ensure default GTF output works --> | 228 <!--Ensure default GTF output works --> |
231 <test expect_num_outputs="1"> | 229 <test expect_num_outputs="1"> |
232 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> | 230 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> |
233 <output name="output_gtf" file="stringtie_out1.gtf" ftype="gtf" lines_diff="2" /> | 231 <output name="output_gtf" file="stringtie_out1.gtf" ftype="gtf" lines_diff="4" /> |
234 </test> | 232 </test> |
235 <!--Ensure fraction option works --> | 233 <!--Ensure fraction option works --> |
236 <test expect_num_outputs="1"> | 234 <test expect_num_outputs="1"> |
237 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> | 235 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> |
238 <param name="fraction" value="0.17" /> | 236 <param name="fraction" value="0.17" /> |
239 <output name="output_gtf" file="stringtie_out2.gtf" ftype="gtf" lines_diff="2" /> | 237 <output name="output_gtf" file="stringtie_out2.gtf" ftype="gtf" lines_diff="4" /> |
240 </test> | 238 </test> |
241 <!--Ensure guide option works --> | 239 <!--Ensure guide option works --> |
242 <test expect_num_outputs="1"> | 240 <test expect_num_outputs="1"> |
243 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> | 241 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> |
244 <param name="use_guide" value="yes" /> | 242 <param name="use_guide" value="yes" /> |
245 <param name="guide_gff_select" value="history" /> | 243 <param name="guide_gff_select" value="history" /> |
246 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> | 244 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> |
247 <output name="output_gtf" file="stringtie_out3.gtf" ftype="gtf" lines_diff="2" /> | 245 <output name="output_gtf" file="stringtie_out3.gtf" ftype="gtf" lines_diff="4" /> |
248 </test> | 246 </test> |
249 <!--Ensure guide with fraction works --> | 247 <!--Ensure guide with fraction works --> |
250 <test expect_num_outputs="1"> | 248 <test expect_num_outputs="1"> |
251 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> | 249 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> |
252 <param name="use_guide" value="yes" /> | 250 <param name="use_guide" value="yes" /> |
253 <param name="guide_gff_select" value="history" /> | 251 <param name="guide_gff_select" value="history" /> |
254 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> | 252 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> |
255 <param name="fraction" value="0.17" /> | 253 <param name="fraction" value="0.17" /> |
256 <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="2" /> | 254 <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="4" /> |
257 </test> | 255 </test> |
258 <!--Ensure coverage and output for Ballgown works --> | 256 <!--Ensure coverage and output for Ballgown works --> |
259 <test expect_num_outputs="7"> | 257 <test expect_num_outputs="7"> |
260 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> | 258 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> |
261 <param name="use_guide" value="yes" /> | 259 <param name="use_guide" value="yes" /> |
266 <output name="exon_expression" file="./ballgown/e_data.ctab" ftype="tabular" /> | 264 <output name="exon_expression" file="./ballgown/e_data.ctab" ftype="tabular" /> |
267 <output name="intron_expression" file="./ballgown/i_data.ctab" ftype="tabular" /> | 265 <output name="intron_expression" file="./ballgown/i_data.ctab" ftype="tabular" /> |
268 <output name="transcript_expression" file="./ballgown/t_data.ctab" ftype="tabular" /> | 266 <output name="transcript_expression" file="./ballgown/t_data.ctab" ftype="tabular" /> |
269 <output name="exon_transcript_mapping" file="./ballgown/e2t.ctab" ftype="tabular" /> | 267 <output name="exon_transcript_mapping" file="./ballgown/e2t.ctab" ftype="tabular" /> |
270 <output name="intron_transcript_mapping" file="./ballgown/i2t.ctab" ftype="tabular" /> | 268 <output name="intron_transcript_mapping" file="./ballgown/i2t.ctab" ftype="tabular" /> |
271 <output name="output_gtf" file="stringtie_out5.gtf" ftype="gtf" lines_diff="2" /> | 269 <output name="output_gtf" file="stringtie_out5.gtf" ftype="gtf" lines_diff="4" /> |
272 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" /> | 270 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" /> |
273 </test> | 271 </test> |
274 <!--Ensure output for edgeR works --> | 272 <!--Ensure output for edgeR works --> |
275 <test expect_num_outputs="5"> | 273 <test expect_num_outputs="5"> |
276 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> | 274 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> |
282 <param name="coverage_file" value="True" /> | 280 <param name="coverage_file" value="True" /> |
283 <param name="clustering" value="True" /> | 281 <param name="clustering" value="True" /> |
284 <output name="gene_counts" file="gene_counts_edger.tsv" ftype="tabular" /> | 282 <output name="gene_counts" file="gene_counts_edger.tsv" ftype="tabular" /> |
285 <output name="transcript_counts" file="transcript_counts_edger.tsv" ftype="tabular" /> | 283 <output name="transcript_counts" file="transcript_counts_edger.tsv" ftype="tabular" /> |
286 <output name="legend" file="legend.tsv" ftype="tabular" /> | 284 <output name="legend" file="legend.tsv" ftype="tabular" /> |
287 <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="2" /> | 285 <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="4" /> |
288 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" /> | |
289 </test> | |
290 <!--Ensure output for DESeq2 works --> | |
291 <test expect_num_outputs="5"> | |
292 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> | |
293 <param name="use_guide" value="yes" /> | |
294 <param name="special_outputs_select" value="deseq2" /> | |
295 <param name="keep_header" value="False" /> | |
296 <param name="input_estimation" value="True" /> | |
297 <param name="guide_gff_select" value="history" /> | |
298 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> | |
299 <param name="coverage_file" value="True" /> | |
300 <param name="clustering" value="True" /> | |
301 <output name="gene_counts" file="gene_counts_deseq2.tsv" ftype="tabular" /> | |
302 <output name="transcript_counts" file="transcript_counts_deseq2.tsv" ftype="tabular" /> | |
303 <output name="legend" file="legend.tsv" ftype="tabular" /> | |
304 <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="2" /> | |
305 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" /> | 286 <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" /> |
306 </test> | 287 </test> |
307 <!--Ensure gene abundances output works --> | 288 <!--Ensure gene abundances output works --> |
308 <test expect_num_outputs="2"> | 289 <test expect_num_outputs="2"> |
309 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> | 290 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> |
310 <param name="use_guide" value="yes" /> | 291 <param name="use_guide" value="yes" /> |
311 <param name="guide_gff_select" value="history" /> | 292 <param name="guide_gff_select" value="history" /> |
312 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> | 293 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> |
313 <param name="fraction" value="0.17" /> | 294 <param name="fraction" value="0.17" /> |
314 <param name="abundance_estimation" value="True" /> | 295 <param name="abundance_estimation" value="True" /> |
315 <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="2" /> | 296 <output name="output_gtf" file="stringtie_out4.gtf" ftype="gtf" lines_diff="4" /> |
316 <output name="gene_abundance_estimation" file="stringtie_out7.gtf" ftype="gtf" lines_diff="2" /> | 297 <output name="gene_abundance_estimation" file="stringtie_out7.gtf" ftype="gtf" lines_diff="2" /> |
317 </test> | 298 </test> |
318 <!--Ensure another fraction value works --> | 299 <!--Ensure another fraction value works --> |
319 <test expect_num_outputs="1"> | 300 <test expect_num_outputs="1"> |
320 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> | 301 <param name="input_bam" ftype="bam" value="stringtie_in1.bam" /> |
321 <param name="use_guide" value="yes" /> | 302 <param name="use_guide" value="yes" /> |
322 <param name="guide_gff_select" value="history" /> | 303 <param name="guide_gff_select" value="history" /> |
323 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> | 304 <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" /> |
324 <param name="fraction" value="0.15" /> | 305 <param name="fraction" value="0.15" /> |
325 <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="2" /> | 306 <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="4" /> |
326 </test> | 307 </test> |
327 <!--Ensure built-in GTFs work --> | 308 <!--Ensure built-in GTFs work --> |
328 <test expect_num_outputs="1"> | 309 <test expect_num_outputs="1"> |
329 <param name="input_bam" ftype="bam" dbkey="hg38" value="stringtie_in1.bam" /> | 310 <param name="input_bam" ftype="bam" dbkey="hg38" value="stringtie_in1.bam" /> |
330 <param name="use_guide" value="yes" /> | 311 <param name="use_guide" value="yes" /> |
331 <param name="guide_gff_select" value="cached" /> | 312 <param name="guide_gff_select" value="cached" /> |
332 <param name="fraction" value="0.15" /> | 313 <param name="fraction" value="0.15" /> |
333 <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="2" /> | 314 <output name="output_gtf" file="stringtie_out8.gtf" ftype="gtf" lines_diff="4" /> |
334 </test> | 315 </test> |
335 </tests> | 316 </tests> |
336 <help><![CDATA[ | 317 <help><![CDATA[ |
337 | 318 |
338 .. class:: infomark | 319 .. class:: infomark |
339 | 320 |
340 **What it does** | 321 **What it does** |
341 | 322 |
342 StringTie_ is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts. It uses a novel network flow algorithm as well as an optional *de novo* assembly step to assemble and quantitate full-length transcripts representing multiple splice variants for each gene locus. Its input can include not only the alignments of raw reads used by other transcript assemblers, but also alignments of longer sequences that have been assembled from those reads. In order to identify differentially expressed genes between experiments, StringTie's output can be processed by specialized software like Ballgown_, Cuffdiff_ or other programs (DESeq2_, edgeR_, etc.). | 323 StringTie_ is a fast and highly efficient assembler of RNA-Seq alignments into potential transcripts. It uses a novel network flow algorithm as well as an optional *de novo* assembly step to assemble and quantitate full-length transcripts representing multiple splice variants for each gene locus. Its input can include not only the alignments of raw reads used by other transcript assemblers, but also alignments of longer sequences that have been assembled from those reads. In order to identify differentially expressed genes between experiments, StringTie's output can be processed by specialized software like Ballgown_, Cuffdiff_ or other programs (DESeq2_, edgeR_, limma_, etc.). |
343 | 324 |
344 ----- | 325 ----- |
345 | 326 |
346 **Inputs** | 327 **Inputs** |
347 | 328 |
368 * a TSV (tab-delimited) file of **Gene abundances** | 349 * a TSV (tab-delimited) file of **Gene abundances** |
369 | 350 |
370 If a reference GTF/GFF3 file is used as a guide, StringTie can also output: | 351 If a reference GTF/GFF3 file is used as a guide, StringTie can also output: |
371 | 352 |
372 * a GTF file containing all **fully-covered reference transcripts** in the provided reference file that are covered end-to-end by reads | 353 * a GTF file containing all **fully-covered reference transcripts** in the provided reference file that are covered end-to-end by reads |
373 * Files (tables) for **Ballgown** and/or **DESeq2/edgeR**, which can use them to estimate differential expression | 354 * Files (tables) for **Ballgown** and/or **DESeq2/edgeR/limma-voom**, which can use them to estimate differential expression |
374 | 355 |
375 | 356 |
376 **StringTie's primary GTF output** | 357 **StringTie's primary GTF output** |
377 | 358 |
378 The primary output of StringTie is a Gene Transfer Format (GTF) file that contains details of the transcripts that StringTie assembles from RNA-Seq data. GTF is an extension of GFF (Gene Finding Format, also called General Feature Format), and is very similar to GFF2 and GFF3. The field definitions for the 9 columns of GTF output can be found at the `Ensembl site here`_. The following is an example of a transcript assembled by StringTie as shown in a GTF file: | 359 The primary output of StringTie is a Gene Transfer Format (GTF) file that contains details of the transcripts that StringTie assembles from RNA-Seq data. GTF is an extension of GFF (Gene Finding Format, also called General Feature Format), and is very similar to GFF2 and GFF3. The field definitions for the 9 columns of GTF output can be found at the `Ensembl site here`_. The following is an example of a transcript assembled by StringTie as shown in a GTF file: |
449 | 430 |
450 If StringTie is run with the use reference guide option (-G), it will also return a file with all the transcripts in the reference annotation that are fully covered, end to end, by reads. The output format is a GTF file as described above. Each line of the GTF corresponds to a gene or transcript in the reference annotation. | 431 If StringTie is run with the use reference guide option (-G), it will also return a file with all the transcripts in the reference annotation that are fully covered, end to end, by reads. The output format is a GTF file as described above. Each line of the GTF corresponds to a gene or transcript in the reference annotation. |
451 | 432 |
452 **Ballgown Input Table Files** | 433 **Ballgown Input Table Files** |
453 | 434 |
454 An option to output files for Ballgown can be selected under **Output additional files** above. If selected, StringTie will return Ballgown input table files containing coverage data for the reference transcripts given with the -G option. These tables have these specific names: (1) e2t.ctab, (2) e_data.ctab, (3) i2t.ctab, (4) i_data.ctab, and (5) t_data.ctab. A detailed description of each of these five required inputs to Ballgown can be found at `this link`_. With this option StringTie can be used as a direct replacement of the tablemaker program included with the Ballgown distribution. | 435 An option to output files for Ballgown can be selected under **Output files for differential expression?** above. If selected, StringTie will return Ballgown input table files containing coverage data for the reference transcripts given with the -G option. These tables have these specific names: (1) e2t.ctab, (2) e_data.ctab, (3) i2t.ctab, (4) i_data.ctab, and (5) t_data.ctab. A detailed description of each of these five required inputs to Ballgown can be found at `this link`_. With this option StringTie can be used as a direct replacement of the tablemaker program included with the Ballgown distribution. |
455 | 436 |
456 | 437 |
457 **DESeq2/edgeR Input Table Files** | 438 **DESeq2/edgeR/limma-voom Input Table Files** |
458 | 439 |
459 DESeq2_ and edgeR_ are two popular Bioconductor_ packages for analyzing differential expression, which take as input a matrix of read counts mapped to particular genomic features (e.g., genes). This read count information can be extracted directly from the files generated by StringTie (run with the -e parameter) by selecting DESeq2/edgeR under **Output additional files** above. This uses the StringTie helper script ``prepDE.py`` to convert the GTF output from StringTie into two tab-delimited (TSV) files, containing the count matrices for genes and transcripts, using the coverage values found in the output of StringTie -e. | 440 DESeq2_, edgeR_ and limma_ are three popular Bioconductor_ packages for analyzing differential expression, which take as input a matrix of read counts mapped to particular genomic features (e.g., genes). This read count information can be extracted directly from the files generated by StringTie (run with the -e parameter) by selecting DESeq2/edgeR/limma-voom under **Output files for differential expression?** above. This uses the StringTie helper script ``prepDE.py`` to convert the GTF output from StringTie into two tab-delimited (TSV) files, containing the count matrices for genes and transcripts, using the coverage values found in the output of StringTie -e. |
460 | 441 |
461 ----- | 442 ----- |
462 | 443 |
463 **More Information** | 444 **More Information** |
464 | 445 |
465 *Evaluating transcript assemblies:* | 446 *Evaluating transcript assemblies:* |
466 A simple way of getting more information about the transcripts assembled by StringTie (summary of gene and transcript counts, novel vs. known etc.), or even performing basic tracking of assembled isoforms across multiple RNA-Seq experiments, is to use the **gffcompare** program. Basic usage information for this program can be found on the `GFF utilities page`_. | 447 A simple way of getting more information about the transcripts assembled by StringTie (summary of gene and transcript counts, novel vs. known etc.), or even performing basic tracking of assembled isoforms across multiple RNA-Seq experiments, is to use the **gffcompare** program. Basic usage information for this program can be found on the `GFF utilities page`_. |
467 | 448 |
468 *Differential expression analysis:* | 449 *Differential expression analysis:* |
469 | 450 |
470 Together with HISAT and Ballgown (or DESeq2/edgeR), StringTie can be used for estimating differential expression across multiple RNA-Seq samples and generating plots and differential expression tables as described in our `protocol paper`_ and shown in a diagram in the `StringTie manual here`_. | 451 Together with HISAT and Ballgown (or DESeq2/edgeR/limma-voom), StringTie can be used for estimating differential expression across multiple RNA-Seq samples and generating plots and differential expression tables as described in our `protocol paper`_ and shown in a diagram in the `StringTie manual here`_. |
471 | 452 |
472 Our recommended workflow includes the following steps: | 453 Our recommended workflow includes the following steps: |
473 | 454 |
474 1. For each RNA-Seq sample, map the reads to the genome with HISAT2 using the --dta option. It is highly recommended to use the reference annotation information when mapping the reads, which can be either embedded in the genome index (built with the --ss and --exon options, see HISAT2 manual), or provided separately at run time (using the --known-splicesite-infile option of HISAT2). The SAM output of each HISAT2 run must be sorted and converted to BAM using samtools as explained above. | 455 1. For each RNA-Seq sample, map the reads to the genome with HISAT2 using the --dta option. It is highly recommended to use the reference annotation information when mapping the reads, which can be either embedded in the genome index (built with the --ss and --exon options, see HISAT2 manual), or provided separately at run time (using the --known-splicesite-infile option of HISAT2). The SAM output of each HISAT2 run must be sorted and converted to BAM using samtools as explained above. |
475 | 456 |
476 2. For each RNA-Seq sample, use this StringTie tool to assemble the read alignments obtained in the previous step; it is recommended to run StringTie with the -G option if the reference annotation is available. | 457 2. For each RNA-Seq sample, use this StringTie tool to assemble the read alignments obtained in the previous step; it is recommended to run StringTie with the -G option if the reference annotation is available. |
477 | 458 |
478 3. Run the separate **StringTie merge** tool in order to generate a non-redundant set of transcripts observed in all the RNA-Seq samples assembled previously. ``StringTie merge`` takes as input a list of all the assembled transcripts files (in GTF format) previously obtained for each sample, as well as a reference annotation file (-G option) if available. | 459 3. Run the separate **StringTie merge** tool in order to generate a non-redundant set of transcripts observed in all the RNA-Seq samples assembled previously. ``StringTie merge`` takes as input a list of all the assembled transcripts files (in GTF format) previously obtained for each sample, as well as a reference annotation file (-G option) if available. |
479 | 460 |
480 4. For each RNA-Seq sample, run this StringTie tool selecting to output files for Ballgown (or DESeq2/edgeR), which will generate tables of transcript and gene estimated abundances (count files). The option -e (*Use Reference transcripts only*) is not required but is recommended for this run in order to produce more accurate abundance estimations of the input transcripts. Each StringTie run in this step will take as input the sorted read alignments (BAM file) obtained in step 1 for the corresponding sample and the -G option with the merged transcripts (GTF file) generated by ``stringtie merge`` in step 3. Please note that this is the only case where the -G option is not used with a reference annotation, but with the global, merged set of transcripts as observed across all samples. (This step is the equivalent of the *Tablemaker* step described in the original Ballgown pipeline.) | 461 4. For each RNA-Seq sample, run this StringTie tool selecting to output files for Ballgown (or DESeq2/edgeR/limma-voom), which will generate tables of transcript and gene estimated abundances (count files). The option -e (*Use Reference transcripts only*) is not required but is recommended for this run in order to produce more accurate abundance estimations of the input transcripts. Each StringTie run in this step will take as input the sorted read alignments (BAM file) obtained in step 1 for the corresponding sample and the -G option with the merged transcripts (GTF file) generated by ``stringtie merge`` in step 3. Please note that this is the only case where the -G option is not used with a reference annotation, but with the global, merged set of transcripts as observed across all samples. (This step is the equivalent of the *Tablemaker* step described in the original Ballgown pipeline.) |
481 | 462 |
482 5. Ballgown (or DESeq2/edgeR) can now be used to load the coverage tables generated in the previous step and perform various statistical analyses for differential expression, generate plots etc. | 463 5. Ballgown (or DESeq2/edgeR/limma-voom) can now be used to load the coverage tables generated in the previous step and perform various statistical analyses for differential expression, generate plots etc. |
483 | 464 |
484 An alternate, faster differential expression analysis workflow can be pursued if there is no interest in novel isoforms (i.e. assembled transcripts present in the samples but missing from the reference annotation), or if only a well known set of transcripts of interest are targeted by the analysis. This simplified protocol has only 3 steps (depicted in the `StringTie manual here`_) as it bypasses the individual assembly of each RNA-Seq sample and the "transcript merge" step. This simplified workflow attempts to directly estimate and analyze the expression of a known set of transcripts as given in the reference annotation file. | 465 An alternate, faster differential expression analysis workflow can be pursued if there is no interest in novel isoforms (i.e. assembled transcripts present in the samples but missing from the reference annotation), or if only a well known set of transcripts of interest are targeted by the analysis. This simplified protocol has only 3 steps (depicted in the `StringTie manual here`_) as it bypasses the individual assembly of each RNA-Seq sample and the "transcript merge" step. This simplified workflow attempts to directly estimate and analyze the expression of a known set of transcripts as given in the reference annotation file. |
485 | 466 |
486 .. _StringTie: http://ccb.jhu.edu/software/stringtie/ | 467 .. _StringTie: http://ccb.jhu.edu/software/stringtie/ |
487 .. _Ballgown: https://www.biorxiv.org/content/early/2014/09/05/003665 | 468 .. _Ballgown: https://www.biorxiv.org/content/early/2014/09/05/003665 |
488 .. _Cuffdiff: http://cole-trapnell-lab.github.io/cufflinks/cuffdiff/ | 469 .. _Cuffdiff: http://cole-trapnell-lab.github.io/cufflinks/cuffdiff/ |
489 .. _DESeq2: https://bioconductor.org/packages/release/bioc/html/DESeq2.html | 470 .. _DESeq2: https://bioconductor.org/packages/release/bioc/html/DESeq2.html |
490 .. _edgeR: https://bioconductor.org/packages/release/bioc/html/edgeR.html | 471 .. _edgeR: https://bioconductor.org/packages/release/bioc/html/edgeR.html |
472 .. _limma: https://bioconductor.org/packages/release/bioc/html/limma.html | |
491 .. _Bioconductor: https://www.bioconductor.org/ | 473 .. _Bioconductor: https://www.bioconductor.org/ |
492 .. _SAM: http://samtools.github.io/hts-specs/SAMv1.pdf | 474 .. _SAM: http://samtools.github.io/hts-specs/SAMv1.pdf |
493 .. _HISAT2: http://ccb.jhu.edu/software/hisat2 | 475 .. _HISAT2: http://ccb.jhu.edu/software/hisat2 |
494 .. _`GTF/GFF3`: https://ccb.jhu.edu/software/stringtie/gff.shtml | 476 .. _`GTF/GFF3`: https://ccb.jhu.edu/software/stringtie/gff.shtml |
495 .. _`this link`: https://github.com/alyssafrazee/ballgown#ballgown-readable-expression-output | 477 .. _`this link`: https://github.com/alyssafrazee/ballgown#ballgown-readable-expression-output |