Mercurial > repos > iuc > stringtie

--- a/stringtie.xml	Thu Nov 09 11:17:32 2017 -0500
+++ b/stringtie.xml	Thu Apr 12 17:30:07 2018 -0400
@@ -1,4 +1,4 @@
-<tool id="stringtie" name="StringTie" version="1.3.3.1">
+<tool id="stringtie" name="StringTie" version="1.3.3.2">
     <description>transcript assembly and quantification</description>
     <macros>
         <import>macros.xml</import>
@@ -7,6 +7,7 @@
     <expand macro="stdio" />
     <expand macro="version_command" />
     <command><![CDATA[
+#import re
 mkdir -p ./special_de_output/sample1/ &&

 ## Get Guide GTF/GFF if selected
@@ -62,40 +63,49 @@

 #if str($guide.use_guide) == 'yes':
     #if $guide.special_outputs.special_outputs_select == 'deseq2':
+        #set escaped_element_identifier = re.sub('[^\w\-]', '_', str($input_bam.element_identifier))
         &&
         ln -s '$output_gtf' ./special_de_output/sample1/output.gtf
         &&
+        TAB=\$(printf '\t')
+        &&
+        CR=\$(printf '\r')
+        &&
         prepDE.py
-            -i ./special_de_output/
-            -g '$gene_counts'
-            -t '$transcript_counts'
-            -l $guide.special_outputs.read_length
-            #if $guide.special_outputs.string:
-                -s '$guide.special_outputs.string'
+        -i ./special_de_output/
+        -g gene_counts.csv
+        -t transcript_counts.csv
+        -l $guide.special_outputs.read_length
+        #if $guide.special_outputs.string:
+            -s '$guide.special_outputs.string'
+        #end if
+        #if $guide.special_outputs.clustering:
+            -c
+            #if $guide.special_outputs.key:
+                -k '$guide.special_outputs.key'
             #end if
-            #if $guide.special_outputs.clustering:
-                -c
-                #if $guide.special_outputs.key:
-                    -k '$guide.special_outputs.key'
-                #end if
-                --legend '$legend'
-                > /dev/null
-                &&
-                sed -i.bak 's/,/\t/g' '$legend'
-                &&
-                sed -i.bak 's/\r//g' '$legend'
-            #end if
+            --legend '$legend'
+            > /dev/null
+            &&
+            sed -i.bak -e "s/,/\${TAB}/g" -e "s/\${CR}//g" '$legend'
+        #else
+            > /dev/null
+        #end if

-        > /dev/null
-
-        &&
-        sed -i.bak 's/,/\t/g' '$transcript_counts'
+        ## Replace commas with tabs and strip carriage returns
         &&
-        sed -i.bak 's/\r//g' '$transcript_counts'
+        sed -i.bak -e "s/,/\${TAB}/g" -e "s/\${CR}//g" gene_counts.csv transcript_counts.csv
+        #if $guide.special_outputs.keep_header:
+            &&
+            head -n 1 gene_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$gene_counts'
+            &&
+            head -n 1 transcript_counts.csv | sed -e 's/sample1/$escaped_element_identifier/' > '$transcript_counts'
+        #end if
+        ## Sort the data rows (header line skipped) of each count file, keyed from the first column
         &&
-        sed -i.bak 's/,/\t/g' '$gene_counts'
+        tail -n +2 gene_counts.csv | sort -t"\${TAB}" -k1 >> '$gene_counts'
         &&
-        sed -i.bak 's/\r//g' '$gene_counts'
+        tail -n +2 transcript_counts.csv | sort -t"\${TAB}" -k1 >> '$transcript_counts'
     #end if
 #end if
     ]]></command>
@@ -141,7 +151,7 @@
                     <when value="ballgown" />
                     <when value="deseq2">
                         <param name="read_length" argument="--length" type="integer" min="0" value="75" label="Specify the average read length" help="Default: 75" />
-                        <param name="clustering" argument="--cluster" type="boolean" truevalue="--cluster" falsevalue="" checked="False" label="Cluster overlapping genes" help="Choose whether to cluster genes with different gene IDs that overlap. Transcripts containing the geneID prefix will be ignored. Default: No" />
+                        <param name="clustering" argument="--cluster" type="boolean" truevalue="--cluster" falsevalue="" checked="false" label="Cluster overlapping genes" help="Choose whether to cluster genes with different gene IDs that overlap. Transcripts containing the geneID prefix will be ignored. Default: No" />
                         <param argument="--string" type="text" label="Prefix used for transcripts" help="If a different prefix was used for geneIDs assigned by StringTie than the default, specify it here. Only letters and numbers will be retained in this field. Default: MSTRG" >
                             <sanitizer>
                                 <valid initial="string.letters,string.digits"></valid>
@@ -152,10 +162,11 @@
                                 <valid initial="string.letters,string.digits"></valid>
                             </sanitizer>
                         </param>
+                        <param name="keep_header" type="boolean" checked="true" label="Output header line?" help="Keep the header line for edgeR, remove it for DESeq2" />
                     </when>
                     <when value="no" />
                 </conditional>
-                 <param name="coverage_file" argument="-C" type="boolean" truevalue="-C" falsevalue="" checked="False" label="Output coverage file?" help="If StringTie is run with this option (requires -G), it returns a file with all the transcripts in the reference annotation that are fully covered, end to end, by reads. The output format is a GTF file as described below. Each line of the GTF is corresponds to a gene or transcript in the reference annotation. Default: No"/>
+                <param name="coverage_file" argument="-C" type="boolean" truevalue="-C" falsevalue="" checked="False" label="Output coverage file?" help="If StringTie is run with this option (requires -G), it returns a file with all the transcripts in the reference annotation that are fully covered, end to end, by reads. The output format is a GTF file as described below. Each line of the GTF corresponds to a gene or transcript in the reference annotation. Default: No"/>
             </when>
         </conditional>
         <section name="adv" title="Advanced Options">
@@ -260,7 +271,7 @@
             <output name="output_gtf" file="stringtie_out5.gtf" ftype="gtf" lines_diff="2" />
             <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" />
         </test>
-        <!--Ensure output for DESeq2/edgeR works -->
+        <!--Ensure output for edgeR works -->
         <test expect_num_outputs="5">
             <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
             <param name="use_guide" value="yes" />
@@ -270,9 +281,26 @@
             <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
             <param name="coverage_file" value="True" />
             <param name="clustering" value="True" />
-            <output name="gene_counts" file="./deseq2/gene_counts.tsv" ftype="tabular" />
-            <output name="transcript_counts" file="./deseq2/transcript_counts.tsv" ftype="tabular" />
-            <output name="legend" file="./deseq2/legend.tsv" ftype="tabular" />
+            <output name="gene_counts" file="gene_counts_edger.tsv" ftype="tabular" />
+            <output name="transcript_counts" file="transcript_counts_edger.tsv" ftype="tabular" />
+            <output name="legend" file="legend.tsv" ftype="tabular" />
+            <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="2" />
+            <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" />
+        </test>
+        <!--Ensure output for DESeq2 works -->
+        <test expect_num_outputs="5">
+            <param name="input_bam" ftype="bam" value="stringtie_in1.bam" />
+            <param name="use_guide" value="yes" />
+            <param name="special_outputs_select" value="deseq2" />
+            <param name="keep_header" value="False" />
+            <param name="input_estimation" value="True" />
+            <param name="guide_gff_select" value="history" />
+            <param name="ref_hist" ftype="gtf" value="stringtie_in.gtf" />
+            <param name="coverage_file" value="True" />
+            <param name="clustering" value="True" />
+            <output name="gene_counts" file="gene_counts_deseq2.tsv" ftype="tabular" />
+            <output name="transcript_counts" file="transcript_counts_deseq2.tsv" ftype="tabular" />
+            <output name="legend" file="legend.tsv" ftype="tabular" />
             <output name="output_gtf" file="stringtie_out6.gtf" ftype="gtf" lines_diff="2" />
             <output name="coverage" file="stringtie_out_coverage.gtf" ftype="gtf" />
         </test>
@@ -474,4 +502,4 @@

     ]]></help>
     <expand macro="citations" />
-</tool>
\ No newline at end of file
+</tool>
--- a/test-data/deseq2/gene_counts.tsv	Thu Nov 09 11:17:32 2017 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-gene_id	sample1
-CUFF.1	182
--- a/test-data/deseq2/transcript_counts.tsv	Thu Nov 09 11:17:32 2017 -0500
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,2 +0,0 @@
-transcript_id	sample1
-CUFF.1.1	182
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gene_counts_deseq2.tsv	Thu Apr 12 17:30:07 2018 -0400
@@ -0,0 +1,1 @@
+CUFF.1	182
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/gene_counts_edger.tsv	Thu Apr 12 17:30:07 2018 -0400
@@ -0,0 +1,2 @@
+gene_id	stringtie_in1_bam
+CUFF.1	182
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/transcript_counts_deseq2.tsv	Thu Apr 12 17:30:07 2018 -0400
@@ -0,0 +1,1 @@
+CUFF.1.1	182
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/transcript_counts_edger.tsv	Thu Apr 12 17:30:07 2018 -0400
@@ -0,0 +1,2 @@
+transcript_id	stringtie_in1_bam
+CUFF.1.1	182