diff htseq-count.xml @ 10:5d969cb56112

Version 0.3 - paired-end sorting is now built-in (uses Picard tools)
author lparsons
date Fri, 07 Dec 2012 14:35:44 -0500
parents 971e20519fb8
children f320093f1e8e
line wrap: on
line diff
--- a/htseq-count.xml	Fri Oct 26 15:57:08 2012 -0400
+++ b/htseq-count.xml	Fri Dec 07 14:35:44 2012 -0500
@@ -1,10 +1,11 @@
-<tool id="htseq_count" name="htseq-count" version="0.2.1">
+<tool id="htseq_count" name="htseq-count" version="0.3">
     <description> - Count aligned reads in a BAM file that overlap features in a GFF file</description>
     <version_command>htseq-count -h | grep version | sed 's/^\(.*\)*\(version .*\)\./\2/'</version_command>
     <requirements>
         <requirement type="package" version="1.6.2">numpy</requirement>
         <requirement type="package" version="0.5.3p9">htseq</requirement>
         <requirement type="package" version="0.1.18">samtools</requirement>
+        <requirement type="package" version="1.56.0">picard</requirement> 
     </requirements>
     <command>
     ##set up input files
@@ -17,9 +18,16 @@
             #set $reference_fasta_filename = str( $samout_conditional.reference_source.ref_file.fields.path )
         #end if
     #end if
-
-    #if $samfile.extension == "bam":
-        samtools view $samfile | 
+    #if str($singlepaired) == "paired":
+        ln -s $samfile local_input.sam &amp;&amp;
+        java -Xmx2G -jar "\$JAVA_JAR_PATH/SortSam.jar" VALIDATION_STRINGENCY=LENIENT SORT_ORDER=queryname O=prepared_input.sam I=local_input.sam TMP_DIR="${__new_file_path__}" 
+        || echo "Error running Picard SortSam" &gt;&amp;2 &amp;&amp;
+    #else:
+        #if $samfile.extension == "bam":
+            samtools view $samfile | 
+        #else
+            ln -s $samfile prepared_input.sam &amp;&amp;
+        #end if
     #end if
     htseq-count 
     --mode=$mode 
@@ -30,19 +38,26 @@
     #if $samout_conditional.samout:
         --samout=$__new_file_path__/${samoutfile.id}_tmp
     #end if
-    #if $samfile.extension == "bam":
-        - 
-    #else
-        $samfile 
-    #end if
+    #if str($singlepaired) == "paired":
+        prepared_input.sam
+    #else:
+        #if $samfile.extension == "bam":
+            - 
+        #else:
+            prepared_input.sam
+        #end if
+    #end if    
     $gfffile 
     | awk '{if ($1 ~ "no_feature|ambiguous|too_low_aQual|not_aligned|alignment_not_unique") print $0 | "cat 1>&amp;2"; else print $0}' &gt; $counts 2&gt;$othercounts
     #if $samout_conditional.samout:
         &amp;&amp; samtools view -Su -t ${reference_fasta_filename}.fai $__new_file_path__/${samoutfile.id}_tmp | samtools sort -o - sorted > $samoutfile
     #end if</command>
     <inputs>
-        <param format="sam, bam" name="samfile" type="data" label="Aligned SAM/BAM File">
-            <help>Paired-End data MUST be sorted by QUERY NAME, use "NGS: Picard - Paired Read Mate Fixer" to sort by QUERY NAME and output to SAM (not BAM) before using this tool on paired data.</help>
+        <param format="sam, bam" name="samfile" type="data" label="Aligned SAM/BAM File"/>
+        <param name="singlepaired" type="select" label="Is this library mate-paired?">
+            <help>Paired libraries will be sorted by read name prior to counting.</help>
+            <option value="single" selected="true">single-end</option>
+            <option value="paired">paired-end</option>
         </param>
         <param format="gff" name="gfffile" type="data" label="GFF File"/>
         <param name="mode" type="select" label="Mode">
@@ -93,9 +108,9 @@
     </inputs>
 
     <outputs>
-        <data format="tabular" name="counts" label="${tool.name} on ${on_string}"/>
-        <data format="tabular" name="othercounts" label="${tool.name} on ${on_string} (no feature)"/>
-        <data format="bam" name="samoutfile" label="${tool.name} on ${on_string} (BAM)">
+        <data format="tabular" name="counts" metadata_source="samfile" label="${tool.name} on ${on_string}"/>
+        <data format="tabular" name="othercounts" metadata_source="samfile" label="${tool.name} on ${on_string} (no feature)"/>
+        <data format="bam" name="samoutfile" metadata_source="samfile" label="${tool.name} on ${on_string} (BAM)">
             <filter>samout_conditional['samout']</filter>
         </data>
     </outputs>
@@ -107,6 +122,7 @@
         <regex match="Error: Feature (.+) does not contain a '(.+)' attribute" source="both" level="fatal" description="Error parsing the GFF file, at least one feature of the specified 'Feature type' does not have a value for the specified 'ID Attribute'" />
         <regex match="Error occured in line (\d+) of file" source="stderr" level="fatal" description="Unknown error parsing the GFF file" />
         <regex match="Error" source="stderr" level="fatal" description="Unknown error occured" />
+        <regex match="Warning: Read (.+) claims to have an aligned mate which could not be found. \(Is the SAM file properly sorted\?\)" source="stderr" level="warning" description="PAIRED DATA MISSING OR NOT PROPERLY SORTED. Try rerunning and selecting the paired-end option. See stderr output of this dataset for more information." />
     </stdio>
 
     <tests>
@@ -124,6 +140,14 @@
             <output name="counts" file="htseq-test_counts.tsv" />
             <output name="othercounts" file="htseq-test_othercounts.tsv" />
         </test>
+        <test>
+            <param name="samfile" value="htseq-test-paired.bam" />
+            <param name="singlepaired" value="paired" />
+            <param name="gfffile" value="htseq-test.gff" />
+            <param name="samout" value="False" />
+            <output name="counts" file="htseq-test-paired_counts.tsv" />
+            <output name="othercounts" file="htseq-test-paired_othercounts.tsv" />
+        </test>
         <!-- Seems to be an issue setting the $reference_fasta_filename variable during test
         <test>
             <param name="samfile" value="htseq-test.sam" />