changeset 12:9bd1568619cd draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tool_collections/samtools/sam_to_bam commit c5ded4208dd70e88596ddc725795a2401773f02d"
author iuc
date Sat, 27 Nov 2021 12:31:54 +0000
parents 1e69848b596f
children 0e60e2dd20af
files macros.xml sam_to_bam.xml test-data/chr_m.bgzipped_fasta.gz test-data/chr_m.fasta.gz
diffstat 4 files changed, 62 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/macros.xml	Tue Sep 28 16:11:24 2021 +0000
+++ b/macros.xml	Sat Nov 27 12:31:54 2021 +0000
@@ -50,23 +50,56 @@
         #end for
     ]]></token>
     <token name="@PREPARE_FASTA_IDX@"><![CDATA[
-        ##checks for reference data ($addref_cond.addref_select=="history" or =="cached")
-        ##and sets the -t/-T parameters accordingly:
-        ##- in case of history a symbolic link is used because samtools (view) will generate
-        ##  the index which might not be possible in the directory containing the fasta file
-        ##- in case of cached the absolute path is used which allows to read the cram file
-        ##  without specifying the reference
+        ## Make the user-selected reference genome, if any, accessible through
+        ## a shell variable $reffa, index the reference if necessary, and make
+        ## the fai-index file available through a shell variable $reffai.
+
+        ## For a cached genome, the code simply sets the shell variables to point
+        ## to the genome file and its precalculated index.
+        ## For a genome from the user's history, if that genome is a plain
+        ## fasta file, the code creates a symlink in the pwd, creates the fai
+        ## index file next to it, then sets the shell variables to point to the
+        ## symlink and its index.
+        ## For a fasta.gz dataset from the user's history, the code tries the
+        ## same, but this will only succeed if the file was compressed with
+        ## bgzip. For a regular gzipped file samtools faidx will fail, in which
+        ## case the code falls back to decompressing to plain fasta before
+        ## reattempting the indexing.
+        ## Indexing of a bgzipped file produces a regular fai index file *and*
+        ## a compressed gzi file. The former is identical to the fai index of
+        ## the uncompressed fasta.
+
+        ## If the user has not selected a reference (it's an optional parameter
+        ## in some samtools wrappers), the Cheetah boolean use_ref is set to
+        ## False to encode that fact.
+
+        #set use_ref=True
         #if $addref_cond.addref_select == "history":
-            ln -s '${addref_cond.ref}' reference.fa &&
-            samtools faidx reference.fa &&
-            #set reffa="reference.fa"
-            #set reffai="reference.fa.fai"
+            #if $addref_cond.ref.is_of_type('fasta'):
+                reffa="reference.fa" &&
+                ln -s '${addref_cond.ref}' \$reffa &&
+                samtools faidx \$reffa &&
+            #else:
+                reffa="reference.fa.gz" &&
+                ln -s '${addref_cond.ref}' \$reffa &&
+                {
+                    samtools faidx \$reffa ||
+                    {
+                        echo "Failed to index compressed reference. Trying decompressed ..." 1>&2 &&
+                        gzip -dc \$reffa > reference.fa &&
+                        reffa="reference.fa" &&
+                        samtools faidx \$reffa;
+                    }
+                } &&
+            #end if
+            reffai=\$reffa.fai &&
         #elif $addref_cond.addref_select == "cached":
-            #set reffa=str($addref_cond.ref.fields.path)
-            #set reffai=str($addref_cond.ref.fields.path)+".fai"
+            ## For a cached genome the absolute path is used, which allows
+            ## reading a CRAM file without specifying the reference.
+            reffa='${addref_cond.ref.fields.path}' &&
+            reffai=\$reffa.fai &&
         #else
-            #set reffa=None
-            #set reffai=None
+            #set use_ref=False
         #end if
     ]]></token>
 
--- a/sam_to_bam.xml	Tue Sep 28 16:11:24 2021 +0000
+++ b/sam_to_bam.xml	Sat Nov 27 12:31:54 2021 +0000
@@ -16,7 +16,7 @@
         samtools view
             -b
             -@ \$addthreads
-            -t '$reffai'
+            -t "\$reffai"
             '$input' |
 
         samtools sort
@@ -67,6 +67,20 @@
             <param name="input" ftype="sam" value="sam_to_bam_noheader_in2.sam" />
             <output name="output1" ftype="bam" file="sam_to_bam_out3.bam" lines_diff="4"/>
         </test>
+        <test>
+            <!-- Test direct use of bgzipped reference -->
+            <param name="addref_select" value="history" />
+            <param name="ref" ftype="fasta.gz" dbkey="equCab2" value="chr_m.bgzipped_fasta.gz" />
+            <param name="input" ftype="sam" value="sam_to_bam_noheader_in2.sam" />
+            <output name="output1" ftype="bam" file="sam_to_bam_out3.bam" lines_diff="4"/>
+        </test>
+        <test>
+            <!-- Test with simple gzipped reference, which requires decompression -->
+            <param name="addref_select" value="history" />
+            <param name="ref" ftype="fasta.gz" dbkey="equCab2" value="chr_m.fasta.gz" />
+            <param name="input" ftype="sam" value="sam_to_bam_noheader_in2.sam" />
+            <output name="output1" ftype="bam" file="sam_to_bam_out3.bam" lines_diff="4"/>
+        </test>
     </tests>
     <help><![CDATA[
 **What it does**
Binary file test-data/chr_m.bgzipped_fasta.gz has changed
Binary file test-data/chr_m.fasta.gz has changed