diff bealign.xml @ 1:f9b72a376ec9 draft

"planemo upload for repository https://github.com/davebx/bioext-gx/ commit 9a163dd8880c14f371e2603389f4951881a74b25"
author iuc
date Thu, 13 May 2021 16:57:26 +0000
parents 6ef10b28e967
children d8b6f0adaa79
line wrap: on
line diff
--- a/bealign.xml	Wed May 16 17:34:42 2018 -0400
+++ b/bealign.xml	Thu May 13 16:57:26 2021 +0000
@@ -8,6 +8,15 @@
     <version_command>bealign --version</version_command>
     <command detect_errors="exit_code">
     <![CDATA[
+    ## Some downstream tools, such as the TN-93 clustering tool and RAxML, might
+    ## break on non-standard sequence characters or non-alphanumeric text in the
+    ## sequence names, so the input is filtered first: sequence lines keep only
+    ## IUPAC nucleotide codes plus "?" and "-", and in ">" header lines anything but
+    ## alphanumerics, "_" and ">" becomes an underscore. This should not affect the
+    ## alignment, since such sequence characters are already ignored, but the possibility remains.
+    ## POSIX gsub() replaces gawk-only gensub(); NOTE(review): filter ignores --alphabet, confirm.
+    awk '{ if (\$0 ~ /^[^>]/) { gsub(/[^ACGTURYKMSWBDHVNacgturykmswbdhvn?-]/, ""); } else { gsub(/[^>A-Za-z0-9_]/, "_"); } print }' < '$input' |
+        sed 's,_\\+,_,g' > reads.fa &&
     bealign --reference '$select_reference.reference' --alphabet $advanced.alphabet
         #if $advanced.expected_identity:
             --expected-identity $advanced.expected_identity
@@ -16,11 +25,11 @@
         #if $advanced.discard:
             $advanced.discard '$advanced.discarded_reads'
         #end if
-        '$input' '$output'
+        reads.fa alignment.bam
     ]]>
     </command>
     <inputs>
-        <param name="input" type="data" format="fasta" label="Input reads" />
+        <param name="input" type="data" format="fasta" label="Input reads" help="For the benefit of certain tools that depend on this aligner, such as the TN-93 clustering tool, this dataset's sequence names will have non-alphanumeric characters replaced with underscores, and the sequences will be restricted to the set of IUPAC nucleotide characters." />
         <conditional name="select_reference">
             <param name="reference_type" type="select">
                 <option value="preset">Select preset</option>
@@ -72,7 +81,7 @@
         </section>
     </inputs>
     <outputs>
-        <data name="output" format="bam" />
+        <data name="output" format="bam" from_work_dir="alignment.bam" />
         <data name="discarded_reads" format="fasta">
             <filter>advanced['discard']</filter>
         </data>
@@ -83,14 +92,14 @@
             <param name="reference_type" value="dataset" />
             <param name="score_matrix" value="HIV_BETWEEN_F" />
             <param name="reference" ftype="fasta" value="bealign-in-ref-1.fa" />
-            <output name="output" file="bealign-out1.bam" />
+            <output name="output" file="bealign-out1.bam" ftype="bam" />
         </test>
         <test>
             <param name="input" ftype="fasta" value="bealign-in2.fa" />
             <param name="reference_type" value="dataset" />
             <param name="score_matrix" value="BLOSUM62" />
             <param name="reference" ftype="fasta" value="bealign-in-ref-2.fa" />
-            <output name="output" file="bealign-out2.bam" />
+            <output name="output" file="bealign-out2.bam" ftype="bam" />
         </test>
         <test>
             <param name="input" ftype="fasta" value="bealign-in2.fa" />
@@ -98,7 +107,7 @@
             <param name="expected_identity" value="0.9" />
             <param name="score_matrix" value="BLOSUM62" />
             <param name="reference" ftype="fasta" value="bealign-in-ref-2.fa" />
-            <output name="output" file="bealign-out3.bam" />
+            <output name="output" file="bealign-out3.bam" ftype="bam" />
         </test>
     </tests>
     <help>