diff cactus_cactus.xml @ 0:85f68b344286 draft

"planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 8f8363625623f2ff3f04d12d227673ac134eba24"
author galaxy-australia
date Mon, 04 Apr 2022 06:27:44 +0000
parents
children 1bc1199f0ff4
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cactus_cactus.xml	Mon Apr 04 06:27:44 2022 +0000
@@ -0,0 +1,268 @@
+<tool id="cactus_cactus" name="Cactus" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
+    <description>whole-genome multiple sequence alignment.</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="xrefs"/>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Check the FASTA headers
+        ## This is only necessary in pangenome mode
+        #if $aln_mode.aln_mode_select == 'intraspecies':
+            #for $seq in $in_seqs:
+                if
+                #if $seq.fasta.is_of_type('fasta.gz'):
+                    zgrep
+                #else
+                    grep
+                #end if
+                "^>" $seq.fasta | grep -q "[[:space:]]" ; then
+                    echo "Error parsing input FASTA." ;
+                    echo "Pangenome mode fails if there are spaces in the header." ;
+                    echo "Please remove them with the NormalizeFasta tool." ;
+                    exit 1 
+                    ; fi &&
+            #end for
+        #end if
+
+        ## Set up seqfile
+        
+        #if $aln_mode.aln_mode_select == 'interspecies':
+            cat $aln_mode.in_tree >> seqfile.txt &&
+        #end if
+        #set seq_line = ''
+        #for $seq in $in_seqs:
+            #set seq_fn = str($seq.label) + '.' + $seq.fasta.ext
+            ln -s '$seq.fasta' '$seq_fn' &&
+            printf '%s %s\n' '$seq.label' '$seq_fn' >> seqfile.txt 
+            #set seq_line += $seq_fn + ' '
+            &&
+        #end for
+
+        ## Run cactus
+
+        #if $aln_mode.aln_mode_select == 'intraspecies':
+            ## If we're doing a pangenome, we need to run the steps manually
+            minigraph -xggs
+            -t \${GALAXY_SLOTS:-4}
+            $seq_line
+            > pangenome.gfa
+            &&
+            cactus-graphmap
+            --maxCores  \${GALAXY_SLOTS:-4}
+            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
+            ./jobStore
+            ./seqfile.txt
+            pangenome.gfa
+            pangenome.paf 
+            --outputFasta pangenome.gfa.fa
+            --binariesMode local
+            --workDir ./
+            &&
+            cactus-align
+            --maxCores  \${GALAXY_SLOTS:-4}
+            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
+            ./jobStore
+            ./seqfile.txt
+            pangenome.paf 
+            alignment.hal
+            --pangenome
+            --pafInput
+            --binariesMode local
+            --workDir ./
+        #else if $aln_mode.aln_mode_select == 'interspecies':
+            ## Run cactus normally
+            cactus 
+            --maxCores  \${GALAXY_SLOTS:-4}
+            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
+            jobStore seqfile.txt alignment.hal 
+            --binariesMode local 
+            --workDir ./
+        #end if
+
+    ]]></command>
+    <inputs>
+        <conditional name="aln_mode">
+            <param name="aln_mode_select" type="select" label="Alignment mode" help="The taxonomic relationship between input genomes. If genomes are from multiple individuals of the same species, select 'Within-species'">
+                <option value="interspecies" selected="true">Between-species</option>
+                <option value="intraspecies">Within-species</option>
+            </param>
+            <when value="interspecies">
+                <param name="in_tree" type="data" format="nhx" label="Guide tree" help="Phylogenetic tree in Newick format. Required by Cactus to achieve linear scaling with number of input genomes" />
+            </when>
+            <when value="intraspecies">
+            </when>
+        </conditional>
+        <repeat name="in_seqs" title="Input genome">
+            <param name="label" type="text" value="" label="Genome Label" help="NO SPACES. Must match a label in the guide tree.">
+            </param>
+            <param name="fasta" type="data" format="fasta,fasta.gz" label="Genome Sequence" help="Input genome"/>
+        </repeat>
+        <!-- add an option for root -->
+        <!-- root mr  -->
+    </inputs>
+    <outputs>
+        <data name="out_hal" format="h5" from_work_dir="alignment.hal" label="${tool.name} on ${on_string} (HAL file)" />
+    </outputs>
+    <tests>
+        <!-- test interspecies mode -->
+        <test expect_num_outputs="1">
+            <conditional name="aln_mode">
+                <param name="aln_mode_select" value="interspecies"/>
+                <param name="in_tree" value="test_tree.nhx"/>
+            </conditional>
+            <repeat name="in_seqs">
+                <param name="label" value="simCow_chr6"/>
+                <param name="fasta" value="simCow_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simDog_chr6"/>
+                <param name="fasta" value="simDog_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simHuman_chr6"/>
+                <param name="fasta" value="simHuman_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simMouse_chr6"/>
+                <param name="fasta" value="simMouse_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simRat_chr6"/>
+                <param name="fasta" value="simRat_chr6.fasta"/>
+            </repeat>
+            <output name="out_hal">
+                <assert_contents>
+                    <has_size value="5272354" delta="200000" />
+                </assert_contents>
+            </output>
+        </test>
+        <!-- within-species mode -->
+        <test expect_num_outputs="1">
+            <conditional name="aln_mode">
+                <param name="aln_mode_select" value="intraspecies"/>
+            </conditional>
+            <repeat name="in_seqs">
+                <param name="label" value="simCow_chr6"/>
+                <param name="fasta" value="simCow_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simDog_chr6"/>
+                <param name="fasta" value="simDog_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simHuman_chr6"/>
+                <param name="fasta" value="simHuman_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simMouse_chr6"/>
+                <param name="fasta" value="simMouse_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simRat_chr6"/>
+                <param name="fasta" value="simRat_chr6.fasta"/>
+            </repeat>
+            <output name="out_hal">
+                <assert_contents>
+                    <has_size value="2119332" delta="200000" />
+                </assert_contents>
+            </output>
+        </test>
+        <!-- compressed input -->
+        <test expect_num_outputs="1">
+            <conditional name="aln_mode">
+                <param name="aln_mode_select" value="intraspecies"/>
+            </conditional>
+            <repeat name="in_seqs">
+                <param name="label" value="germ_25"/>
+                <param name="fasta" value="germ_25.fasta.gz"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="vulg_25"/>
+                <param name="fasta" value="vulg_25.fasta.gz"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="pens_25"/>
+                <param name="fasta" value="pens_25.fasta.gz"/>
+            </repeat>
+            <output name="out_hal">
+                <assert_contents>
+                    <has_size value="7204260" delta="200000" />
+                </assert_contents>
+            </output>
+        </test>
+        <!-- FASTA header -->
+        <test expect_exit_code="1" expect_failure="true">
+            <conditional name="aln_mode">
+                <param name="aln_mode_select" value="intraspecies"/>
+            </conditional>
+            <repeat name="in_seqs">
+                <param name="label" value="badheader1"/>
+                <param name="fasta" value="bh1.fasta.gz"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="badheader2"/>
+                <param name="fasta" value="bh2.fasta.gz"/>
+            </repeat>
+        </test>
+    </tests>
+        <help><![CDATA[
+**What it does**
+
+`Cactus <https://github.com/ComparativeGenomicsToolkit/cactus>`__ is a
+reference-free whole-genome multiple alignment program. It can be used
+to progressively align a large number of genomes.
+
+**Usage**
+
+**Between-species mode**
+
+If you are aligning genomes from **multiple species**, you need to
+provide a guide tree in Newick format. Cactus uses the guide tree to
+progressively align genomes, meaning that it doesn’t need to align all
+possible pairs of genomes.
+
+A Newick-formatted tree for human, chimp and gorilla genomes looks like
+this:
+
+::
+
+   (((human:0.006,chimp:0.006667):0.0022,gorilla:0.008825):0.0096,orang:0.01831);
+
+The numbers are the branch lengths.
+
+**Beta: Within-species mode**
+
+You can also run Cactus in `pangenome
+mode <https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md>`__
+to align genomes of multiple individuals from the **same species**. In
+this mode you will not use a guide tree. Cactus will use
+`minigraph <https://github.com/lh3/minigraph>`__ to generate a graph of
+the input genomes and then use the graph to order the alignments. To use
+pangenome mode, select ‘Within-species’ in the ‘Alignment mode’
+dropdown.
+
+⚠️ To use pangenome mode, you will have to remove spaces from the headers in your FASTA file.
+You can do this with the NormalizeFasta tool.
+
+**Input**
+
+The developers recommend soft-masking your genomes with RepeatMasker
+before running Cactus. RepeatMasker is available on Galaxy.
+
+If you’re using Between-species mode, you need to provide labels for the
+fasta files that match the leaves on the guide tree. In the example
+above, you would use the label ‘human’ for the human fasta file.
+
+**Output**
+
+The main output of Cactus is in `HAL
+format <https://github.com/ComparativeGenomicsToolkit/cactus#using-the-output>`__.
+You can use the `Cactus: export <root?tool_id=cactus_export>`__ tool to
+convert the Cactus output to a VG or Multiple Alignment Format (MAF)
+file.
+
+
+        ]]></help>
+    <expand macro="citations"/>
+</tool>