diff cactus_cactus.xml @ 0:51c3c42bc644 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/cactus commit 827619d22d2931d8fb34ed6844cfa91433e1ac2c
author iuc
date Tue, 06 Feb 2024 00:30:39 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/cactus_cactus.xml	Tue Feb 06 00:30:39 2024 +0000
@@ -0,0 +1,260 @@
+<tool id="cactus_cactus" name="Cactus" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
+    <description>whole-genome multiple sequence alignment</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="xrefs"/>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+
+        ## Set up seqfile
+        
+        #if $aln_mode.aln_mode_select == 'interspecies':
+            cat $aln_mode.in_tree >> seqfile.txt &&
+        #end if
+        #for $seq in $in_seqs:
+            #set seq_fn = str($seq.label) + '.' + $seq.fasta.ext
+            ln -s '$seq.fasta' '$seq_fn' &&
+            printf '%s %s\n' '$seq.label' '$seq_fn' >> seqfile.txt 
+            &&
+        #end for
+
+        ## Run cactus
+
+        #if $aln_mode.aln_mode_select == 'intraspecies':
+            ## Run cactus-pangenome
+            ## --reference should be the first argument
+            ## https://github.com/ComparativeGenomicsToolkit/cactus/issues/1093#issuecomment-1620088688
+            cactus-pangenome 
+            --reference $aln_mode.ref_level
+            --binariesMode local
+            --maxCores \${GALAXY_SLOTS:-4}
+            --maxMemory \${GALAXY_MEMORY_MB:-16384}M    
+            --outDir ./
+            --outName alignment
+            jobStore
+            seqfile.txt
+        #else if $aln_mode.aln_mode_select == 'interspecies':
+            ## Run cactus normally
+            cactus 
+            --binariesMode local 
+            --maxCores \${GALAXY_SLOTS:-4}
+            --maxMemory \${GALAXY_MEMORY_MB:-16384}M
+            --workDir ./
+            jobStore
+            seqfile.txt
+            alignment.full.hal 
+        #end if
+
+    ]]></command>
+    <inputs>
+        <conditional name="aln_mode">
+            <param name="aln_mode_select" type="select" label="Alignment mode" help="The taxonomic relationship between input genomes. If genomes are from multiple individuals of the same species, select 'Within-species'">
+                <option value="interspecies" selected="true">Between-species</option>
+                <option value="intraspecies">Within-species</option>
+            </param>
+            <when value="interspecies">
+                <param name="in_tree" type="data" format="nhx" label="Guide tree" help="Phylogenetic tree in Newick format. Required by Cactus to achieve linear scaling with number of input genomes"/>
+            </when>
+            <when value="intraspecies">
+                <param name="ref_level" type="text" value="" label="Reference genome" help="Pangenomes from Minigraph-Cactus depend on a predetermined reference genome. Specify one of the Input Genomes as the reference genome. This must match the label used in 'Genome Label'.">
+                    <sanitizer invalid_char="">
+                        <valid initial="string.letters,string.digits">
+                            <add value="_"/>
+                        </valid>
+                    </sanitizer>
+                    <validator type="regex">[0-9a-zA-Z_]+</validator>
+                </param>
+            </when>
+        </conditional>
+        <repeat name="in_seqs" title="Input genome">
+            <param name="label" type="text" value="" label="Genome label" help="NO SPACES. Must match a label in the guide tree.">
+                <sanitizer invalid_char="">
+                    <valid initial="string.letters,string.digits">
+                        <add value="_"/>
+                    </valid>
+                </sanitizer>
+                <validator type="regex">[0-9a-zA-Z_]+</validator>
+            </param>
+            <param name="fasta" type="data" format="fasta,fasta.gz" label="Genome Sequence" help="Input genome"/>
+        </repeat>
+        <!-- add an option for root -->
+        <!-- root mr  -->
+    </inputs>
+    <outputs>
+        <data name="out_hal" format="h5" from_work_dir="alignment.full.hal" label="${tool.name} on ${on_string} (HAL file)">
+        </data>
+        <data name="out_gfa" format="gfa2.gz" from_work_dir="alignment.gfa.gz" label="${tool.name} on ${on_string} (GFA file)">
+            <filter>aln_mode['aln_mode_select'] == 'intraspecies'</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- test interspecies mode -->
+        <test expect_num_outputs="1">
+            <conditional name="aln_mode">
+                <param name="aln_mode_select" value="interspecies"/>
+                <param name="in_tree" value="test_tree.nhx"/>
+            </conditional>
+            <repeat name="in_seqs">
+                <param name="label" value="simCow_chr6"/>
+                <param name="fasta" value="simCow_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simDog_chr6"/>
+                <param name="fasta" value="simDog_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simHuman_chr6"/>
+                <param name="fasta" value="simHuman_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simMouse_chr6"/>
+                <param name="fasta" value="simMouse_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simRat_chr6"/>
+                <param name="fasta" value="simRat_chr6.fasta"/>
+            </repeat>
+            <output name="out_hal">
+                <assert_contents>
+                    <has_size value="4472551" delta="200000"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- within-species mode -->
+        <test expect_num_outputs="2">
+            <conditional name="aln_mode">
+                <param name="aln_mode_select" value="intraspecies"/>
+                <param name="ref_level" value="simCow_chr6"/>
+            </conditional>
+            <repeat name="in_seqs">
+                <param name="label" value="simCow_chr6"/>
+                <param name="fasta" value="simCow_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simDog_chr6"/>
+                <param name="fasta" value="simDog_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simHuman_chr6"/>
+                <param name="fasta" value="simHuman_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simMouse_chr6"/>
+                <param name="fasta" value="simMouse_chr6.fasta"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="simRat_chr6"/>
+                <param name="fasta" value="simRat_chr6.fasta"/>
+            </repeat>
+            <output name="out_hal">
+                <assert_contents>
+                    <has_size value="565214" delta="65214"/>
+                </assert_contents>
+            </output>
+            <output name="out_gfa">
+                <assert_contents>
+                    <has_size value="173000" delta="200000"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- FASTA header with spaces (used to fail) -->
+        <test expect_num_outputs="2">
+            <conditional name="aln_mode">
+                <param name="aln_mode_select" value="intraspecies"/>
+                <param name="ref_level" value="badheader1"/>
+            </conditional>
+            <repeat name="in_seqs">
+                <param name="label" value="badheader1"/>
+                <param name="fasta" value="bh1.fasta.gz"/>
+            </repeat>
+            <repeat name="in_seqs">
+                <param name="label" value="badheader2"/>
+                <param name="fasta" value="bh2.fasta.gz"/>
+            </repeat>
+            <output name="out_hal">
+                <assert_contents>
+                    <has_size value="3382274" delta="200000"/>
+                </assert_contents>
+            </output>
+            <output name="out_gfa">
+                <assert_contents>
+                    <has_size value="764748" delta="200000"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+.. class:: infomark
+
+**What it does**
+
+`Cactus <https://github.com/ComparativeGenomicsToolkit/cactus>`__ is a
+reference-free whole-genome multiple alignment program. It can be used
+to progressively align a large number of genomes.
+
+-----
+
+.. class:: infomark
+
+**Usage**
+
+**Between-species mode (Progressive Cactus)**
+
+If you are aligning genomes from **multiple species**, you need to
+provide a guide tree in Newick format. Cactus uses the guide tree to
+progressively align genomes, meaning that it doesn’t need to align all
+possible pairs of genomes.
+
+A Newick-formatted tree for human, chimp and gorilla genomes looks like
+this:
+
+    ::
+
+        (((human:0.006,chimp:0.006667):0.0022,gorilla:0.008825):0.0096,orang:0.01831);
+
+The numbers are the branch lengths.
+
+**Within-species mode (Minigraph-Cactus)**
+
+You can also run Cactus in `pangenome
+mode <https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md>`__
+to align genomes of multiple individuals from the **same species**. In
+this mode you will not use a guide tree. Cactus will use
+`minigraph <https://github.com/lh3/minigraph>`__ to generate a graph of
+the input genomes and then use the graph to order the alignments. To use
+pangenome mode, select ‘Within-species’ in the ‘Alignment mode’
+dropdown.
+
+Unlike Between-species mode, Within-species mode depends on a predetermined reference genome.
+
+-----
+
+.. class:: infomark
+
+**Input**
+
+The developers recommend soft-masking your genomes with RepeatMasker
+before running Cactus. RepeatMasker is available on Galaxy.
+
+If you’re using Between-species mode, you need to provide labels for the
+fasta files that match the leaves on the guide tree. In the example
+above, you would use the label ‘human’ for the human fasta file.
+
+-----
+
+.. class:: infomark
+
+**Output**
+
+The main output of Cactus is in `HAL
+format <https://github.com/ComparativeGenomicsToolkit/cactus#using-the-output>`__.
+You can use the `Cactus: export <root?tool_id=cactus_export>`__ tool to
+convert the Cactus output to a VG or Multiple Alignment Format (MAF)
+file.
+
+
+        ]]></help>
+    <expand macro="citations"/>
+</tool>