view cactus_cactus.xml @ 7:6ae5a7b5a0a2 draft

planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 1a58313d07c285370daf8ae223f19fec2d852cea
author galaxy-australia
date Thu, 09 Feb 2023 01:45:56 +0000
parents a74dbfee5a86
children 2263989660f5
line wrap: on
line source

<tool id="cactus_cactus" name="Cactus" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
    <description>whole-genome multiple sequence alignment</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="xrefs"/>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
        ## Command template (Cheetah + shell). It builds a Cactus seqfile from the
        ## tool inputs and then runs either the pangenome steps (within-species)
        ## or plain cactus (between-species). All user-supplied values and dataset
        ## paths are single-quoted before they reach the shell.

        export TMPDIR=\${_GALAXY_JOB_TMP_DIR} &&

        ## Set up seqfile: in between-species mode the guide tree comes first,
        ## then one "label path" line is appended per input genome.

        #if $aln_mode.aln_mode_select == 'interspecies':
            cat '$aln_mode.in_tree' >> seqfile.txt &&
            ## Terminate the tree line explicitly. Without this, a tree file that
            ## lacks a trailing newline would have the first genome entry glued
            ## onto the tree line.
            printf '\n' >> seqfile.txt &&
        #end if
        #for $seq in $in_seqs:
            ## Link each dataset under "label.ext" so the name listed in the
            ## seqfile carries the dataset extension (fasta or fasta.gz).
            #set seq_fn = str($seq.label) + '.' + $seq.fasta.ext
            ln -s '$seq.fasta' '$seq_fn' &&
            printf '%s %s\n' '$seq.label' '$seq_fn' >> seqfile.txt
            &&
        #end for

        ## Run cactus

        #if $aln_mode.aln_mode_select == 'intraspecies':
            ## If we're doing a pangenome, we need to run the steps manually.
            ## The steps are chained through pangenome.gfa, pangenome.paf,
            ## alignment.hal and alignment.vg in the working directory.
            cactus-minigraph
            --binariesMode local
            --mapCores \${GALAXY_SLOTS:-4}
            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
            --reference '$aln_mode.ref_level'
            --workDir ./
            ./jobStore
            ./seqfile.txt
            pangenome.gfa
            &&
            cactus-graphmap
            --binariesMode local
            --maxCores  \${GALAXY_SLOTS:-4}
            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
            --outputFasta pangenome.gfa.fa
            --reference '$aln_mode.ref_level'
            --workDir ./
            ./jobStore
            ./seqfile.txt
            pangenome.gfa
            pangenome.paf
            &&
            cactus-align
            --binariesMode local
            --maxCores  \${GALAXY_SLOTS:-4}
            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
            --outVG
            --pangenome
            --reference '$aln_mode.ref_level'
            --workDir ./
            ./jobStore
            ./seqfile.txt
            pangenome.paf
            alignment.hal
            &&
            ## NOTE(review): alignment.vg is presumably written by the --outVG
            ## option of cactus-align above; confirm against the Cactus docs.
            cactus-graphmap-join
            --binariesMode local
            --giraffe
            --maxCores  \${GALAXY_SLOTS:-4}
            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
            --outName alignment
            --reference '$aln_mode.ref_level'
            --vg alignment.vg
            --outDir ./
            ./jobStore
        #else if $aln_mode.aln_mode_select == 'interspecies':
            ## Run cactus normally
            cactus
            --binariesMode local
            --maxCores  \${GALAXY_SLOTS:-4}
            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
            --workDir ./
            jobStore seqfile.txt alignment.hal
        #end if

    ]]></command>
    <inputs>
        <!-- The alignment mode decides which extra parameter is shown: a guide tree
             (between-species) or the label of the reference genome (within-species). -->
        <conditional name="aln_mode">
            <param name="aln_mode_select" type="select" label="Alignment mode" help="The taxonomic relationship between input genomes. If genomes are from multiple individuals of the same species, select 'Within-species'">
                <option value="interspecies" selected="true">Between-species</option>
                <option value="intraspecies">Within-species</option>
            </param>
            <when value="interspecies">
                <param name="in_tree" type="data" format="nhx" label="Guide tree" help="Phylogenetic tree in Newick format. Required by Cactus to achieve linear scaling with number of input genomes" />
            </when>
            <when value="intraspecies">
                <!-- The sanitizer strips every character outside [0-9a-zA-Z_] when the value
                     is used in the command. The validator is anchored so that a value
                     containing any other character is rejected with a clear message
                     instead of being silently altered. -->
                <param name="ref_level" type="text" value="" label="Reference genome" help="Pangenomes from Minigraph-Cactus depend on a predetermined reference genome. Specify one of the Input Genomes as the reference genome. This must match the label used in 'Genome Label'.">
                    <sanitizer invalid_char="">
                        <valid initial="string.letters,string.digits">
                            <add value="_" />
                        </valid>
                    </sanitizer>
                    <validator type="regex" message="Use only letters, digits and underscores (no spaces).">^[0-9a-zA-Z_]+$</validator>
                </param>
            </when>
        </conditional>
        <!-- One entry per genome: a label (used in the seqfile and as the link name)
             plus the FASTA dataset itself. -->
        <repeat name="in_seqs" title="Input genome">
            <!-- Same sanitizer and anchored validator as the reference genome label, so
                 labels that would be changed by sanitizing are rejected up front. -->
            <param name="label" type="text" value="" label="Genome label" help="Letters, digits and underscores only, NO SPACES. In Between-species mode the label must match a label in the guide tree.">
                <sanitizer invalid_char="">
                    <valid initial="string.letters,string.digits">
                        <add value="_" />
                    </valid>
                </sanitizer>
                <validator type="regex" message="Use only letters, digits and underscores (no spaces).">^[0-9a-zA-Z_]+$</validator>
            </param>
            <param name="fasta" type="data" format="fasta,fasta.gz" label="Genome Sequence" help="Input genome"/>
        </repeat>

        <!-- TODO: add an option for choosing the alignment root
             (original note: "root mr"; presumably the Cactus root option, confirm). -->
    </inputs>
    <outputs>
        <!-- HAL alignment, collected from alignment.hal in both alignment modes. -->
        <data name="out_hal"
              format="h5"
              from_work_dir="alignment.hal"
              label="${tool.name} on ${on_string} (HAL file)"/>
        <!-- Compressed GFA, collected from alignment.gfa.gz; only present when the
             within-species (pangenome) mode is selected. -->
        <data name="out_gfa"
              format="gfa2.gz"
              from_work_dir="alignment.gfa.gz"
              label="${tool.name} on ${on_string} (GFA file)">
            <filter>aln_mode['aln_mode_select'] == 'intraspecies'</filter>
        </data>
    </outputs>
    <tests>
        <!-- All tests only check output sizes (has_size with a tolerance), not contents. -->
        <!-- Test 1: between-species mode with a guide tree and five genomes; only the HAL output is expected. -->
        <test expect_num_outputs="1">
            <conditional name="aln_mode">
                <param name="aln_mode_select" value="interspecies"/>
                <param name="in_tree" value="test_tree.nhx"/>
            </conditional>
            <repeat name="in_seqs">
                <param name="label" value="simCow_chr6"/>
                <param name="fasta" value="simCow_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simDog_chr6"/>
                <param name="fasta" value="simDog_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simHuman_chr6"/>
                <param name="fasta" value="simHuman_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simMouse_chr6"/>
                <param name="fasta" value="simMouse_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simRat_chr6"/>
                <param name="fasta" value="simRat_chr6.fasta"/>
            </repeat>
            <output name="out_hal">
                <assert_contents>
                    <has_size value="4472551" delta="200000" />
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: within-species mode on the same five genomes, with simCow_chr6 as the reference; HAL and GFA outputs are expected. -->
        <test expect_num_outputs="2">
            <conditional name="aln_mode">
                <param name="aln_mode_select" value="intraspecies"/>
                <param name="ref_level" value="simCow_chr6"/>
            </conditional>
            <repeat name="in_seqs">
                <param name="label" value="simCow_chr6"/>
                <param name="fasta" value="simCow_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simDog_chr6"/>
                <param name="fasta" value="simDog_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simHuman_chr6"/>
                <param name="fasta" value="simHuman_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simMouse_chr6"/>
                <param name="fasta" value="simMouse_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simRat_chr6"/>
                <param name="fasta" value="simRat_chr6.fasta"/>
            </repeat>
            <output name="out_hal">
                <assert_contents>
                    <has_size value="2088959" delta="200000" />
                </assert_contents>
            </output>
            <output name="out_gfa">
                <assert_contents>
                    <!-- NOTE(review): delta is larger than value here, so even an empty file passes this check. Consider tightening. -->
                    <has_size value="173000" delta="200000" />
                </assert_contents>
            </output>
        </test>
        <!-- Test 3: within-species mode with gzip-compressed FASTA inputs (fasta.gz). -->
        <test expect_num_outputs="2">
            <conditional name="aln_mode">
                <param name="aln_mode_select" value="intraspecies"/>
                <param name="ref_level" value="germ_25"/>
            </conditional>
            <repeat name="in_seqs">
                <param name="label" value="germ_25"/>
                <param name="fasta" value="germ_25.fasta.gz"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="vulg_25"/>
                <param name="fasta" value="vulg_25.fasta.gz"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="pens_25"/>
                <param name="fasta" value="pens_25.fasta.gz"/>
            </repeat>
            <output name="out_hal">
                <assert_contents>
                    <has_size value="7420424" delta="200000" />
                </assert_contents>
            </output>
            <output name="out_gfa">
                <assert_contents>
                    <has_size value="5786846" delta="200000" />
                </assert_contents>
            </output>
        </test>
        <!-- Test 4: regression test for FASTA headers that contain spaces (this input used to fail). -->
        <test expect_num_outputs="2">
            <conditional name="aln_mode">
                <param name="aln_mode_select" value="intraspecies"/>
                <param name="ref_level" value="badheader1"/>
            </conditional>
            <repeat name="in_seqs">
                <param name="label" value="badheader1"/>
                <param name="fasta" value="bh1.fasta.gz"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="badheader2"/>
                <param name="fasta" value="bh2.fasta.gz"/>
            </repeat>
            <output name="out_hal">
                <assert_contents>
                    <has_size value="3382274" delta="200000" />
                </assert_contents>
            </output>
            <output name="out_gfa">
                <assert_contents>
                    <has_size value="764748" delta="200000" />
                </assert_contents>
            </output>
        </test>
    </tests>
        <help><![CDATA[

.. class:: infomark

**What it does**

`Cactus <https://github.com/ComparativeGenomicsToolkit/cactus>`__ is a
reference-free whole-genome multiple alignment program. It can be used
to progressively align a large number of genomes.

-----

.. class:: infomark

**Usage**

**Between-species mode (Progressive Cactus)**

If you are aligning genomes from **multiple species**, you need to
provide a guide tree in Newick format. Cactus uses the guide tree to
progressively align genomes, meaning that it doesn’t need to align all
possible pairs of genomes.

A Newick-formatted tree for human, chimp, gorilla and orangutan genomes
looks like this:

    ::

        (((human:0.006,chimp:0.006667):0.0022,gorilla:0.008825):0.0096,orang:0.01831);

The numbers are the branch lengths.

**Within-species mode (Minigraph-Cactus)**

You can also run Cactus in `pangenome
mode <https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md>`__
to align genomes of multiple individuals from the **same species**. In
this mode you will not use a guide tree. Cactus will use
`minigraph <https://github.com/lh3/minigraph>`__ to generate a graph of
the input genomes and then use the graph to order the alignments. To use
pangenome mode, select ‘Within-species’ in the ‘Alignment mode’
dropdown.

Unlike Between-species mode, Within-species mode depends on a predetermined reference genome.

-----

.. class:: infomark

**Input**

The developers recommend soft-masking your genomes with RepeatMasker
before running Cactus. RepeatMasker is available on Galaxy.

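Every input genome needs a label. Labels may only contain letters,
digits and underscores.

If you’re using Between-species mode, you need to provide labels for the
fasta files that match the leaves on the guide tree. In the example
above, you would use the label ‘human’ for the human fasta file.

If you’re using Within-species mode, the value you enter for ‘Reference
genome’ must exactly match the label of one of the input genomes.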

-----

.. class:: infomark

**Output**

The main output of Cactus is in `HAL
format <https://github.com/ComparativeGenomicsToolkit/cactus#using-the-output>`__.
You can use the `Cactus: export <root?tool_id=cactus_export>`__ tool to
convert the Cactus output to a VG or Multiple Alignment Format (MAF)
file.

In Within-species mode, the tool also produces a gzip-compressed GFA
file in addition to the HAL file.


        ]]></help>
    <expand macro="citations"/>
</tool>