Mercurial > repos > galaxy-australia > cactus_cactus
diff cactus_cactus.xml @ 0:85f68b344286 draft
"planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 8f8363625623f2ff3f04d12d227673ac134eba24"
author | galaxy-australia |
---|---|
date | Mon, 04 Apr 2022 06:27:44 +0000 |
parents | |
children | 1bc1199f0ff4 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/cactus_cactus.xml Mon Apr 04 06:27:44 2022 +0000 @@ -0,0 +1,268 @@
<!--
    Galaxy wrapper for Cactus.

    Two alignment modes are exposed through the "aln_mode" conditional:
      * interspecies: a single "cactus" call driven by a user supplied guide tree.
      * intraspecies: the pangenome steps are run one after another
        (minigraph, then cactus-graphmap, then cactus-align).

    Both modes write alignment.hal in the job working directory, which is
    collected as the "out_hal" output.

    The tool version, profile, requirements, xrefs and citations are all
    defined in macros.xml.
-->
<tool id="cactus_cactus" name="Cactus" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
    <description>whole-genome multiple sequence alignment.</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="xrefs"/>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
        ## Check the FASTA headers
        ## This is only necessary in pangenome mode
        ## Each input gets its own "if ...; then ...; fi &&" guard, so the job
        ## stops with exit code 1 at the first file whose header lines contain
        ## whitespace. Dataset paths are single-quoted so that unusual
        ## characters in a path cannot be re-interpreted by the shell.
        #if $aln_mode.aln_mode_select == 'intraspecies':
            #for $seq in $in_seqs:
                if
                #if $seq.fasta.is_of_type('fasta.gz'):
                    zgrep
                #else
                    grep
                #end if
                "^>" '$seq.fasta' | grep -q "[[:space:]]" ; then
                    ## Diagnostics go to stderr so they show up as errors.
                    echo "Error parsing input FASTA." >&2 ;
                    echo "Pangenome mode fails if there are spaces in the header." >&2 ;
                    echo "Please remove them with the NormalizeFasta tool." >&2 ;
                    exit 1 ;
                fi &&
            #end for
        #end if

        ## Set up seqfile
        ## seqfile.txt holds the guide tree (interspecies mode only) followed
        ## by one "<label> <file name>" line per input genome.

        #if $aln_mode.aln_mode_select == 'interspecies':
            cat '$aln_mode.in_tree' >> seqfile.txt &&
        #end if
        ## seq_line accumulates the linked file names, each one single-quoted,
        ## for use as positional arguments on the minigraph command line.
        #set seq_line = ''
        #for $seq in $in_seqs:
            ## Link every dataset to "<label>.<datatype extension>" in the
            ## working directory and record that name in the seqfile.
            #set seq_fn = str($seq.label) + '.' + $seq.fasta.ext
            ln -s '$seq.fasta' '$seq_fn' &&
            printf '%s %s\n' '$seq.label' '$seq_fn' >> seqfile.txt
            #set seq_line += "'" + $seq_fn + "' "
            &&
        #end for

        ## Run cactus

        #if $aln_mode.aln_mode_select == 'intraspecies':
            ## If we're doing a pangenome, we need to run the steps manually
            ## Step 1: minigraph writes a graph of the input genomes.
            minigraph -xggs
                -t \${GALAXY_SLOTS:-4}
                $seq_line
                > pangenome.gfa
            &&
            ## Step 2: cactus-graphmap takes the seqfile and the graph and is
            ## given pangenome.paf as its output path.
            cactus-graphmap
                --maxCores \${GALAXY_SLOTS:-4}
                --maxMemory \${GALAXY_MEMORY_MB:-8192}M
                ./jobStore
                ./seqfile.txt
                pangenome.gfa
                pangenome.paf
                --outputFasta pangenome.gfa.fa
                --binariesMode local
                --workDir ./
            &&
            ## Step 3: cactus-align takes the PAF from step 2 and writes the
            ## HAL file that Galaxy collects.
            cactus-align
                --maxCores \${GALAXY_SLOTS:-4}
                --maxMemory \${GALAXY_MEMORY_MB:-8192}M
                ./jobStore
                ./seqfile.txt
                pangenome.paf
                alignment.hal
                --pangenome
                --pafInput
                --binariesMode local
                --workDir ./
        #else if $aln_mode.aln_mode_select == 'interspecies':
            ## Run cactus normally
            cactus
                --maxCores \${GALAXY_SLOTS:-4}
                --maxMemory \${GALAXY_MEMORY_MB:-8192}M
                jobStore seqfile.txt alignment.hal
                --binariesMode local
                --workDir ./
        #end if

    ]]></command>
    <inputs>
        <!-- The guide tree is only requested in between-species mode. -->
        <conditional name="aln_mode">
            <param name="aln_mode_select" type="select" label="Alignment mode" help="The taxonomic relationship between input genomes. If genomes are from multiple individuals of the same species, select 'Within-species'">
                <option value="interspecies" selected="true">Between-species</option>
                <option value="intraspecies">Within-species</option>
            </param>
            <when value="interspecies">
                <param name="in_tree" type="data" format="nhx" label="Guide tree" help="Phylogenetic tree in Newick format. Required by Cactus to achieve linear scaling with number of input genomes" />
            </when>
            <when value="intraspecies">
            </when>
        </conditional>
        <!-- One repeat instance per genome: a label plus its FASTA dataset. -->
        <repeat name="in_seqs" title="Input genome">
            <param name="label" type="text" value="" label="Genome Label" help="NO SPACES. Must match a label in the guide tree.">
                <!--
                    The label is written as a whitespace-delimited token into
                    seqfile.txt and is also used as a file name in the working
                    directory, so empty labels, whitespace and "/" are rejected
                    when the form is submitted rather than failing mid-job.
                -->
                <validator type="regex" message="The genome label must not be empty and must not contain whitespace or '/'.">^[^\s/]+$</validator>
            </param>
            <param name="fasta" type="data" format="fasta,fasta.gz" label="Genome Sequence" help="Input genome"/>
        </repeat>
        <!-- add an option for root -->
        <!-- root mr -->
    </inputs>
    <outputs>
        <!-- alignment.hal is written by cactus / cactus-align in the working directory. -->
        <data name="out_hal" format="h5" from_work_dir="alignment.hal" label="${tool.name} on ${on_string} (HAL file)" />
    </outputs>
    <tests>
        <!-- test interspecies mode -->
        <test expect_num_outputs="1">
            <conditional name="aln_mode">
                <param name="aln_mode_select" value="interspecies"/>
                <param name="in_tree" value="test_tree.nhx"/>
            </conditional>
            <repeat name="in_seqs">
                <param name="label" value="simCow_chr6"/>
                <param name="fasta" value="simCow_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simDog_chr6"/>
                <param name="fasta" value="simDog_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simHuman_chr6"/>
                <param name="fasta" value="simHuman_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simMouse_chr6"/>
                <param name="fasta" value="simMouse_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simRat_chr6"/>
                <param name="fasta" value="simRat_chr6.fasta"/>
            </repeat>
            <output name="out_hal">
                <assert_contents>
                    <has_size value="5272354" delta="200000" />
                </assert_contents>
            </output>
        </test>
        <!-- within-species mode -->
        <test expect_num_outputs="1">
            <conditional name="aln_mode">
                <param name="aln_mode_select" value="intraspecies"/>
            </conditional>
            <repeat name="in_seqs">
                <param name="label" value="simCow_chr6"/>
                <param name="fasta" value="simCow_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simDog_chr6"/>
                <param name="fasta" value="simDog_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simHuman_chr6"/>
                <param name="fasta" value="simHuman_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simMouse_chr6"/>
                <param name="fasta" value="simMouse_chr6.fasta"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="simRat_chr6"/>
                <param name="fasta" value="simRat_chr6.fasta"/>
            </repeat>
            <output name="out_hal">
                <assert_contents>
                    <has_size value="2119332" delta="200000" />
                </assert_contents>
            </output>
        </test>
        <!-- compressed input -->
        <test expect_num_outputs="1">
            <conditional name="aln_mode">
                <param name="aln_mode_select" value="intraspecies"/>
            </conditional>
            <repeat name="in_seqs">
                <param name="label" value="germ_25"/>
                <param name="fasta" value="germ_25.fasta.gz"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="vulg_25"/>
                <param name="fasta" value="vulg_25.fasta.gz"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="pens_25"/>
                <param name="fasta" value="pens_25.fasta.gz"/>
            </repeat>
            <output name="out_hal">
                <assert_contents>
                    <has_size value="7204260" delta="200000" />
                </assert_contents>
            </output>
        </test>
        <!-- FASTA header -->
        <!-- The header check in the command is expected to stop the job with exit code 1. -->
        <test expect_exit_code="1" expect_failure="true">
            <conditional name="aln_mode">
                <param name="aln_mode_select" value="intraspecies"/>
            </conditional>
            <repeat name="in_seqs">
                <param name="label" value="badheader1"/>
                <param name="fasta" value="bh1.fasta.gz"/>
            </repeat>
            <repeat name="in_seqs">
                <param name="label" value="badheader2"/>
                <param name="fasta" value="bh2.fasta.gz"/>
            </repeat>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

`Cactus <https://github.com/ComparativeGenomicsToolkit/cactus>`__ is a
reference-free whole-genome multiple alignment program. It can be used
to progressively align a large number of genomes.

**Usage**

**Between-species mode**

If you are aligning genomes from **multiple species**, you need to
provide a guide tree in Newick format. Cactus uses the guide tree to
progressively align genomes, meaning that it doesn’t need to align all
possible pairs of genomes.

A Newick-formatted tree for human, chimp, gorilla and orangutan genomes
looks like this:

::

    (((human:0.006,chimp:0.006667):0.0022,gorilla:0.008825):0.0096,orang:0.01831);

The numbers are the branch lengths.

**Beta: Within-species mode**

You can also run Cactus in `pangenome
mode <https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md>`__
to align genomes of multiple individuals from the **same species**. In
this mode you will not use a guide tree. Cactus will use
`minigraph <https://github.com/lh3/minigraph>`__ to generate a graph of
the input genomes and then use the graph to order the alignments. To use
pangenome mode, select ‘Within-species’ in the ‘Alignment mode’
dropdown.

⚠️ To use pangenome mode, you will have to remove spaces from the headers in your FASTA file.
You can do this with the NormalizeFasta tool.

**Input**

The developers recommend soft-masking your genomes with RepeatMasker
before running Cactus. RepeatMasker is available on Galaxy.

Every input genome needs a label. Labels must not be empty and must not
contain spaces or ‘/’.

If you’re using Between-species mode, you need to provide labels for the
fasta files that match the leaves on the guide tree. In the example
above, you would use the label ‘human’ for the human fasta file.

**Output**

The main output of Cactus is in `HAL
format <https://github.com/ComparativeGenomicsToolkit/cactus#using-the-output>`__.
You can use the `Cactus: export <root?tool_id=cactus_export>`__ tool to
convert the Cactus output to a VG or Multiple Alignment Format (MAF)
file.


    ]]></help>
    <expand macro="citations"/>
</tool>