diff cactus_cactus.xml @ 4:3c8227556fdc draft

planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 2b4c7c63b0e4a1f730794a4a5825bce29ee2eb25
author galaxy-australia
date Wed, 09 Nov 2022 03:14:17 +0000
parents 9422c5a87ee2
children 48c13389050d
line wrap: on
line diff
--- a/cactus_cactus.xml	Tue Sep 20 05:38:05 2022 +0000
+++ b/cactus_cactus.xml	Wed Nov 09 03:14:17 2022 +0000
@@ -6,17 +6,17 @@
     <expand macro="xrefs"/>
     <expand macro="requirements"/>
     <command detect_errors="exit_code"><![CDATA[
+        export TMPDIR=\${_GALAXY_JOB_TMP_DIR} &&
+
         ## Set up seqfile
         
         #if $aln_mode.aln_mode_select == 'interspecies':
             cat $aln_mode.in_tree >> seqfile.txt &&
         #end if
-        #set seq_line = ''
         #for $seq in $in_seqs:
             #set seq_fn = str($seq.label) + '.' + $seq.fasta.ext
             ln -s '$seq.fasta' '$seq_fn' &&
             printf '%s %s\n' '$seq.label' '$seq_fn' >> seqfile.txt 
-            #set seq_line += $seq_fn + ' '
             &&
         #end for
 
@@ -24,40 +24,61 @@
 
         #if $aln_mode.aln_mode_select == 'intraspecies':
             ## If we're doing a pangenome, we need to run the steps manually
-            minigraph -xggs
-            -t \${GALAXY_SLOTS:-4}
-            $seq_line
-            > pangenome.gfa
+            cactus-minigraph 
+            --binariesMode local
+            --mapCores \${GALAXY_SLOTS:-4}
+            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
+            --reference '$aln_mode.ref_level'
+            --workDir ./
+            ./jobStore
+            ./seqfile.txt
+            pangenome.gfa
             &&
             cactus-graphmap
+            --binariesMode local
             --maxCores  \${GALAXY_SLOTS:-4}
             --maxMemory \${GALAXY_MEMORY_MB:-8192}M
+            --outputFasta pangenome.gfa.fa
+            --reference '$aln_mode.ref_level'
+            --workDir ./
             ./jobStore
             ./seqfile.txt
             pangenome.gfa
             pangenome.paf 
-            --outputFasta pangenome.gfa.fa
-            --binariesMode local
-            --workDir ./
             &&
             cactus-align
+            --binariesMode local
             --maxCores  \${GALAXY_SLOTS:-4}
             --maxMemory \${GALAXY_MEMORY_MB:-8192}M
+            --outVG
+            --pangenome
+            --reference '$aln_mode.ref_level'
+            --workDir ./
             ./jobStore
             ./seqfile.txt
             pangenome.paf 
             alignment.hal
-            --pangenome
+            &&
+            cactus-graphmap-join
             --binariesMode local
-            --workDir ./
+            --gfaffix 
+            --giraffe
+            --maxCores  \${GALAXY_SLOTS:-4}
+            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
+            --outDir ./
+            --outName alignment
+            --reference '$aln_mode.ref_level'
+            --vg alignment.vg
+            --wlineSep "." 
+            ./jobStore
         #else if $aln_mode.aln_mode_select == 'interspecies':
             ## Run cactus normally
             cactus 
+            --binariesMode local 
             --maxCores  \${GALAXY_SLOTS:-4}
             --maxMemory \${GALAXY_MEMORY_MB:-8192}M
+            --workDir ./
             jobStore seqfile.txt alignment.hal 
-            --binariesMode local 
-            --workDir ./
         #end if
 
     ]]></command>
@@ -71,6 +92,7 @@
                 <param name="in_tree" type="data" format="nhx" label="Guide tree" help="Phylogenetic tree in Newick format. Required by Cactus to achieve linear scaling with number of input genomes" />
             </when>
             <when value="intraspecies">
+                <param name="ref_level" type="text" value="" label="Reference genome" help="Pangenomes from Minigraph-Cactus depend on a predetermined reference genome. Specify one of the Input Genomes as the reference genome. This must match the label used in 'Genome Label'."><validator type="empty_field" message="A reference genome label is required in within-species mode"/></param>
             </when>
         </conditional>
         <repeat name="in_seqs" title="Input genome">
@@ -83,10 +105,13 @@
     </inputs>
     <outputs>
         <data name="out_hal" format="h5" from_work_dir="alignment.hal" label="${tool.name} on ${on_string} (HAL file)" />
+        <data name="out_gfa" format="gfa2.gz" from_work_dir="alignment.gfa.gz" label="${tool.name} on ${on_string} (GFA file)" >
+            <filter>aln_mode['aln_mode_select'] == 'intraspecies'</filter>
+        </data>
     </outputs>
     <tests>
         <!-- test interspecies mode -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="1">
             <conditional name="aln_mode">
                 <param name="aln_mode_select" value="interspecies"/>
                 <param name="in_tree" value="test_tree.nhx"/>
@@ -118,9 +143,10 @@
             </output>
         </test>
         <!-- within-species mode -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="2">
             <conditional name="aln_mode">
                 <param name="aln_mode_select" value="intraspecies"/>
+                <param name="ref_level" value="simCow_chr6"/>
             </conditional>
             <repeat name="in_seqs">
                 <param name="label" value="simCow_chr6"/>
@@ -144,14 +170,20 @@
             </repeat>
             <output name="out_hal">
                 <assert_contents>
-                    <has_size value="1349620" delta="200000" />
+                    <has_size value="2088959" delta="200000" />
+                </assert_contents>
+            </output>
+            <output name="out_gfa">
+                <assert_contents>
+                    <has_size value="173000" delta="100000" />
                 </assert_contents>
             </output>
         </test>
         <!-- compressed input -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="2">
             <conditional name="aln_mode">
                 <param name="aln_mode_select" value="intraspecies"/>
+                <param name="ref_level" value="germ_25"/>
             </conditional>
             <repeat name="in_seqs">
                 <param name="label" value="germ_25"/>
@@ -170,11 +202,17 @@
                     <has_size value="7420424" delta="200000" />
                 </assert_contents>
             </output>
+            <output name="out_gfa">
+                <assert_contents>
+                    <has_size value="6710429" delta="200000" />
+                </assert_contents>
+            </output>
         </test>
         <!-- FASTA header with spaces (used to fail) -->
-        <test expect_num_outputs="1">
+        <test expect_num_outputs="2">
             <conditional name="aln_mode">
                 <param name="aln_mode_select" value="intraspecies"/>
+                <param name="ref_level" value="badheader1"/>
             </conditional>
             <repeat name="in_seqs">
                 <param name="label" value="badheader1"/>
@@ -189,6 +227,11 @@
                     <has_size value="3382274" delta="200000" />
                 </assert_contents>
             </output>
+            <output name="out_gfa">
+                <assert_contents>
+                    <has_size value="764748" delta="200000" />
+                </assert_contents>
+            </output>
         </test>
     </tests>
         <help><![CDATA[
@@ -200,7 +243,7 @@
 
 **Usage**
 
-**Between-species mode**
+**Between-species mode (Progressive Cactus)**
 
 If you are aligning genomes from **multiple species**, you need to
 provide a guide tree in Newick format. Cactus uses the guide tree to
@@ -216,7 +259,7 @@
 
 The numbers are the branch lengths.
 
-**Beta: Within-species mode**
+**Within-species mode (Minigraph-Cactus)**
 
 You can also run Cactus in `pangenome
 mode <https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md>`__
@@ -227,6 +270,8 @@
 pangenome mode, select ‘Within-species’ in the ‘Alignment mode’
 dropdown.
 
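+Unlike Between-species mode, Within-species mode depends on a predetermined reference genome. Enter the label of one of your input genomes in the 'Reference genome' field; it must exactly match that genome's 'Genome Label'.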
+
 **Input**
 
 The developers recommend soft-masking your genomes with RepeatMasker