comparison cactus_cactus.xml @ 0:85f68b344286 draft

"planemo upload for repository https://github.com/usegalaxy-au/tools-au commit 8f8363625623f2ff3f04d12d227673ac134eba24"
author galaxy-australia
date Mon, 04 Apr 2022 06:27:44 +0000
parents
children 1bc1199f0ff4
comparison
equal deleted inserted replaced
-1:000000000000 0:85f68b344286
1 <tool id="cactus_cactus" name="Cactus" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@" license="MIT">
2 <description>whole-genome multiple sequence alignment.</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="xrefs"/>
7 <expand macro="requirements"/>
<command detect_errors="exit_code"><![CDATA[
    ## ------------------------------------------------------------------
    ## Command template (Cheetah -> shell).
    ##
    ## Steps:
    ##   1. (within-species mode only) reject FASTA inputs whose header
    ##      lines contain whitespace.
    ##   2. Build seqfile.txt: the guide tree (between-species mode only)
    ##      followed by one "<label> <file>" line per input genome.
    ##   3. Run either minigraph + cactus-graphmap + cactus-align
    ##      (within-species) or plain cactus (between-species).
    ##
    ## The final alignment is written to alignment.hal in the working dir.
    ## All dataset paths are single-quoted so that paths containing shell
    ## metacharacters cannot break or alter the generated command line.
    ## ------------------------------------------------------------------

    ## Check the FASTA headers
    ## This is only necessary in pangenome mode
    #if $aln_mode.aln_mode_select == 'intraspecies':
        #for $seq in $in_seqs:
            if
            ## gzipped inputs have to be read with zgrep
            #if $seq.fasta.is_of_type('fasta.gz'):
                zgrep
            #else
                grep
            #end if
            "^>" '$seq.fasta' | grep -q "[[:space:]]" ; then
                echo "Error parsing input FASTA." ;
                echo "Pangenome mode fails if there are spaces in the header." ;
                echo "Please remove them with the NormalizeFasta tool." ;
                exit 1 ;
            fi &&
        #end for
    #end if

    ## Set up seqfile

    #if $aln_mode.aln_mode_select == 'interspecies':
        ## The guide tree is the first entry of the seqfile
        cat '$aln_mode.in_tree' >> seqfile.txt &&
    #end if
    ## seq_line collects the (quoted) symlink names for the minigraph call
    #set seq_line = ''
    #for $seq in $in_seqs:
        ## Link each input as <label>.<ext> so the name carries the extension
        #set seq_fn = str($seq.label) + '.' + $seq.fasta.ext
        ln -s '$seq.fasta' '$seq_fn' &&
        printf '%s %s\n' '$seq.label' '$seq_fn' >> seqfile.txt &&
        #set seq_line += "'" + $seq_fn + "' "
    #end for

    ## Run cactus
    ## Cores default to 4 and memory to 8192 MB when Galaxy does not set
    ## GALAXY_SLOTS / GALAXY_MEMORY_MB.

    #if $aln_mode.aln_mode_select == 'intraspecies':
        ## If we're doing a pangenome, we need to run the steps manually
        minigraph -xggs
            -t \${GALAXY_SLOTS:-4}
            $seq_line
            > pangenome.gfa
        &&
        cactus-graphmap
            --maxCores \${GALAXY_SLOTS:-4}
            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
            ./jobStore
            ./seqfile.txt
            pangenome.gfa
            pangenome.paf
            --outputFasta pangenome.gfa.fa
            --binariesMode local
            --workDir ./
        &&
        cactus-align
            --maxCores \${GALAXY_SLOTS:-4}
            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
            ./jobStore
            ./seqfile.txt
            pangenome.paf
            alignment.hal
            --pangenome
            --pafInput
            --binariesMode local
            --workDir ./
    #else if $aln_mode.aln_mode_select == 'interspecies':
        ## Run cactus normally
        cactus
            --maxCores \${GALAXY_SLOTS:-4}
            --maxMemory \${GALAXY_MEMORY_MB:-8192}M
            jobStore seqfile.txt alignment.hal
            --binariesMode local
            --workDir ./
    #end if

]]></command>
<inputs>
    <!-- Alignment mode: between-species mode takes a guide tree,
         within-species (pangenome) mode takes no extra parameters. -->
    <conditional name="aln_mode">
        <param name="aln_mode_select"
               type="select"
               label="Alignment mode"
               help="The taxonomic relationship between input genomes. If genomes are from multiple individuals of the same species, select 'Within-species'">
            <option value="interspecies" selected="true">Between-species</option>
            <option value="intraspecies">Within-species</option>
        </param>
        <when value="interspecies">
            <param name="in_tree"
                   type="data"
                   format="nhx"
                   label="Guide tree"
                   help="Phylogenetic tree in Newick format. Required by Cactus to achieve linear scaling with number of input genomes" />
        </when>
        <when value="intraspecies">
        </when>
    </conditional>
    <!-- One block per genome. An alignment needs at least two genomes, so
         two blocks are always present. -->
    <repeat name="in_seqs" title="Input genome" min="2">
        <!-- The label is written to seqfile.txt as "<label> <file>" and is
             used as a file name, so it must be non-empty and free of
             whitespace. Enforce this up front instead of failing at run time. -->
        <param name="label"
               type="text"
               value=""
               label="Genome Label"
               help="NO SPACES. Must match a label in the guide tree.">
            <validator type="empty_field" message="Every input genome needs a label."/>
            <validator type="regex" message="The genome label must not contain spaces or other whitespace.">^\S+$</validator>
        </param>
        <param name="fasta"
               type="data"
               format="fasta,fasta.gz"
               label="Genome Sequence"
               help="Input genome"/>
    </repeat>
    <!-- TODO: add an option for root -->
    <!-- root mr -->
</inputs>
<outputs>
    <!-- The alignment that the command section writes to alignment.hal
         in the job working directory. -->
    <data name="out_hal"
          format="h5"
          from_work_dir="alignment.hal"
          label="${tool.name} on ${on_string} (HAL file)"/>
</outputs>
<tests>
    <!-- test interspecies mode -->
    <test expect_num_outputs="1">
        <conditional name="aln_mode">
            <param name="aln_mode_select" value="interspecies"/>
            <param name="in_tree" value="test_tree.nhx"/>
        </conditional>
        <repeat name="in_seqs">
            <param name="label" value="simCow_chr6"/>
            <param name="fasta" value="simCow_chr6.fasta"/>
        </repeat>
        <repeat name="in_seqs">
            <param name="label" value="simDog_chr6"/>
            <param name="fasta" value="simDog_chr6.fasta"/>
        </repeat>
        <repeat name="in_seqs">
            <param name="label" value="simHuman_chr6"/>
            <param name="fasta" value="simHuman_chr6.fasta"/>
        </repeat>
        <repeat name="in_seqs">
            <param name="label" value="simMouse_chr6"/>
            <param name="fasta" value="simMouse_chr6.fasta"/>
        </repeat>
        <repeat name="in_seqs">
            <param name="label" value="simRat_chr6"/>
            <param name="fasta" value="simRat_chr6.fasta"/>
        </repeat>
        <output name="out_hal">
            <assert_contents>
                <has_size value="5272354" delta="200000"/>
            </assert_contents>
        </output>
    </test>
    <!-- within-species mode -->
    <test expect_num_outputs="1">
        <conditional name="aln_mode">
            <param name="aln_mode_select" value="intraspecies"/>
        </conditional>
        <repeat name="in_seqs">
            <param name="label" value="simCow_chr6"/>
            <param name="fasta" value="simCow_chr6.fasta"/>
        </repeat>
        <repeat name="in_seqs">
            <param name="label" value="simDog_chr6"/>
            <param name="fasta" value="simDog_chr6.fasta"/>
        </repeat>
        <repeat name="in_seqs">
            <param name="label" value="simHuman_chr6"/>
            <param name="fasta" value="simHuman_chr6.fasta"/>
        </repeat>
        <repeat name="in_seqs">
            <param name="label" value="simMouse_chr6"/>
            <param name="fasta" value="simMouse_chr6.fasta"/>
        </repeat>
        <repeat name="in_seqs">
            <param name="label" value="simRat_chr6"/>
            <param name="fasta" value="simRat_chr6.fasta"/>
        </repeat>
        <output name="out_hal">
            <assert_contents>
                <has_size value="2119332" delta="200000"/>
            </assert_contents>
        </output>
    </test>
    <!-- compressed input -->
    <test expect_num_outputs="1">
        <conditional name="aln_mode">
            <param name="aln_mode_select" value="intraspecies"/>
        </conditional>
        <repeat name="in_seqs">
            <param name="label" value="germ_25"/>
            <param name="fasta" value="germ_25.fasta.gz"/>
        </repeat>
        <repeat name="in_seqs">
            <param name="label" value="vulg_25"/>
            <param name="fasta" value="vulg_25.fasta.gz"/>
        </repeat>
        <repeat name="in_seqs">
            <param name="label" value="pens_25"/>
            <param name="fasta" value="pens_25.fasta.gz"/>
        </repeat>
        <output name="out_hal">
            <assert_contents>
                <has_size value="7204260" delta="200000"/>
            </assert_contents>
        </output>
    </test>
    <!-- FASTA header: whitespace in the headers must abort the job in
         within-species mode. Besides the exit code, check the message the
         header check prints, so that an unrelated failure with exit code 1
         cannot make this test pass. -->
    <test expect_exit_code="1" expect_failure="true">
        <conditional name="aln_mode">
            <param name="aln_mode_select" value="intraspecies"/>
        </conditional>
        <repeat name="in_seqs">
            <param name="label" value="badheader1"/>
            <param name="fasta" value="bh1.fasta.gz"/>
        </repeat>
        <repeat name="in_seqs">
            <param name="label" value="badheader2"/>
            <param name="fasta" value="bh2.fasta.gz"/>
        </repeat>
        <assert_stdout>
            <has_text text="Pangenome mode fails if there are spaces in the header."/>
        </assert_stdout>
    </test>
</tests>
209 <help><![CDATA[
210 **What it does**
211
212 `Cactus <https://github.com/ComparativeGenomicsToolkit/cactus>`__ is a
213 reference-free whole-genome multiple alignment program. It can be used
214 to progressively align a large number of genomes.
215
216 **Usage**
217
218 **Between-species mode**
219
220 If you are aligning genomes from **multiple species**, you need to
221 provide a guide tree in Newick format. Cactus uses the guide tree to
222 progressively align genomes, meaning that it doesn’t need to align all
223 possible pairs of genomes.
224
225 A Newick-formatted tree for human, chimp, gorilla and orangutan genomes
226 looks like this:
227
228 ::
229
230 (((human:0.006,chimp:0.006667):0.0022,gorilla:0.008825):0.0096,orang:0.01831);
231
232 The numbers are the branch lengths.
233
234 **Beta: Within-species mode**
235
236 You can also run Cactus in `pangenome
237 mode <https://github.com/ComparativeGenomicsToolkit/cactus/blob/master/doc/pangenome.md>`__
238 to align genomes of multiple individuals from the **same species**. In
239 this mode you will not use a guide tree. Cactus will use
240 `minigraph <https://github.com/lh3/minigraph>`__ to generate a graph of
241 the input genomes and then use the graph to order the alignments. To use
242 pangenome mode, select ‘Within-species’ in the ‘Alignment mode’
243 dropdown.
244
245 ⚠️ To use pangenome mode, you will have to remove spaces from the headers in your FASTA file.
246 You can do this with the NormalizeFasta tool.
247
248 **Input**
249
250 The developers recommend soft-masking your genomes with RepeatMasker
251 before running Cactus. RepeatMasker is available on Galaxy.
252
253 If you’re using Between-species mode, you need to provide labels for the
254 fasta files that match the leaves on the guide tree. In the example
255 above, you would use the label ‘human’ for the human fasta file.
256
257 **Output**
258
259 The main output of Cactus is in `HAL
260 format <https://github.com/ComparativeGenomicsToolkit/cactus#using-the-output>`__.
261 You can use the `Cactus: export <root?tool_id=cactus_export>`__ tool to
262 convert the Cactus output to a VG or Multiple Alignment Format (MAF)
263 file.
264
265
266 ]]></help>
267 <expand macro="citations"/>
268 </tool>