comparison macros.xml @ 0:b59ae99e47d4 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/drep commit b155a1d533b7317ceb0ec642ffe3e986117df539"
author jjohnson
date Mon, 06 Jan 2020 11:11:06 -0500
parents
children 7e2debc267eb
comparison
equal deleted inserted replaced
-1:000000000000 0:b59ae99e47d4
1 <macros>
2 <!-- Version string substituted wherever @VERSION@ appears; the "requirements" macro below uses it as the drep package version. --> <token name="@VERSION@">2.3.2</token>
3 <!-- Macro "requirements": declares the drep package pinned to @VERSION@; the yield element marks where content supplied by the expanding tool is inserted. --> <xml name="requirements">
4 <requirements>
5 <requirement type="package" version="@VERSION@">drep</requirement>
6 <yield/>
7 </requirements>
8 </xml>
9 <!-- Macro "citations": one DOI citation plus a yield slot for citations added by the expanding tool. --> <xml name="citations">
10 <citations>
11 <citation type="doi">10.1038/ismej.2017.126</citation>
12 <yield />
13 </citations>
14 </xml>
15
16
17 <!-- Macro "genomes": multiple-dataset FASTA input; the command templates below read it as $genomes. --> <xml name="genomes">
18 <param argument="--genomes" type="data" format="fasta" label="genomes fasta files" multiple="true"/>
19 </xml>
20 <!-- Command fragment: symlinks every input genome into the working directory under a sanitised copy of its element identifier (last path component, characters outside word chars, dash, underscore and dot replaced by "_") and collects those names in $genomefiles for @GENOMES@. The append call is wrapped in #silent so its return value is never rendered into the command line. NOTE(review): two inputs whose sanitised names collide would make the second ln fail; confirm whether that can occur. --> <token name="@PREPARE_GENOMES@"><![CDATA[
21 #import re
22 #set $genomefiles = []
23 #for $genome in $genomes
24 #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1]))
25 ln -s '${genome}' '${input_name}' &&
26 #silent $genomefiles.append($input_name)
27 #end for
28 ]]></token>
29 <!-- Command fragment: emits -g followed by each quoted name in $genomefiles. $genomefiles is not defined here; it is built by @PREPARE_GENOMES@, which therefore has to appear earlier in the same command. --> <token name="@GENOMES@"><![CDATA[
30 -g
31 #for $genomefile in $genomefiles
32 '${genomefile}'
33 #end for
34 ]]></token>
35
36
37 <!-- Macro "checkm_method": optional select between the lineage_wf and taxonomy_wf checkM methods; neither option is preselected. --> <xml name="checkm_method">
38 <param argument="--checkM_method" type="select" label="checkm method" optional="true">
39 <option value="lineage_wf">lineage_wf (more accurate)</option>
40 <option value="taxonomy_wf">taxonomy_wf (faster)</option>
41 </param>
42 </xml>
43 <!-- Command fragment: passes the checkM_method flag only when $checkM_method evaluates true, i.e. when a method was selected. --> <token name="@CHECKM_METHOD@"><![CDATA[
44 #if $checkM_method:
45 --checkM_method $checkM_method
46 #end if
47 ]]></token>
48
49 <!-- Macro "filtering_options": conditional "filter" exposing length / completeness / contamination thresholds and a nested conditional "quality" that selects where genome quality comes from (run checkM, user-supplied genomeInfo CSV, or ignore quality). The nested test parameter is declared with name="source" because it is a UI switch, not a command-line flag; the genomeInfo input is labelled as a CSV file. --> <xml name="filtering_options">
50 <conditional name="filter">
51 <param name="set_options" type="select" label="set filtering options">
52 <option value="yes">Yes</option>
53 <option value="no" selected="true">No</option>
54 </param>
55 <when value="yes">
56 <param argument="--length" type="integer" value="50000" label="Minimum genome length"/>
57 <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/>
58 <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/>
59
60 <conditional name="quality">
61 <param name="source" type="select" label="genome quality">
62 <help>
63 --ignoreGenomeQuality is useful with
64 bacteriophages or eukaryotes or things where checkM
65 scoring does not work. Will only choose genomes based
66 on length and N50.
67 </help>
68 <option value="checkm" selected="true">Run checkM</option>
69 <option value="genomeInfo">User supplied genomeInfo csv file</option>
70 <option value="ignoreGenomeQuality">--ignoreGenomeQuality (NOT RECOMMENDED!)</option>
71 </param>
72 <when value="checkm">
73 <param argument="--checkM_method" type="select" label="checkm method" optional="true">
74 <option value="lineage_wf">lineage_wf (more accurate)</option>
75 <option value="taxonomy_wf">taxonomy_wf (faster)</option>
76 </param>
77 </when>
78 <when value="genomeInfo">
79 <param argument="--genomeInfo" type="data" format="csv" label="genomeInfo csv file">
80 <help><![CDATA[
81 A CSV dataset that must contain: [
82 "genome"(history dataset name of .fasta dataset of that genome),
83 "completeness"(0-100 value for completeness of the genome),
84 "contamination"(0-100 value of the contamination of the genome)]
85 ]]></help>
86 </param>
87 </when>
88 <when value="ignoreGenomeQuality"/>
89 </conditional>
90
91 </when>
92 <when value="no"/>
93 </conditional>
94 </xml>
95 <!-- Command fragment for the "filter" conditional. When set_options is 'yes' it passes length, completeness and contamination, then exactly one quality branch. The checkM branch is taken only when a method was actually selected: the select is optional, and an unselected value would otherwise be rendered as the literal text None. The genomeInfo dataset path is single-quoted so it reaches the tool as one shell word. --> <token name="@FILTER_OPTIONS@"><![CDATA[
96 #if $filter.set_options == 'yes':
97 --length $filter.length
98 --completeness $filter.completeness
99 --contamination $filter.contamination
100 #if $filter.quality.source == 'checkm' and $filter.quality.checkM_method
101 --checkM_method $filter.quality.checkM_method
102 #elif $filter.quality.source == 'genomeInfo'
103 --genomeInfo '$filter.quality.genomeInfo'
104 #elif $filter.quality.source == 'ignoreGenomeQuality'
105 --ignoreGenomeQuality
106 #end if
107 #end if
108 ]]></token>
109
110 <!-- Macro "genome_comparison_options": conditional "genome_comparison" exposing the MASH sketch size, the secondary clustering algorithm and the nucmer preset. --> <xml name="genome_comparison_options">
111 <conditional name="genome_comparison">
112 <param name="set_options" type="select" label="set genome comparison options">
113 <option value="yes">Yes</option>
114 <option value="no" selected="true">No</option>
115 </param>
116 <when value="yes">
117 <param argument="--MASH_sketch" type="integer" value="1000" label="MASH sketch size"/>
118 <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comparisons">
119 <option value="ANImf" selected="true">ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions</option>
120 <option value="ANIn">ANIn = Align whole genomes with nucmer; compare aligned regions</option>
121 <option value="gANI">gANI = Identify and align ORFs; compare aligned ORFS</option>
122 </param>
123 <param argument="-n_PRESET" type="select" label="Presets to pass to nucmer">
124 <option value="normal" selected="true">normal = default ANIn parameters (default: normal)</option>
125 <option value="tight">tight = only align highly conserved regions</option>
126 </param>
127 </when>
128 <when value="no"/>
129 </conditional>
130 </xml>
131 <!-- Command fragment: when genome_comparison.set_options is 'yes', passes MASH_sketch, S_algorithm and n_PRESET from that conditional; otherwise emits nothing. --> <token name="@GENOME_COMPARISON_OPTIONS@"><![CDATA[
132 #if $genome_comparison.set_options == 'yes':
133 --MASH_sketch $genome_comparison.MASH_sketch
134 --S_algorithm $genome_comparison.S_algorithm
135 -n_PRESET $genome_comparison.n_PRESET
136 #end if
137 ]]></token>
138
139 <!-- Macro "clustering_options": conditional "clustering" exposing the primary/secondary ANI thresholds, the two skip switches, the coverage threshold and method, and the linkage algorithm (only "average" is offered here). --> <xml name="clustering_options">
140 <conditional name="clustering">
141 <param name="set_options" type="select" label="set clustering options">
142 <option value="yes">Yes</option>
143 <option value="no" selected="true">No</option>
144 </param>
145 <when value="yes">
146 <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary (MASH) clusters"/>
147 <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/>
148
149 <param argument="--SkipMash" type="boolean" truevalue="--SkipMash" falsevalue="" checked="false" label="Skip MASH clustering, just do secondary clustering on all genomes"/>
150 <param argument="--SkipSecondary" type="boolean" truevalue="--SkipSecondary" falsevalue="" checked="false" label="Skip secondary clustering, just perform MASH clustering"/>
151 <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minimum level of overlap between genomes when doing secondary comparisons"/>
152 <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment">
153 <help>(for ANIn/ANImf only; gANI can only do larger method)</help>
154 <option value="larger" selected="true">larger = max((aligned length / genome 1), (aligned_length / genome2))</option>
155 <option value="total">total = 2*(aligned length) / (sum of total genome lengths)</option>
156 </param>
157 <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes">
158 <help>(passed to scipy.cluster.hierarchy.linkage)</help>
159 <option value="average" selected="true">average</option>
160 </param>
161 </when>
162 <when value="no"/>
163 </conditional>
164 </xml>
165 <!-- Command fragment: when clustering.set_options is 'yes', passes the clustering parameters. SkipMash and SkipSecondary are booleans whose truevalue is the flag itself and whose falsevalue is empty, so they are written without a flag prefix. --> <token name="@CLUSTERING_OPTIONS@"><![CDATA[
166 #if $clustering.set_options == 'yes':
167 --P_ani $clustering.P_ani
168 --S_ani $clustering.S_ani
169 $clustering.SkipMash
170 $clustering.SkipSecondary
171 --cov_thresh $clustering.cov_thresh
172 --coverage_method $clustering.coverage_method
173 --clusterAlg $clustering.clusterAlg
174 #end if
175 ]]></token>
176
177 <!-- Macro "scoring_options": conditional "scoring" exposing the five weights (A to E) of the scoring formula quoted in the completeness_weight help. --> <xml name="scoring_options">
178 <conditional name="scoring">
179 <param name="set_options" type="select" label="set scoring options">
180 <option value="yes">Yes</option>
181 <option value="no" selected="true">No</option>
182 </param>
183 <when value="yes">
184 <param argument="--completeness_weight" type="float" value="1" label="completeness weight">
185 <help>
186 Based off of the formula:
187 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
188 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight;
189 </help>
190 </param>
191 <param argument="--contamination_weight" type="float" value="5" label="contamination weight"/>
192 <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="strain heterogeneity weight"/>
193 <param argument="--N50_weight" type="float" value=".5" label="weight of log(genome N50)"/>
194 <param argument="--size_weight" type="float" value="0" label="weight of log(genome size)"/>
195 </when>
196 <when value="no"/>
197 </conditional>
198 </xml>
199 <!-- Command fragment: when scoring.set_options is 'yes', passes the five scoring weights from the "scoring" conditional; otherwise emits nothing. --> <token name="@SCORING_OPTIONS@"><![CDATA[
200 #if $scoring.set_options == 'yes':
201 --completeness_weight $scoring.completeness_weight
202 --contamination_weight $scoring.contamination_weight
203 --strain_heterogeneity_weight $scoring.strain_heterogeneity_weight
204 --N50_weight $scoring.N50_weight
205 --size_weight $scoring.size_weight
206 #end if
207 ]]></token>
208
209 <!-- Macro "taxonomy_options": conditional "taxonomy" exposing the taxonomy method, the minimum percent used by the percent method, and the centrifuge index dataset. The index input declares format="data" (any datatype) explicitly instead of an empty format list. NOTE(review): confirm which datatype a centrifuge index should have on the target Galaxy instance. --> <xml name="taxonomy_options">
210 <conditional name="taxonomy">
211 <param name="set_options" type="select" label="generate taxonomy information">
212 <option value="yes">Yes</option>
213 <option value="no" selected="true">No</option>
214 </param>
215 <when value="yes">
216 <param argument="--tax_method" type="select" label="Method of determining taxonomy">
217 <help>(the percent method uses the minimum percent set below)</help>
218 <option value="percent" selected="true">percent = The most descriptive taxonomic level with at least (per) hits</option>
219 <option value="max">max = The centrifuge taxonomic level with the most overall hits</option>
220 </param>
221 <param argument="--percent" type="float" value="50" min="0" max="100" label="minimum percent for percent method"/>
222 <param argument="--cent_index" type="data" format="data" label="centrifuge index"/>
223 </when>
224 <when value="no"/>
225 </conditional>
226 </xml>
227 <!-- Command fragment: when taxonomy.set_options is 'yes', enables run_tax and passes the method, percent and centrifuge index; otherwise emits nothing. The index dataset path is single-quoted so it reaches the tool as one shell word. --> <token name="@TAXONOMY_OPTIONS@"><![CDATA[
228 #if $taxonomy.set_options == 'yes':
229 --run_tax
230 --tax_method $taxonomy.tax_method
231 --percent $taxonomy.percent
232 --cent_index '$taxonomy.cent_index'
233 #end if
234 ]]></token>
235
236 <!-- Macro "warning_options": conditional "warning" exposing three thresholds (warn_dist, warn_sim, warn_aln), each a float limited to the range 0 to 1. --> <xml name="warning_options">
237 <conditional name="warning">
238 <param name="set_options" type="select" label="set warning options">
239 <option value="yes">Yes</option>
240 <option value="no" selected="true">No</option>
241 </param>
242 <when value="yes">
243 <param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/>
244 <param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/>
245 <param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/>
246 </when>
247 <when value="no"/>
248 </conditional>
249 </xml>
250 <!-- Command fragment: when warning.set_options is 'yes', passes warn_dist, warn_sim and warn_aln from the "warning" conditional; otherwise emits nothing. --> <token name="@WARNING_OPTIONS@"><![CDATA[
251 #if $warning.set_options == 'yes':
252 --warn_dist $warning.warn_dist
253 --warn_sim $warning.warn_sim
254 --warn_aln $warning.warn_aln
255 #end if
256 ]]></token>
257
258 <!-- Macro "select_outputs": empty as written, so expanding it contributes nothing. --> <xml name="select_outputs">
259 </xml>
260
261 <!-- Macro "common_outputs": the log, warnings and figure outputs, each collected from a fixed path under outdir/ in the job working directory. The warnings dataset carries its own "Warnings" label so it can be told apart from the log dataset in the history. --> <xml name="common_outputs">
262 <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log"/>
263 <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt"/>
264 <data name="Primary_clustering_dendrogram" format="pdf" label="${tool.name} on ${on_string}: Primary_clustering_dendrogram.pdf" from_work_dir="outdir/figures/Primary_clustering_dendrogram.pdf"/>
265 <data name="Secondary_clustering_dendrograms" format="pdf" label="${tool.name} on ${on_string}: Secondary_clustering_dendrograms.pdf" from_work_dir="outdir/figures/Secondary_clustering_dendrograms.pdf"/>
266 <data name="Secondary_clustering_MDS" format="pdf" label="${tool.name} on ${on_string}: Secondary_clustering_MDS.pdf" from_work_dir="outdir/figures/Secondary_clustering_MDS.pdf"/>
267 <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf"/>
268 </xml>
269 <!-- Macro "common_outputs2": empty as written, so expanding it contributes nothing. --> <xml name="common_outputs2">
270 </xml>
271
272 <!-- Help text fragment describing the -g / genomes input argument. --> <token name="@GENOMES_HELP@"><![CDATA[
273 I/O PARAMETERS:
274 -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]]
275 genomes to cluster in .fasta format (default: None)
276 ]]></token>
277
278 <!-- Help text fragment for the filtering options (length, completeness, contamination, ignoreGenomeQuality). --> <token name="@FILTERING_HELP@"><![CDATA[
279 FILTERING OPTIONS:
280 -l LENGTH, --length LENGTH
281 Minimum genome length (default: 50000)
282 -comp COMPLETENESS, --completeness COMPLETENESS
283 Minimum genome completeness (default: 75)
284 -con CONTAMINATION, --contamination CONTAMINATION
285 Maximum genome contamination (default: 25)
286 --ignoreGenomeQuality
287 Don't run checkM or do any quality filtering. NOT
288 RECOMMENDED! This is useful for use with
289 bacteriophages or eukaryotes or things where checkM
290 scoring does not work. Will only choose genomes based
291 on length and N50 (default: False)
292
293
294 ]]></token>
295
296 <!-- Help text fragment for the genome comparison parameters (MASH sketch size, secondary algorithm, nucmer preset). --> <token name="@GENOME_COMPARISON_HELP@"><![CDATA[
297 GENOME COMPARISON PARAMETERS:
298 -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH
299 MASH sketch size (default: 1000)
300 --S_algorithm {goANI,ANIn,ANImf,gANI}
301 Algorithm for secondary clustering comparisons:
302 ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions
303 ANIn = Align whole genomes with nucmer; compare aligned regions
304 gANI = Identify and align ORFs; compare aligned ORFS
305 (default: ANImf)
306 -n_PRESET {normal,tight}
307 Presets to pass to nucmer
308 tight = only align highly conserved regions
309 normal = default ANIn parameters (default: normal)
310
311 ]]></token>
312
313 <!-- Help text fragment for the clustering parameters (ANI thresholds, skip switches, coverage threshold and method, linkage algorithm). --> <token name="@CLUSTERING_HELP@"><![CDATA[
314 CLUSTERING PARAMETERS:
315 -pa P_ANI, --P_ani P_ANI
316 ANI threshold to form primary (MASH) clusters
317 (default: 0.9)
318 -sa S_ANI, --S_ani S_ANI
319 ANI threshold to form secondary clusters (default:
320 0.99)
321 --SkipMash Skip MASH clustering, just do secondary clustering on
322 all genomes (default: False)
323 --SkipSecondary Skip secondary clustering, just perform MASH
324 clustering (default: False)
325 -nc COV_THRESH, --cov_thresh COV_THRESH
326 Minimum level of overlap between genomes when doing
327 secondary comparisons (default: 0.1)
328 -cm {total,larger}, --coverage_method {total,larger}
329 Method to calculate coverage of an alignment
330 (for ANIn/ANImf only; gANI can only do larger method)
331 total = 2*(aligned length) / (sum of total genome lengths)
332 larger = max((aligned length / genome 1), (aligned_length / genome2))
333 (default: larger)
334 --clusterAlg CLUSTERALG
335 Algorithm used to cluster genomes (passed to
336 scipy.cluster.hierarchy.linkage) (default: average)
337
338 ]]></token>
339
340 <!-- Help text fragment giving the scoring formula and the five weight arguments (A to E) that feed it. --> <token name="@SCORING_HELP@"><![CDATA[
341 SCORING CRITERIA
342 Based off of the formula:
343 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
344
345 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight:
346 -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT
347 completeness weight (default: 1)
348 -conW CONTAMINATION_WEIGHT, --contamination_weight CONTAMINATION_WEIGHT
349 contamination weight (default: 5)
350 -strW STRAIN_HETEROGENEITY_WEIGHT, --strain_heterogeneity_weight STRAIN_HETEROGENEITY_WEIGHT
351 strain heterogeneity weight (default: 1)
352 -N50W N50_WEIGHT, --N50_weight N50_WEIGHT
353 weight of log(genome N50) (default: 0.5)
354 -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT
355 weight of log(genome size) (default: 0)
356
357 ]]></token>
358
359 <!-- Help text fragment for the taxonomy arguments (run_tax, tax_method, percent, cent_index). --> <token name="@TAXONOMY_HELP@"><![CDATA[
360 TAXONOMY:
361 --run_tax generate taxonomy information (Tdb) (default: False)
362 --tax_method {percent,max}
363 Method of determining taxonomy
364 percent = The most descriptive taxonomic level with at least (per) hits
365 max = The centrifuge taxonomic level with the most overall hits (default: percent)
366 -per PERCENT, --percent PERCENT
367 minimum percent for percent method (default: 50)
368 --cent_index CENT_INDEX
369 path to centrifuge index (for example,
370 /home/mattolm/download/centrifuge/indices/b+h+v)
371 (default: None)
372
373 ]]></token>
374
375 <!-- Help text fragment for the three warning thresholds (warn_dist, warn_sim, warn_aln). --> <token name="@WARNINGS_HELP@"><![CDATA[
376 WARNINGS:
377 --warn_dist WARN_DIST
378 How far from the threshold to throw cluster warnings
379 (default: 0.25)
380 --warn_sim WARN_SIM Similarity threshold for warnings between dereplicated
381 genomes (default: 0.98)
382 --warn_aln WARN_ALN Minimum aligned fraction for warnings between
383 dereplicated genomes (ANIn) (default: 0.25)
384
385 ]]></token>
386
387
388 </macros>