comparison macros.xml @ 0:8dfcdbeaeed8 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/drep commit 8fa5ff35b45c2b046c7f4800410cf39cb89a299a"
author iuc
date Tue, 05 May 2020 06:12:47 -0400
parents
children ef7cd2e7bc05
comparison
equal deleted inserted replaced
-1:000000000000 0:8dfcdbeaeed8
1 <macros>
2 <!-- dRep release targeted by these macros; the requirements macro reuses it as the package version. --> <token name="@VERSION@">2.5.4</token>
3 <!-- Macro "requirements": declares the drep package at the pinned version; expanding tools can add more requirements through the yield. --> <xml name="requirements">
4 <requirements>
5 <requirement version="@VERSION@" type="package">drep</requirement>
6 <yield />
7 </requirements>
8 </xml>
9 <!-- Macro "citations": one DOI citation, with a yield so expanding tools can append their own. --> <xml name="citations">
10 <citations>
11 <citation type="doi">10.1038/ismej.2017.126</citation>
12 <yield/>
13 </citations>
14 </xml>
15
16
17 <!-- Macro "genomes": multi-select FASTA dataset input exposed to templates as $genomes. --> <xml name="genomes">
18 <param argument="--genomes" type="data" multiple="true" format="fasta" label="genomes fasta files"/>
19 </xml>
20 <!-- Command prologue: symlinks each selected genome under a name built from its element identifier (characters outside word chars, dash, underscore, dot become underscores) and collects the names in $genomefiles. --> <token name="@PREPARE_GENOMES@"><![CDATA[
21 #import re
22 #set $genomefiles = []
23 #for $genome in $genomes
24 #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1]))
25 ln -s '$genome' '$input_name' &&
26 #silent $genomefiles.append($input_name)
27 #end for
28 ]]></token>
29 <!-- Emits the -g flag followed by every quoted name held in $genomefiles (a list this token expects to exist already). --> <token name="@GENOMES@"><![CDATA[
30 -g
31 #for $genomefile in $genomefiles
32 '$genomefile'
33 #end for
34 ]]></token>
35
36
37 <!-- Macro "checkm_method": optional select between the two checkM workflows. --> <xml name="checkm_method">
38 <param argument="--checkM_method" type="select" optional="true" label="checkm method">
39 <option value="taxonomy_wf">taxonomy_wf (faster)</option>
40 <option value="lineage_wf">lineage_wf (more accurate)</option>
41 </param>
42 </xml>
43 <!-- Emits the checkM_method flag only when a workflow was actually selected. --> <token name="@CHECKM_METHOD@"><![CDATA[
44 #if $checkM_method
45 --checkM_method $checkM_method
46 #end if
47 ]]></token>
48
49 <!-- Macro "filtering_options": conditional "filter" holding length/completeness/contamination limits plus a nested "quality" conditional that chooses checkM, a user genomeInfo CSV, or no quality filtering. --> <xml name="filtering_options">
50 <conditional name="filter">
51 <param name="set_options" type="select" label="set filtering options">
52 <option value="yes">Yes</option>
53 <option value="no" selected="true">No (use --checkM_method taxonomy_wf)</option>
54 </param>
55 <when value="yes">
56 <param argument="--length" type="integer" value="50000" label="Minimum genome length"/>
57 <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/>
58 <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/>
59
60 <conditional name="quality">
61 <param name="source" type="select" label="genome quality">
62 <help>
63 --ignoreGenomeQuality is useful with
64 bacteriophages or eukaryotes or things where checkM
65 scoring does not work. Will only choose genomes based
66 on length and N50.
67 </help>
68 <option value="checkm" selected="true">Run checkM</option>
69 <option value="genomeInfo">User supplied genomeInfo csv file</option>
70 <option value="ignoreGenomeQuality">--ignoreGenomeQuality (NOT RECOMMENDED!)</option>
71 </param>
72 <when value="checkm">
73 <param argument="--checkM_method" type="select" label="checkm method" optional="true">
74 <help>
75 Using the checkm method of lineage_wf can require more than 40Gb of RAM.
76 </help>
77 <option value="taxonomy_wf">taxonomy_wf (faster)</option>
78 <option value="lineage_wf">lineage_wf (more accurate)</option>
79 </param>
80 </when>
81 <when value="genomeInfo">
82 <param argument="--genomeInfo" type="data" format="csv" label="genome information CSV file">
83 <help><![CDATA[
84 A CSV dataset that must contain: [
85 "genome"(history dataset name of .fasta dataset of that genome),
86 "completeness"(0-100 value for completeness of the genome),
87 "contamination"(0-100 value of the contamination of the genome)]
88 ]]></help>
89 </param>
90 </when>
91 <when value="ignoreGenomeQuality"/>
92 </conditional>
93 </when>
94 <when value="no"/>
95 </conditional>
96 </xml>
97 <!-- Renders the filtering flags. With custom options: the three thresholds plus one quality branch; the checkM method flag is skipped when the optional select is unset so the literal None never reaches the command line, and the genomeInfo path is quoted. Without custom options: checkM taxonomy_wf. --> <token name="@FILTER_OPTIONS@"><![CDATA[
98 #if $filter.set_options == 'yes':
99 --length $filter.length
100 --completeness $filter.completeness
101 --contamination $filter.contamination
102 #if $filter.quality.source == 'checkm' and $filter.quality.checkM_method
103 --checkM_method $filter.quality.checkM_method
104 #elif $filter.quality.source == 'genomeInfo'
105 --genomeInfo '$filter.quality.genomeInfo'
106 #elif $filter.quality.source == 'ignoreGenomeQuality'
107 --ignoreGenomeQuality
108 #end if
109 #else
110 --checkM_method taxonomy_wf
111 #end if
112 ]]></token>
113
114 <!-- Macro "genome_comparison_options": conditional "genome_comparison" exposing MASH sketch size, the secondary clustering algorithm and the nucmer preset. --> <xml name="genome_comparison_options">
115 <conditional name="genome_comparison">
116 <param name="set_options" type="select" label="set genome comparison options">
117 <option value="yes">Yes</option>
118 <option value="no" selected="true">No</option>
119 </param>
120 <when value="yes">
121 <param argument="--MASH_sketch" type="integer" value="1000" label="MASH sketch size"/>
122 <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comparisons">
123 <option value="ANImf" selected="true">ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions</option>
124 <option value="ANIn">ANIn = Align whole genomes with nucmer; compare aligned regions</option>
125 <option value="gANI">gANI = Identify and align ORFs; compare aligned ORFS</option>
126 </param>
127 <param argument="-n_PRESET" type="select" label="Presets to pass to nucmer">
128 <option value="normal" selected="true">normal = default ANIn parameters (default: normal)</option>
129 <option value="tight">tight = only align highly conserved regions</option>
130 </param>
131 </when>
132 <when value="no"/>
133 </conditional>
134 </xml>
135 <!-- Renders the three genome comparison flags when the user opted into custom settings; otherwise renders nothing. --> <token name="@GENOME_COMPARISON_OPTIONS@"><![CDATA[
136 #if $genome_comparison.set_options == 'yes'
137 --MASH_sketch $genome_comparison.MASH_sketch
138 --S_algorithm $genome_comparison.S_algorithm
139 -n_PRESET $genome_comparison.n_PRESET
140 #end if
141 ]]></token>
142
143 <!-- Macro "clustering_options": conditional "clustering" exposing the ANI thresholds, the two skip switches, the coverage threshold and method, and the linkage algorithm. --> <xml name="clustering_options">
144 <conditional name="clustering">
145 <param name="set_options" type="select" label="set clustering options">
146 <option value="yes">Yes</option>
147 <option value="no" selected="true">No</option>
148 </param>
149 <when value="yes">
150 <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary (MASH) clusters"/>
151 <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/>
152
153 <param argument="--SkipMash" type="boolean" truevalue="--SkipMash" falsevalue="" checked="false" label="Skip MASH clustering, just do secondary clustering on all genomes"/>
154 <param argument="--SkipSecondary" type="boolean" truevalue="--SkipSecondary" falsevalue="" checked="false" label="Skip secondary clustering, just perform MASH clustering"/>
155 <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minimum level of overlap between genomes when doing secondary comparisons"/>
156 <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment">
157 <help>(for ANIn/ANImf only; gANI can only do larger method)</help>
158 <option value="larger" selected="true">larger = max((aligned length / genome 1), (aligned_length / genome2))</option>
159 <option value="total">total = 2*(aligned length) / (sum of total genome lengths)</option>
160 </param>
161 <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes">
162 <help>(passed to scipy.cluster.hierarchy.linkage)</help>
163 <option value="average" selected="true">average</option>
164 </param>
165 </when>
166 <when value="no"/>
167 </conditional>
168 </xml>
169 <!-- Renders the clustering flags when custom settings were requested; each boolean placeholder expands to its flag or to an empty string. --> <token name="@CLUSTERING_OPTIONS@"><![CDATA[
170 #if $clustering.set_options == 'yes'
171 --P_ani $clustering.P_ani
172 --S_ani $clustering.S_ani
173 $clustering.SkipMash
174 $clustering.SkipSecondary
175 --cov_thresh $clustering.cov_thresh
176 --coverage_method $clustering.coverage_method
177 --clusterAlg $clustering.clusterAlg
178 #end if
179 ]]></token>
180
181 <!-- Macro "scoring_options": conditional "scoring" exposing the five weights used in the genome scoring formula quoted in the help text. --> <xml name="scoring_options">
182 <conditional name="scoring">
183 <param name="set_options" type="select" label="set scoring options">
184 <option value="yes">Yes</option>
185 <option selected="true" value="no">No</option>
186 </param>
187 <when value="yes">
188 <param argument="--completeness_weight" type="float" label="completeness weight" value="1">
189 <help>
190 Based off of the formula:
191 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
192 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight;
193 </help>
194 </param>
195 <param argument="--contamination_weight" type="float" label="contamination weight" value="5"/>
196 <param argument="--strain_heterogeneity_weight" type="float" label="strain heterogeneity weight" value="1" min="0." max="1."/>
197 <param argument="--N50_weight" type="float" label="weight of log(genome N50)" value=".5"/>
198 <param argument="--size_weight" type="float" label="weight of log(genome size)" value="0"/>
199 </when>
200 <when value="no"/>
201 </conditional>
202 </xml>
203 <!-- Renders the five scoring weight flags when custom settings were requested; otherwise renders nothing. --> <token name="@SCORING_OPTIONS@"><![CDATA[
204 #if $scoring.set_options == 'yes'
205 --completeness_weight $scoring.completeness_weight
206 --contamination_weight $scoring.contamination_weight
207 --strain_heterogeneity_weight $scoring.strain_heterogeneity_weight
208 --N50_weight $scoring.N50_weight
209 --size_weight $scoring.size_weight
210 #end if
211 ]]></token>
212
213 <!-- Macro "taxonomy_options": conditional "taxonomy" exposing the taxonomy method, its minimum percent and the centrifuge index dataset. NOTE(review): cent_index declares an empty format attribute; confirm which datatype was intended. --> <xml name="taxonomy_options">
214 <conditional name="taxonomy">
215 <param name="set_options" type="select" label="generate taxonomy information">
216 <option value="yes">Yes</option>
217 <option value="no" selected="true">No</option>
218 </param>
219 <when value="yes">
220 <param argument="--tax_method" type="select" label="Method of determining taxonomy">
221 <help>(the percent method uses the minimum percent set below)</help>
222 <option value="percent" selected="true">percent = The most descriptive taxonomic level with at least (per) hits</option>
223 <option value="max">max = The centrifuge taxonomic level with the most overall hits</option>
224 </param>
225 <param argument="--percent" type="float" value="50" min="0" max="100" label="minimum percent for percent method"/>
226 <param argument="--cent_index" type="data" format="" label="centrifuge index"/>
227 </when>
228 <when value="no"/>
229 </conditional>
230 </xml>
231 <!-- Renders the taxonomy flags when taxonomy generation was requested; the centrifuge index dataset path is single-quoted so the shell treats it as one argument. --> <token name="@TAXONOMY_OPTIONS@"><![CDATA[
232 #if $taxonomy.set_options == 'yes':
233 --run_tax
234 --tax_method $taxonomy.tax_method
235 --percent $taxonomy.percent
236 --cent_index '$taxonomy.cent_index'
237 #end if
238 ]]></token>
239
240 <!-- Macro "warning_options": conditional "warning" exposing the three warning thresholds, each a float limited to the range 0 to 1. --> <xml name="warning_options">
241 <conditional name="warning">
242 <param name="set_options" type="select" label="set warning options">
243 <option value="yes">Yes</option>
244 <option selected="true" value="no">No</option>
245 </param>
246 <when value="yes">
247 <param argument="--warn_dist" type="float" label="How far from the threshold to throw cluster warnings" value="0.25" min="0" max="1"/>
248 <param argument="--warn_sim" type="float" label="Similarity threshold for warnings between dereplicated genomes" value="0.98" min="0" max="1"/>
249 <param argument="--warn_aln" type="float" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)" value="0.25" min="0" max="1"/>
250 </when>
251 <when value="no"/>
252 </conditional>
253 </xml>
254 <!-- Renders the three warning threshold flags when custom settings were requested; otherwise renders nothing. --> <token name="@WARNING_OPTIONS@"><![CDATA[
255 #if $warning.set_options == 'yes'
256 --warn_dist $warning.warn_dist
257 --warn_sim $warning.warn_sim
258 --warn_aln $warning.warn_aln
259 #end if
260 ]]></token>
261
262 <!-- Macro "select_outputs": required multi-select naming the output datasets to keep; the yield lets expanding macros add further options. --> <xml name="select_outputs">
263 <param name="select_outputs" type="select" label="Select outputs" multiple="true" optional="false">
264 <option selected="true" value="log">log</option>
265 <option selected="true" value="warnings">Warnings</option>
266 <option selected="true" value="Primary_clustering_dendrogram">Primary_clustering_dendrogram.pdf</option>
267 <option value="Secondary_clustering_dendrograms">Secondary_clustering_dendrograms.pdf</option>
268 <option value="Secondary_clustering_MDS">Secondary_clustering_MDS.pdf</option>
269 <option selected="true" value="Clustering_scatterplots">Clustering_scatterplots.pdf</option>
270 <yield />
271 </param>
272 </xml>
273 <!-- Macro "select_drep_outputs": the shared output selector extended with four additional choices (two figures, two tables). --> <xml name="select_drep_outputs">
274 <expand macro="select_outputs">
275 <option value="Cluster_scoring">Cluster_scoring.pdf</option>
276 <option value="Winning_genomes">Winning_genomes.pdf</option>
277 <option value="Widb">Widb.csv</option>
278 <option value="Chdb">Chdb.tsv</option>
279 </expand>
280 </xml>
281
282 <!-- Macro "common_outputs": log, warnings and four figure datasets collected from outdir; each is kept only when its name is in select_outputs (the log is also kept when nothing is selected). --> <xml name="common_outputs">
283 <data name="log" format="txt" from_work_dir="outdir/log/logger.log" label="${tool.name} on ${on_string}: Log">
284 <filter>'log' in select_outputs or not select_outputs</filter>
285 </data>
286 <data name="warnings" format="txt" from_work_dir="outdir/log/warnings.txt" label="${tool.name} on ${on_string}: Warnings">
287 <filter>'warnings' in select_outputs</filter>
288 </data>
289 <data name="Primary_clustering_dendrogram" format="pdf" from_work_dir="outdir/figures/Primary_clustering_dendrogram.pdf" label="${tool.name} on ${on_string}: Primary_clustering_dendrogram.pdf">
290 <filter>'Primary_clustering_dendrogram' in select_outputs</filter>
291 </data>
292 <data name="Secondary_clustering_dendrograms" format="pdf" from_work_dir="outdir/figures/Secondary_clustering_dendrograms.pdf" label="${tool.name} on ${on_string}: Secondary_clustering_dendrograms.pdf">
293 <filter>'Secondary_clustering_dendrograms' in select_outputs</filter>
294 </data>
295 <data name="Secondary_clustering_MDS" format="pdf" from_work_dir="outdir/figures/Secondary_clustering_MDS.pdf" label="${tool.name} on ${on_string}: Secondary_clustering_MDS.pdf">
296 <filter>'Secondary_clustering_MDS' in select_outputs</filter>
297 </data>
298 <data name="Clustering_scatterplots" format="pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf">
299 <filter>'Clustering_scatterplots' in select_outputs</filter>
300 </data>
301 </xml>
302
303
304 <!-- Macro "drep_outputs": the common outputs plus two figures and two tables, each kept only when its name is in select_outputs. --> <xml name="drep_outputs">
305 <expand macro="common_outputs" />
306 <data name="Cluster_scoring" format="pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf">
307 <filter>'Cluster_scoring' in select_outputs</filter>
308 </data>
309 <data name="Winning_genomes" format="pdf" from_work_dir="outdir/figures/Winning_genomes.pdf" label="${tool.name} on ${on_string}: Winning_genomes.pdf">
310 <filter>'Winning_genomes' in select_outputs</filter>
311 </data>
312 <data name="Widb" format="csv" from_work_dir="outdir/data_tables/Widb.csv" label="${tool.name} on ${on_string}: Widb.csv">
313 <filter>'Widb' in select_outputs</filter>
314 </data>
315 <data name="Chdb" format="tabular" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv" label="${tool.name} on ${on_string}: Chdb.tsv">
316 <filter>'Chdb' in select_outputs</filter>
317 </data>
318 </xml>
319
320
321 <!-- Macro "test_defaults_log": test case that runs three Enterococcus FASTA inputs with default settings; the expanding tool supplies the log assertions through the yield. --> <xml name="test_defaults_log">
322 <test>
323 <param name="genomes" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa" ftype="fasta"/>
324 <output name="log">
325 <assert_contents>
326 <yield />
327 </assert_contents>
328 </output>
329 </test>
330 </xml>
331
332 <!-- Help text fragment describing the genomes input flag. --> <token name="@GENOMES_HELP@"><![CDATA[
333 I/O PARAMETERS:
334 -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]]
335 genomes to cluster in .fasta format
336 (default: None)
337
338
339 ]]></token>
340
341 <!-- Help text fragment describing the length, completeness, contamination and ignoreGenomeQuality filtering flags. --> <token name="@FILTERING_HELP@"><![CDATA[
342 FILTERING OPTIONS:
343 -l LENGTH, --length LENGTH
344 Minimum genome length
345 (default: 50000)
346
347
348 -comp COMPLETENESS, --completeness COMPLETENESS
349 Minimum genome completeness
350 (default: 75)
351
352
353 -con CONTAMINATION, --contamination CONTAMINATION
354 Maximum genome contamination
355 (default: 25)
356
357
358 --ignoreGenomeQuality
359 Don't run checkM or do any quality filtering. NOT
360 RECOMMENDED! This is useful for use with
361 bacteriophages or eukaryotes or things where checkM
362 scoring does not work. Will only choose genomes based
363 on length and N50 (default: False)
364
365
366 ]]></token>
367
368 <!-- Help text fragment describing the MASH sketch size, secondary clustering algorithm and nucmer preset flags. --> <token name="@GENOME_COMPARISON_HELP@"><![CDATA[
369 GENOME COMPARISON PARAMETERS:
370 -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH
371 MASH sketch size (default: 1000)
372
373 --S_algorithm {goANI,ANIn,ANImf,gANI}
374 Algorithm for secondary clustering comparisons:
375 ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions
376 ANIn = Align whole genomes with nucmer; compare aligned regions
377 gANI = Identify and align ORFs; compare aligned ORFS
378 (default: ANImf)
379
380 -n_PRESET {normal,tight}
381 Presets to pass to nucmer
382 tight = only align highly conserved regions
383 normal = default ANIn parameters (default: normal)
384
385 ]]></token>
386
387 <!-- Help text fragment describing the ANI thresholds, skip switches, coverage settings and clustering algorithm flags. --> <token name="@CLUSTERING_HELP@"><![CDATA[
388 CLUSTERING PARAMETERS:
389 -pa P_ANI, --P_ani P_ANI
390 ANI threshold to form primary (MASH) clusters
391 (default: 0.9)
392 -sa S_ANI, --S_ani S_ANI
393 ANI threshold to form secondary clusters
394 (default: 0.99)
395
396 --SkipMash Skip MASH clustering, just do secondary clustering on
397 all genomes (default: False)
398 --SkipSecondary Skip secondary clustering, just perform MASH clustering
399 (default: False)
400
401 -nc COV_THRESH, --cov_thresh COV_THRESH
402 Minimum level of overlap between genomes when doing
403 secondary comparisons (default: 0.1)
404 -cm {total,larger}, --coverage_method {total,larger}
405 Method to calculate coverage of an alignment
406 (for ANIn/ANImf only; gANI can only do larger method)
407 total = 2*(aligned length) / (sum of total genome lengths)
408 larger = max((aligned length / genome 1), (aligned_length / genome2))
409 (default: larger)
410
411 --clusterAlg CLUSTERALG
412 Algorithm used to cluster genomes (passed to
413 scipy.cluster.hierarchy.linkage) (default: average)
414
415 ]]></token>
416
417 <!-- Help text fragment giving the scoring formula and the five weight flags that feed it. --> <token name="@SCORING_HELP@"><![CDATA[
418 SCORING CRITERIA
419 Based off of the formula:
420 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
421
422 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight:
423 -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT
424 completeness weight (default: 1)
425 -conW CONTAMINATION_WEIGHT, --contamination_weight CONTAMINATION_WEIGHT
426 contamination weight (default: 5)
427 -strW STRAIN_HETEROGENEITY_WEIGHT, --strain_heterogeneity_weight STRAIN_HETEROGENEITY_WEIGHT
428 strain heterogeneity weight (default: 1)
429 -N50W N50_WEIGHT, --N50_weight N50_WEIGHT
430 weight of log(genome N50) (default: 0.5)
431 -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT
432 weight of log(genome size) (default: 0)
433
434
435 ]]></token>
436
437 <!-- Help text fragment describing the taxonomy flags: run_tax, tax_method, percent and cent_index. --> <token name="@TAXONOMY_HELP@"><![CDATA[
438 TAXONOMY:
439 --run_tax generate taxonomy information (Tdb)
440 (default: False)
441
442 --tax_method {percent,max}
443 Method of determining taxonomy
444 percent = The most descriptive taxonomic level with at least (per) hits
445 max = The centrifuge taxonomic level with the most overall hits
446 (default: percent)
447
448
449 -per PERCENT, --percent PERCENT
450 minimum percent for percent method
451 (default: 50)
452
453
454 --cent_index CENT_INDEX
455 path to centrifuge index (for example,
456 /home/mattolm/download/centrifuge/indices/b+h+v)
457 (default: None)
458
459 ]]></token>
460
461 <!-- Help text fragment describing the three warning threshold flags. --> <token name="@WARNINGS_HELP@"><![CDATA[
462 WARNINGS:
463 --warn_dist WARN_DIST
464 How far from the threshold to throw cluster warnings
465 (default: 0.25)
466 --warn_sim WARN_SIM Similarity threshold for warnings between dereplicated
467 genomes (default: 0.98)
468 --warn_aln WARN_ALN Minimum aligned fraction for warnings between
469 dereplicated genomes (ANIn) (default: 0.25)
470
471 ]]></token>
472
473
474 </macros>