Mercurial > repos > iuc > drep_dereplicate
comparison macros.xml @ 0:8dfcdbeaeed8 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/drep commit 8fa5ff35b45c2b046c7f4800410cf39cb89a299a"
author | iuc |
---|---|
date | Tue, 05 May 2020 06:12:47 -0400 |
parents | |
children | ef7cd2e7bc05 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8dfcdbeaeed8 |
---|---|
1 <macros> | |
2 <!-- Tool-wide version token; it is substituted into the version attribute of the drep package requirement below. --> <token name="@VERSION@">2.5.4</token> | |
3 <!-- Macro "requirements": declares the drep package at @VERSION@; the yield lets an expanding tool append further requirement elements. --> <xml name="requirements"> | |
4 <requirements> | |
5 <requirement type="package" version="@VERSION@">drep</requirement> | |
6 <yield/> | |
7 </requirements> | |
8 </xml> | |
9 <!-- Macro "citations": emits one DOI citation; the yield lets an expanding tool add more citation elements. --> <xml name="citations"> | |
10 <citations> | |
11 <citation type="doi">10.1038/ismej.2017.126</citation> | |
12 <yield /> | |
13 </citations> | |
14 </xml> | |
15 | |
16 | |
17 <!-- Macro "genomes": a multiple-selection FASTA dataset input; the @PREPARE_GENOMES@ token iterates over it as $genomes. --> <xml name="genomes"> | |
18 <param argument="--genomes" type="data" format="fasta" label="genomes fasta files" multiple="true"/> | |
19 </xml> | |
20 <!-- Token @PREPARE_GENOMES@: for every selected genome, takes the last path component of its element identifier, replaces each character outside word characters, dash, underscore and dot with an underscore, symlinks the dataset under that name, and records the name in $genomefiles for @GENOMES@. The append is run with the silent directive because it is executed only for its side effect and must not contribute output to the command. --> <token name="@PREPARE_GENOMES@"><![CDATA[ | |
21 #import re | |
22 #set $genomefiles = [] | |
23 #for $genome in $genomes | |
24 #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1])) | |
25 ln -s '${genome}' '${input_name}' && | |
26 #silent $genomefiles.append($input_name) | |
27 #end for | |
28 ]]></token> | |
29 <!-- Token @GENOMES@: renders the -g flag followed by every single-quoted name collected in $genomefiles. --> <token name="@GENOMES@"><![CDATA[ | |
30 -g | |
31 #for $linked_genome in $genomefiles | |
32 '$linked_genome' | |
33 #end for | |
34 ]]></token> | |
35 | |
36 | |
37 <!-- Macro "checkm_method": optional select offering the taxonomy_wf and lineage_wf values; neither option is pre-selected. --> <xml name="checkm_method"> | |
38 <param argument="--checkM_method" type="select" label="checkm method" optional="true"> | |
39 <option value="taxonomy_wf">taxonomy_wf (faster)</option> | |
40 <option value="lineage_wf">lineage_wf (more accurate)</option> | |
41 </param> | |
42 </xml> | |
43 <!-- Token @CHECKM_METHOD@: passes the checkM method flag only when a method has been chosen. --> <token name="@CHECKM_METHOD@"><![CDATA[ | |
44 #if $checkM_method | |
45 --checkM_method ${checkM_method} | |
46 #end if | |
47 ]]></token> | |
48 | |
49 <!-- Macro "filtering_options": conditional "filter" that, when enabled, exposes length, completeness and contamination thresholds plus a nested "quality" conditional choosing between running checkM, a user supplied genomeInfo CSV, or ignoring genome quality. The quality selector is declared with name (it is not a command-line flag) and the CSV input is labelled as genome information rather than FASTA. --> <xml name="filtering_options"> | |
50 <conditional name="filter"> | |
51 <param name="set_options" type="select" label="set filtering options"> | |
52 <option value="yes">Yes</option> | |
53 <option value="no" selected="true">No (use --checkM_method taxonomy_wf)</option> | |
54 </param> | |
55 <when value="yes"> | |
56 <param argument="--length" type="integer" value="50000" label="Minimum genome length"/> | |
57 <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/> | |
58 <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/> | |
59 | |
60 <conditional name="quality"> | |
61 <param name="source" type="select" label="genome quality"> | |
62 <help> | |
63 --ignoreGenomeQuality is useful with | |
64 bacteriophages or eukaryotes or things where checkM | |
65 scoring does not work. Will only choose genomes based | |
66 on length and N50. | |
67 </help> | |
68 <option value="checkm" selected="true">Run checkM</option> | |
69 <option value="genomeInfo">User supplied genomeInfo csv file</option> | |
70 <option value="ignoreGenomeQuality">--ignoreGenomeQuality (NOT RECOMMENDED!)</option> | |
71 </param> | |
72 <when value="checkm"> | |
73 <param argument="--checkM_method" type="select" label="checkm method" optional="true"> | |
74 <help> | |
75 Using the checkm method of lineage_wf can require more than 40Gb of RAM. | |
76 </help> | |
77 <option value="taxonomy_wf">taxonomy_wf (faster)</option> | |
78 <option value="lineage_wf">lineage_wf (more accurate)</option> | |
79 </param> | |
80 </when> | |
81 <when value="genomeInfo"> | |
82 <param argument="--genomeInfo" type="data" format="csv" label="genome quality information (csv)"> | |
83 <help><![CDATA[ | |
84 A CSV dataset that must contain: [ | |
85 "genome"(history dataset name of .fasta dataset of that genome), | |
86 "completeness"(0-100 value for completeness of the genome), | |
87 "contamination"(0-100 value of the contamination of the genome)] | |
88 ]]></help> | |
89 </param> | |
90 </when> | |
91 <when value="ignoreGenomeQuality"/> | |
92 </conditional> | |
93 </when> | |
94 <when value="no"/> | |
95 </conditional> | |
96 </xml> | |
97 <!-- Token @FILTER_OPTIONS@: renders the filtering flags. With filtering enabled it passes length, completeness and contamination, then exactly one quality choice: the checkM method (only when one was actually selected, since that select is optional), the single-quoted genomeInfo dataset path, or the ignore-quality switch. With filtering disabled it forces the taxonomy_wf checkM method. --> <token name="@FILTER_OPTIONS@"><![CDATA[ | |
98 #if $filter.set_options == 'yes': | |
99 --length $filter.length | |
100 --completeness $filter.completeness | |
101 --contamination $filter.contamination | |
102 #if $filter.quality.source == 'checkm' and $filter.quality.checkM_method | |
103 --checkM_method $filter.quality.checkM_method | |
104 #elif $filter.quality.source == 'genomeInfo' | |
105 --genomeInfo '$filter.quality.genomeInfo' | |
106 #elif $filter.quality.source == 'ignoreGenomeQuality' | |
107 --ignoreGenomeQuality | |
108 #end if | |
109 #else | |
110 --checkM_method taxonomy_wf | |
111 #end if | |
112 ]]></token> | |
113 | |
114 <!-- Macro "genome_comparison_options": conditional "genome_comparison" that, when enabled, exposes the MASH sketch size, the secondary clustering algorithm (ANImf, ANIn or gANI) and the nucmer preset (normal or tight). --> <xml name="genome_comparison_options"> | |
115 <conditional name="genome_comparison"> | |
116 <param name="set_options" type="select" label="set genome comparison options"> | |
117 <option value="yes">Yes</option> | |
118 <option value="no" selected="true">No</option> | |
119 </param> | |
120 <when value="yes"> | |
121 <param argument="--MASH_sketch" type="integer" value="1000" label="MASH sketch size"/> | |
122 <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comparisons"> | |
123 <option value="ANImf" selected="true">ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions</option> | |
124 <option value="ANIn">ANIn = Align whole genomes with nucmer; compare aligned regions</option> | |
125 <option value="gANI">gANI = Identify and align ORFs; compare aligned ORFS</option> | |
126 </param> | |
127 <param argument="-n_PRESET" type="select" label="Presets to pass to nucmer"> | |
128 <option value="normal" selected="true">normal = default ANIn parameters (default: normal)</option> | |
129 <option value="tight">tight = only align highly conserved regions</option> | |
130 </param> | |
131 </when> | |
132 <when value="no"/> | |
133 </conditional> | |
134 </xml> | |
135 <!-- Token @GENOME_COMPARISON_OPTIONS@: when the genome_comparison conditional is enabled, forwards the sketch size, secondary algorithm and nucmer preset. --> <token name="@GENOME_COMPARISON_OPTIONS@"><![CDATA[ | |
136 #if $genome_comparison.set_options == 'yes' | |
137 --MASH_sketch ${genome_comparison.MASH_sketch} | |
138 --S_algorithm ${genome_comparison.S_algorithm} | |
139 -n_PRESET ${genome_comparison.n_PRESET} | |
140 #end if | |
141 ]]></token> | |
142 | |
143 <!-- Macro "clustering_options": conditional "clustering" that, when enabled, exposes the primary and secondary ANI thresholds, the two skip switches, the coverage threshold, the coverage method (larger or total) and the clustering algorithm. NOTE(review): only the "average" algorithm is offered here; confirm whether other linkage methods should be selectable. --> <xml name="clustering_options"> | |
144 <conditional name="clustering"> | |
145 <param name="set_options" type="select" label="set clustering options"> | |
146 <option value="yes">Yes</option> | |
147 <option value="no" selected="true">No</option> | |
148 </param> | |
149 <when value="yes"> | |
150 <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary (MASH) clusters"/> | |
151 <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/> | |
152 | |
153 <param argument="--SkipMash" type="boolean" truevalue="--SkipMash" falsevalue="" checked="false" label="Skip MASH clustering, just do secondary clustering on all genomes"/> | |
154 <param argument="--SkipSecondary" type="boolean" truevalue="--SkipSecondary" falsevalue="" checked="false" label="Skip secondary clustering, just perform MASH clustering"/> | |
155 <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minimum level of overlap between genomes when doing secondary comparisons"/> | |
156 <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment"> | |
157 <help>(for ANIn/ANImf only; gANI can only do larger method)</help> | |
158 <option value="larger" selected="true">larger = max((aligned length / genome 1), (aligned_length / genome2))</option> | |
159 <option value="total">total = 2*(aligned length) / (sum of total genome lengths)</option> | |
160 </param> | |
161 <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes"> | |
162 <help>(passed to scipy.cluster.hierarchy.linkage)</help> | |
163 <option value="average" selected="true">average</option> | |
164 </param> | |
165 </when> | |
166 <when value="no"/> | |
167 </conditional> | |
168 </xml> | |
169 <!-- Token @CLUSTERING_OPTIONS@: when the clustering conditional is enabled, forwards both ANI thresholds, the two boolean skip switches (each renders its own flag or nothing), the coverage threshold, the coverage method and the clustering algorithm. --> <token name="@CLUSTERING_OPTIONS@"><![CDATA[ | |
170 #if $clustering.set_options == 'yes' | |
171 --P_ani ${clustering.P_ani} | |
172 --S_ani ${clustering.S_ani} | |
173 ${clustering.SkipMash} | |
174 ${clustering.SkipSecondary} | |
175 --cov_thresh ${clustering.cov_thresh} | |
176 --coverage_method ${clustering.coverage_method} | |
177 --clusterAlg ${clustering.clusterAlg} | |
178 #end if | |
179 ]]></token> | |
180 | |
181 <!-- Macro "scoring_options": conditional "scoring" that, when enabled, exposes the five float weights (completeness, contamination, strain heterogeneity, N50, size) named in the scoring formula quoted in the first parameter's help. --> <xml name="scoring_options"> | |
182 <conditional name="scoring"> | |
183 <param name="set_options" type="select" label="set scoring options"> | |
184 <option value="yes">Yes</option> | |
185 <option value="no" selected="true">No</option> | |
186 </param> | |
187 <when value="yes"> | |
188 <param argument="--completeness_weight" type="float" value="1" label="completeness weight"> | |
189 <help> | |
190 Based off of the formula: | |
191 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) | |
192 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight; | |
193 </help> | |
194 </param> | |
195 <param argument="--contamination_weight" type="float" value="5" label="contamination weight"/> | |
196 <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="strain heterogeneity weight"/> | |
197 <param argument="--N50_weight" type="float" value=".5" label="weight of log(genome N50)"/> | |
198 <param argument="--size_weight" type="float" value="0" label="weight of log(genome size)"/> | |
199 </when> | |
200 <when value="no"/> | |
201 </conditional> | |
202 </xml> | |
203 <!-- Token @SCORING_OPTIONS@: when the scoring conditional is enabled, forwards the five scoring weights. --> <token name="@SCORING_OPTIONS@"><![CDATA[ | |
204 #if $scoring.set_options == 'yes' | |
205 --completeness_weight ${scoring.completeness_weight} | |
206 --contamination_weight ${scoring.contamination_weight} | |
207 --strain_heterogeneity_weight ${scoring.strain_heterogeneity_weight} | |
208 --N50_weight ${scoring.N50_weight} | |
209 --size_weight ${scoring.size_weight} | |
210 #end if | |
211 ]]></token> | |
212 | |
213 <!-- Macro "taxonomy_options": conditional "taxonomy" that, when enabled, exposes the taxonomy method (percent or max), the minimum percent used by the percent method, and a centrifuge index dataset. NOTE(review): cent_index declares an empty format attribute; confirm which datatype is intended. --> <xml name="taxonomy_options"> | |
214 <conditional name="taxonomy"> | |
215 <param name="set_options" type="select" label="generate taxonomy information"> | |
216 <option value="yes">Yes</option> | |
217 <option value="no" selected="true">No</option> | |
218 </param> | |
219 <when value="yes"> | |
220 <param argument="--tax_method" type="select" label="Method of determining taxonomy"> | |
221 <help>The percent method uses the minimum percent set below</help> | |
222 <option value="percent" selected="true">percent = The most descriptive taxonomic level with at least (per) hits</option> | |
223 <option value="max">max = The centrifuge taxonomic level with the most overall hits</option> | |
224 </param> | |
225 <param argument="--percent" type="float" value="50" min="0" max="100" label="minimum percent for percent method"/> | |
226 <param argument="--cent_index" type="data" format="" label="centrifuge index"/> | |
227 </when> | |
228 <when value="no"/> | |
229 </conditional> | |
230 </xml> | |
231 <!-- Token @TAXONOMY_OPTIONS@: when the taxonomy conditional is enabled, switches taxonomy generation on and forwards the method, the minimum percent and the centrifuge index dataset path; the path is single-quoted so the shell treats it as one argument. --> <token name="@TAXONOMY_OPTIONS@"><![CDATA[ | |
232 #if $taxonomy.set_options == 'yes': | |
233 --run_tax | |
234 --tax_method $taxonomy.tax_method | |
235 --percent $taxonomy.percent | |
236 --cent_index '$taxonomy.cent_index' | |
237 #end if | |
238 ]]></token> | |
239 | |
240 <!-- Macro "warning_options": conditional "warning" that, when enabled, exposes three float thresholds in the 0 to 1 range: warn_dist, warn_sim and warn_aln. --> <xml name="warning_options"> | |
241 <conditional name="warning"> | |
242 <param name="set_options" type="select" label="set warning options"> | |
243 <option value="yes">Yes</option> | |
244 <option value="no" selected="true">No</option> | |
245 </param> | |
246 <when value="yes"> | |
247 <param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/> | |
248 <param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/> | |
249 <param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/> | |
250 </when> | |
251 <when value="no"/> | |
252 </conditional> | |
253 </xml> | |
254 <!-- Token @WARNING_OPTIONS@: when the warning conditional is enabled, forwards the three warning thresholds. --> <token name="@WARNING_OPTIONS@"><![CDATA[ | |
255 #if $warning.set_options == 'yes' | |
256 --warn_dist ${warning.warn_dist} | |
257 --warn_sim ${warning.warn_sim} | |
258 --warn_aln ${warning.warn_aln} | |
259 #end if | |
260 ]]></token> | |
261 | |
262 <!-- Macro "select_outputs": multiple-selection parameter whose option values match the names tested by the output filters in "common_outputs"; log, warnings, the primary dendrogram and the scatterplots are pre-selected, and the yield lets an expanding macro add options. --> <xml name="select_outputs"> | |
263 <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs"> | |
264 <option value="log" selected="true">log</option> | |
265 <option value="warnings" selected="true">Warnings</option> | |
266 <option value="Primary_clustering_dendrogram" selected="true">Primary_clustering_dendrogram.pdf</option> | |
267 <option value="Secondary_clustering_dendrograms">Secondary_clustering_dendrograms.pdf</option> | |
268 <option value="Secondary_clustering_MDS">Secondary_clustering_MDS.pdf</option> | |
269 <option value="Clustering_scatterplots" selected="true">Clustering_scatterplots.pdf</option> | |
270 <yield/> | |
271 </param> | |
272 </xml> | |
273 <!-- Macro "select_drep_outputs": expands "select_outputs" and adds four further options whose values match the output names declared in "drep_outputs". --> <xml name="select_drep_outputs"> | |
274 <expand macro="select_outputs"> | |
275 <option value="Cluster_scoring">Cluster_scoring.pdf</option> | |
276 <option value="Winning_genomes">Winning_genomes.pdf</option> | |
277 <option value="Widb">Widb.csv</option> | |
278 <option value="Chdb">Chdb.tsv</option> | |
279 </expand> | |
280 </xml> | |
281 | |
282 <!-- Macro "common_outputs": six output datasets read from fixed paths under outdir/ in the job working directory. Each is emitted only when its name is in select_outputs; the log is additionally emitted when select_outputs is empty. --> <xml name="common_outputs"> | |
283 <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log"> | |
284 <filter>'log' in select_outputs or not select_outputs</filter> | |
285 </data> | |
286 <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt"> | |
287 <filter>'warnings' in select_outputs</filter> | |
288 </data> | |
289 <data name="Primary_clustering_dendrogram" format="pdf" label="${tool.name} on ${on_string}: Primary_clustering_dendrogram.pdf" from_work_dir="outdir/figures/Primary_clustering_dendrogram.pdf"> | |
290 <filter>'Primary_clustering_dendrogram' in select_outputs</filter> | |
291 </data> | |
292 <data name="Secondary_clustering_dendrograms" format="pdf" label="${tool.name} on ${on_string}: Secondary_clustering_dendrograms.pdf" from_work_dir="outdir/figures/Secondary_clustering_dendrograms.pdf"> | |
293 <filter>'Secondary_clustering_dendrograms' in select_outputs</filter> | |
294 </data> | |
295 <data name="Secondary_clustering_MDS" format="pdf" label="${tool.name} on ${on_string}: Secondary_clustering_MDS.pdf" from_work_dir="outdir/figures/Secondary_clustering_MDS.pdf"> | |
296 <filter>'Secondary_clustering_MDS' in select_outputs</filter> | |
297 </data> | |
298 <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf"> | |
299 <filter>'Clustering_scatterplots' in select_outputs</filter> | |
300 </data> | |
301 </xml> | |
302 | |
303 | |
304 <!-- Macro "drep_outputs": expands "common_outputs" and adds four outputs (two PDF figures, the Widb CSV table and the Chdb tabular file), each emitted only when its name is in select_outputs. --> <xml name="drep_outputs"> | |
305 <expand macro="common_outputs"/> | |
306 <data name="Cluster_scoring" format="pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf"> | |
307 <filter>'Cluster_scoring' in select_outputs</filter> | |
308 </data> | |
309 <data name="Winning_genomes" format="pdf" label="${tool.name} on ${on_string}: Winning_genomes.pdf" from_work_dir="outdir/figures/Winning_genomes.pdf"> | |
310 <filter>'Winning_genomes' in select_outputs</filter> | |
311 </data> | |
312 <data name="Widb" format="csv" label="${tool.name} on ${on_string}: Widb.csv" from_work_dir="outdir/data_tables/Widb.csv"> | |
313 <filter>'Widb' in select_outputs</filter> | |
314 </data> | |
315 <data name="Chdb" format="tabular" label="${tool.name} on ${on_string}: Chdb.tsv" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv"> | |
316 <filter>'Chdb' in select_outputs</filter> | |
317 </data> | |
318 </xml> | |
319 | |
320 | |
321 <!-- Macro "test_defaults_log": test case that supplies three FASTA genomes and leaves every other parameter unset; the expanding tool provides the assertions on the log output through the yield. --> <xml name="test_defaults_log"> | |
322 <test> | |
323 <param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/> | |
324 <output name="log"> | |
325 <assert_contents> | |
326 <yield/> | |
327 </assert_contents> | |
328 </output> | |
329 </test> | |
330 </xml> | |
331 | |
332 <token name="@GENOMES_HELP@"><![CDATA[ | |
333 I/O PARAMETERS: | |
334 -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]] | |
335 genomes to cluster in .fasta format | |
336 (default: None) | |
337 | |
338 | |
339 ]]></token> | |
340 | |
341 <token name="@FILTERING_HELP@"><![CDATA[ | |
342 FILTERING OPTIONS: | |
343 -l LENGTH, --length LENGTH | |
344 Minimum genome length | |
345 (default: 50000) | |
346 | |
347 | |
348 -comp COMPLETENESS, --completeness COMPLETENESS | |
349 Minimum genome completeness | |
350 (default: 75) | |
351 | |
352 | |
353 -con CONTAMINATION, --contamination CONTAMINATION | |
354 Maximum genome contamination | |
355 (default: 25) | |
356 | |
357 | |
358 --ignoreGenomeQuality | |
359 Don't run checkM or do any quality filtering. NOT | |
360 RECOMMENDED! This is useful for use with | |
361 bacteriophages or eukaryotes or things where checkM | |
362 scoring does not work. Will only choose genomes based | |
363 on length and N50 (default: False) | |
364 | |
365 | |
366 ]]></token> | |
367 | |
368 <token name="@GENOME_COMPARISON_HELP@"><![CDATA[ | |
369 GENOME COMPARISON PARAMETERS: | |
370 -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH | |
371 MASH sketch size (default: 1000) | |
372 | |
373 --S_algorithm {goANI,ANIn,ANImf,gANI} | |
374 Algorithm for secondary clustering comparisons: | |
375 ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions | |
376 ANIn = Align whole genomes with nucmer; compare aligned regions | |
377 gANI = Identify and align ORFs; compare aligned ORFS | |
378 (default: ANImf) | |
379 | |
380 -n_PRESET {normal,tight} | |
381 Presets to pass to nucmer | |
382 tight = only align highly conserved regions | |
383 normal = default ANIn parameters (default: normal) | |
384 | |
385 ]]></token> | |
386 | |
387 <token name="@CLUSTERING_HELP@"><![CDATA[ | |
388 CLUSTERING PARAMETERS: | |
389 -pa P_ANI, --P_ani P_ANI | |
390 ANI threshold to form primary (MASH) clusters | |
391 (default: 0.9) | |
392 -sa S_ANI, --S_ani S_ANI | |
393 ANI threshold to form secondary clusters | |
394 (default: 0.99) | |
395 | |
396 --SkipMash Skip MASH clustering, just do secondary clustering on | |
397 all genomes (default: False) | |
398 --SkipSecondary Skip secondary clustering, just perform MASH clustering | |
399 (default: False) | |
400 | |
401 -nc COV_THRESH, --cov_thresh COV_THRESH | |
402 Minimum level of overlap between genomes when doing | |
403 secondary comparisons (default: 0.1) | |
404 -cm {total,larger}, --coverage_method {total,larger} | |
405 Method to calculate coverage of an alignment | |
406 (for ANIn/ANImf only; gANI can only do larger method) | |
407 total = 2*(aligned length) / (sum of total genome lengths) | |
408 larger = max((aligned length / genome 1), (aligned_length / genome2)) | |
409 (default: larger) | |
410 | |
411 --clusterAlg CLUSTERALG | |
412 Algorithm used to cluster genomes (passed to | |
413 scipy.cluster.hierarchy.linkage) (default: average) | |
414 | |
415 ]]></token> | |
416 | |
417 <token name="@SCORING_HELP@"><![CDATA[ | |
418 SCORING CRITERIA | |
419 Based off of the formula: | |
420 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) | |
421 | |
422 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight: | |
423 -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT | |
424 completeness weight (default: 1) | |
425 -conW CONTAMINATION_WEIGHT, --contamination_weight CONTAMINATION_WEIGHT | |
426 contamination weight (default: 5) | |
427 -strW STRAIN_HETEROGENEITY_WEIGHT, --strain_heterogeneity_weight STRAIN_HETEROGENEITY_WEIGHT | |
428 strain heterogeneity weight (default: 1) | |
429 -N50W N50_WEIGHT, --N50_weight N50_WEIGHT | |
430 weight of log(genome N50) (default: 0.5) | |
431 -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT | |
432 weight of log(genome size) (default: 0) | |
433 | |
434 | |
435 ]]></token> | |
436 | |
437 <token name="@TAXONOMY_HELP@"><![CDATA[ | |
438 TAXONOMY: | |
439 --run_tax generate taxonomy information (Tdb) | |
440 (default: False) | |
441 | |
442 --tax_method {percent,max} | |
443 Method of determining taxonomy | |
444 percent = The most descriptive taxonomic level with at least (per) hits | |
445 max = The centrifuge taxonomic level with the most overall hits | |
446 (default: percent) | |
447 | |
448 | |
449 -per PERCENT, --percent PERCENT | |
450 minimum percent for percent method | |
451 (default: 50) | |
452 | |
453 | |
454 --cent_index CENT_INDEX | |
455 path to centrifuge index (for example, | |
456 /home/mattolm/download/centrifuge/indices/b+h+v) | |
457 (default: None) | |
458 | |
459 ]]></token> | |
460 | |
461 <token name="@WARNINGS_HELP@"><![CDATA[ | |
462 WARNINGS: | |
463 --warn_dist WARN_DIST | |
464 How far from the threshold to throw cluster warnings | |
465 (default: 0.25) | |
466 --warn_sim WARN_SIM Similarity threshold for warnings between dereplicated | |
467 genomes (default: 0.98) | |
468 --warn_aln WARN_ALN Minimum aligned fraction for warnings between | |
469 dereplicated genomes (ANIn) (default: 0.25) | |
470 | |
471 ]]></token> | |
472 | |
473 | |
474 </macros> |