comparison macros.xml @ 1:ef7cd2e7bc05 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/drep commit 5e6e589002d554be180e575080e9ad66cc78ed74"
author iuc
date Sat, 12 Feb 2022 17:40:42 +0000
parents 8dfcdbeaeed8
children 368cb4bef9d8
comparison
equal deleted inserted replaced
0:8dfcdbeaeed8 1:ef7cd2e7bc05
1 <?xml version="1.0"?>
1 <macros> 2 <macros>
2 <token name="@VERSION@">2.5.4</token> 3 <token name="@TOOL_VERSION@">3.2.2</token>
4 <token name="@VERSION_SUFFIX@">0</token>
5 <token name="@PROFILE@">20.01</token>
6 <xml name="biotools">
7 <xrefs>
8 <xref type="bio.tools">drep</xref>
9 </xrefs>
10 </xml>
3 <xml name="requirements"> 11 <xml name="requirements">
4 <requirements> 12 <requirements>
5 <requirement type="package" version="@VERSION@">drep</requirement> 13 <requirement type="package" version="@TOOL_VERSION@">drep</requirement>
6 <yield/> 14 <yield/>
7 </requirements> 15 </requirements>
8 </xml> 16 </xml>
9 <xml name="citations"> 17 <xml name="citations">
10 <citations> 18 <citations>
11 <citation type="doi">10.1038/ismej.2017.126</citation> 19 <citation type="doi">10.1038/ismej.2017.126</citation>
12 <yield /> 20 <yield />
13 </citations> 21 </citations>
14 </xml> 22 </xml>
15 23
16
17 <xml name="genomes"> 24 <xml name="genomes">
18 <param argument="--genomes" type="data" format="fasta" label="genomes fasta files" multiple="true"/> 25 <param argument="--genomes" type="data" format="fasta" multiple="true" label="Genomes to filter"/>
19 </xml> 26 </xml>
27
28 <!-- Append ".fasta" to each genome name so that purely numeric names are not read as integers.
29 This works around a bug in dRep that will probably be fixed in the next version. -->
20 <token name="@PREPARE_GENOMES@"><![CDATA[ 30 <token name="@PREPARE_GENOMES@"><![CDATA[
21 #import re 31 #import re
22 #set $genomefiles = [] 32 #set $genomefiles = []
23 #for $genome in $genomes 33 #for $genome in $genomes
24 #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1])) 34 #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1]))
25 ln -s '${genome}' '${input_name}' && 35 ln -s '${genome}' '${input_name}.fasta' &&
26 $genomefiles.append($input_name) 36 $genomefiles.append($input_name)
27 #end for 37 #end for
28 ]]></token> 38 ]]></token>
29 <token name="@GENOMES@"><![CDATA[ 39 <token name="@GENOMES@"><![CDATA[
30 -g 40 -g
31 #for $genomefile in $genomefiles 41 #for $genomefile in $genomefiles
32 '${genomefile}' 42 '${genomefile}.fasta'
33 #end for 43 #end for
34 ]]></token> 44 ]]></token>
35
36
37 <xml name="checkm_method">
38 <param argument="--checkM_method" type="select" label="checkm method" optional="true">
39 <option value="taxonomy_wf">taxonomy_wf (faster)</option>
40 <option value="lineage_wf">lineage_wf (more accurate)</option>
41 </param>
42 </xml>
43 <token name="@CHECKM_METHOD@"><![CDATA[
44 #if $checkM_method:
45 --checkM_method $checkM_method
46 #end if
47 ]]></token>
48 45
49 <xml name="filtering_options"> 46 <xml name="filtering_options">
50 <conditional name="filter"> 47 <section name="filter" title="Genome filtering" expanded="true">
51 <param name="set_options" type="select" label="set filtering options"> 48 <param argument="--length" type="integer" value="50000" label="Minimum genome length"/>
52 <option value="yes">Yes</option> 49 <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/>
53 <option value="no" selected="true">No (use --checkM_method taxonomy_wf)</option> 50 <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/>
51 </section>
52 </xml>
53 <xml name="test_default_filtering_options">
54 <section name="filter">
55 <param name="length" value="50000"/>
56 <param name="completeness" value="75"/>
57 <param name="contamination" value="100"/>
58 </section>
59 </xml>
60 <token name="@FILTER_OPTIONS@"><![CDATA[
61 --length $filter.length
62 --completeness $filter.completeness
63 --contamination $filter.contamination
64 ]]></token>
65
66 <xml name="quality_assessment_options">
67 <conditional name="quality">
68 <param name="source" type="select" label="Genome quality filtering" help="Skipping checkM and quality filtering is not recommended, except for bacteriophages, eukaryotes, or other genomes where checkM scoring does not work. Without quality information, genomes are chosen based only on length and N50.">
69 <option value="checkm" selected="true">Run checkM</option>
70 <option value="genomeInfo">Provide quality information on the genome (CSV file)</option>
71 <option value="ignoreGenomeQuality">Don't run checkM or do any quality filtering (--ignoreGenomeQuality) - NOT RECOMMENDED!</option>
54 </param> 72 </param>
55 <when value="yes"> 73 <when value="checkm">
56 <param argument="--length" type="integer" value="50000" label="Minimum genome length"/> 74 <param argument="--checkM_method" type="select" label="CheckM method">
57 <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/> 75 <option value="lineage_wf" selected="true">lineage_wf: Lineage-specific Workflow - quality estimates with lineage-specific markers (more accurate)</option>
58 <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/> 76 <option value="taxonomy_wf">taxonomy_wf: Taxonomic-specific Workflow - quality estimates with taxonomic-specific markers (faster)</option>
59 77 </param>
60 <conditional name="quality"> 78 <param argument="--set_recursion" type="integer" optional="true" label="Increases the python recursion limit" help="NOT RECOMMENDED unless checkM is crashing due to recursion issues. Recommended to set to 2000 if needed, but setting this could crash Python"/>
61 <param argument="source" type="select" label="genome quality"> 79 <param argument="--checkm_group_size" type="integer" value="2000" min="1" label="Number of genomes passed to checkM at a time" help="Increasing this increases RAM but makes checkM faster"/>
62 <help>
63 --ignoreGenomeQuality is useful with
64 bacteriophages or eukaryotes or things where checkM
65 scoring does not work. Will only choose genomes based
66 on length and N50.
67 </help>
68 <option value="checkm" selected="true">Run checkM</option>
69 <option value="genomeInfo">User supplied genomeInfo csv file</option>
70 <option value="ignoreGenomeQuality">--ignoreGenomeQuality (NOT RECOMMENDED!)</option>
71 </param>
72 <when value="checkm">
73 <param argument="--checkM_method" type="select" label="checkm method" optional="true">
74 <help>
75 Using the checkm method of lineage_wf can require more than 40Gb of RAM.
76 </help>
77 <option value="taxonomy_wf">taxonomy_wf (faster)</option>
78 <option value="lineage_wf">lineage_wf (more accurate)</option>
79 </param>
80 </when>
81 <when value="genomeInfo">
82 <param argument="--genomeInfo" type="data" format="csv" label="genomes fasta files">
83 <help><![CDATA[
84 A CSV dataset that must contain: [
85 "genome"(history dataset name of .fasta dataset of that genome),
86 "completeness"(0-100 value for completeness of the genome),
87 "contamination"(0-100 value of the contamination of the genome)]
88 ]]></help>
89 </param>
90 </when>
91 <when value="ignoreGenomeQuality"/>
92 </conditional>
93 </when> 80 </when>
94 <when value="no"/> 81 <when value="genomeInfo">
95 </conditional> 82 <param argument="--genomeInfo" type="data" format="csv" label="Quality information on the genomes">
96 </xml> 83 <help><![CDATA[
97 <token name="@FILTER_OPTIONS@"><![CDATA[ 84 A CSV dataset that must contain: [
98 #if $filter.set_options == 'yes': 85 "genome"(history dataset name of .fasta dataset of that genome),
99 --length $filter.length 86 "completeness"(0-100 value for completeness of the genome),
100 --completeness $filter.completeness 87 "contamination"(0-100 value of the contamination of the genome)]
101 --contamination $filter.contamination 88 ]]></help>
102 #if $filter.quality.source == 'checkm'
103 --checkM_method $filter.quality.checkM_method
104 #elif $filter.quality.source == 'genomeInfo'
105 --genomeInfo $filter.quality.genomeInfo
106 #elif $filter.quality.source == 'ignoreGenomeQuality'
107 --ignoreGenomeQuality
108 #end if
109 #else
110 --checkM_method taxonomy_wf
111 #end if
112 ]]></token>
113
114 <xml name="genome_comparison_options">
115 <conditional name="genome_comparison">
116 <param name="set_options" type="select" label="set genome comparison options">
117 <option value="yes">Yes</option>
118 <option value="no" selected="true">No</option>
119 </param>
120 <when value="yes">
121 <param argument="--MASH_sketch" type="integer" value="1000" label="MASH sketch size"/>
122 <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comaprisons">
123 <option value="ANImf" selected="true">ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions</option>
124 <option value="ANIn">ANIn = Align whole genomes with nucmer; compare aligned regions</option>
125 <option value="gANI">gANI = Identify and align ORFs; compare aligned ORFS</option>
126 </param>
127 <param argument="-n_PRESET" type="select" label="Presets to pass to nucmer">
128 <option value="normal" selected="true">normal = default ANIn parameters (default: normal)</option>
129 <option value="tight">tight = only align highly conserved regions</option>
130 </param> 89 </param>
131 </when> 90 </when>
132 <when value="no"/> 91 <when value="ignoreGenomeQuality"/>
133 </conditional> 92 </conditional>
134 </xml> 93 </xml>
135 <token name="@GENOME_COMPARISON_OPTIONS@"><![CDATA[ 94 <xml name="test_default_quality_assessment_options">
136 #if $genome_comparison.set_options == 'yes': 95 <conditional name="quality">
137 --MASH_sketch $genome_comparison.MASH_sketch 96 <param name="source" value="checkm"/>
138 --S_algorithm $genome_comparison.S_algorithm 97 <param name="checkM_method" value="taxonomy_wf"/>
139 -n_PRESET $genome_comparison.n_PRESET 98 <param name="checkm_group_size" value="2000"/>
140 #end if 99 </conditional>
141 ]]></token> 100 </xml>
142 101 <token name="@QUALITY_ASSESSMENT_OPTIONS@"><![CDATA[
143 <xml name="clustering_options"> 102 #if $quality.source == 'checkm'
103 --checkM_method '$quality.checkM_method'
104 #if str($quality.set_recursion) != ''
105 --set_recursion $quality.set_recursion
106 #end if
107 --checkm_group_size $quality.checkm_group_size
108 #else if $quality.source == 'genomeInfo'
109 --genomeInfo '$quality.genomeInfo'
110 #else if $quality.source == 'ignoreGenomeQuality'
111 --ignoreGenomeQuality
112 #end if
113 ]]></token>
114
115 <xml name="mash">
116 <param argument="--MASH_sketch" type="integer" value="1000" min="0" label="MASH sketch size"/>
117 <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary clusters"/>
118 <param argument="--multiround_primary_clustering" type='boolean' checked="false" truevalue='--multiround_primary_clustering' falsevalue='' label="Cluster each primary chunk separately and merge at the end with single linkage?" help="Decreases RAM usage and increases speed, at the cost of a minor loss in precision and the inability to plot primary_clustering_dendrograms. Especially helpful when clustering 5000+ genomes. Will be done with single linkage clustering"/>
119 <param argument="--primary_chunksize" type="integer" value="5000" min="1" label="Impacts multiround_primary_clusterings" help=" If you have more than this many genomes, process them in chunks of this size"/>
120 </xml>
121 <xml name="test_default_mash">
122 <param name="MASH_sketch" value="1000"/>
123 <param name="P_ani" value="0.9"/>
124 <param name="multiround_primary_clustering" value=''/>
125 <param name="primary_chunksize" value="5000"/>
126 </xml>
127 <token name="@MASH@"><![CDATA[
128 --MASH_sketch '$comp_clust.steps.MASH_sketch'
129 --P_ani $comp_clust.steps.P_ani
130 $comp_clust.steps.multiround_primary_clustering
131 --primary_chunksize $comp_clust.steps.primary_chunksize
132 ]]></token>
133
134 <xml name="nucmer">
135 <param argument="--n_PRESET" type="select" label="Presets to pass to nucmer">
136 <option value="normal" selected="true">normal: default ANIn parameters</option>
137 <option value="tight">tight: only align highly conserved regions</option>
138 </param>
139 </xml>
140 <xml name="test_default_nucmer">
141 <param name="n_PRESET" value="normal"/>
142 </xml>
143 <token name="@NUCMER@"><![CDATA[
144 --n_PRESET '$comp_clust.steps.clustering.n_PRESET'
145 ]]></token>
146
147 <xml name="coverage_method">
148 <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment">
149 <option value="larger" selected="true">Larger = max((aligned length / genome 1), (aligned_length / genome2))</option>
150 <option value="total">Total = 2*(aligned length) / (sum of total genome lengths)</option>
151 </param>
152 </xml>
153 <xml name="test_default_coverage_method">
154 <param name="coverage_method" value="larger"/>
155 </xml>
156 <token name="@COVERAGE_METHOD@"><![CDATA[
157 --coverage_method '$comp_clust.steps.clustering.coverage_method'
158 ]]></token>
159
160 <xml name="secondary_clustering">
144 <conditional name="clustering"> 161 <conditional name="clustering">
145 <param name="set_options" type="select" label="set clustering options"> 162 <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comparisons">
146 <option value="yes">Yes</option> 163 <option value="fastANI">fastANI: Kmer-based approach - very fast</option>
147 <option value="no" selected="true">No</option> 164 <option value="ANImf" selected="true">ANImf: Align whole genomes with nucmer; filter alignment; compare aligned regions - RECOMMENDED</option>
165 <option value="ANIn">ANIn: Align whole genomes with nucmer; compare aligned regions</option>
166 <option value="gANI">gANI: Identify and align ORFs; compare aligned ORFS</option>
167 <option value="goANI">goANI: Open source version of gANI; requires nsimscan</option>
148 </param> 168 </param>
149 <when value="yes"> 169 <when value="fastANI">
150 <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary (MASH) clusters"/> 170 <param argument="--greedy_secondary_clustering" type='boolean' checked="false" truevalue='--greedy_secondary_clustering' falsevalue='' label="Use a heuristic to avoid pair-wise comparisons when doing secondary clustering?" help="Will be done with single linkage clustering"/>
151 <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/> 171 </when>
152 172 <when value="ANImf">
153 <param argument="--SkipMash" type="boolean" truevalue="--SkipMash" falsevalue="" checked="false" label="Skip MASH clustering, just do secondary clustering on all genomes"/> 173 <expand macro="nucmer"/>
154 <param argument="--SkipSecondary" type="boolean" truevalue="--SkipSecondary" falsevalue="" checked="false" label="Skip secondary clustering, just perform MASH clustering"/> 174 <expand macro="coverage_method"/>
155 <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minmum level of overlap between genomes when doing secondary comparisons"/> 175 </when>
156 <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment"> 176 <when value="ANIn">
157 <help>(for ANIn/ANImf only; gANI can only do larger method)</help> 177 <expand macro="nucmer"/>
158 <option value="larger" selected="true">arger = max((aligned length / genome 1), (aligned_length / genome2))</option> 178 <expand macro="coverage_method"/>
159 <option value="total">total = 2*(aligned length) / (sum of total genome lengths)</option> 179 </when>
180 <when value="gANI"/>
181 <when value="goANI"/>
182 </conditional>
183 <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/>
184 <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minimum level of overlap between genomes when doing secondary comparisons"/>
185 </xml>
186 <xml name="test_default_secondary_clustering">
187 <conditional name="clustering">
188 <param name="S_algorithm" value="ANImf"/>
189 <expand macro="test_default_nucmer"/>
190 <expand macro="test_default_coverage_method"/>
191 </conditional>
192 <param name="S_ani" value="0.99"/>
193 <param name="cov_thresh" value="0.1"/>
194 </xml>
195 <token name="@SECONDARY_CLUSTERING@"><![CDATA[
196 --S_algorithm '$comp_clust.steps.clustering.S_algorithm'
197 #if $comp_clust.steps.clustering.S_algorithm == 'fastANI'
198 $comp_clust.steps.clustering.greedy_secondary_clustering
199 #else if $comp_clust.steps.clustering.S_algorithm == 'ANImf'
200 @NUCMER@
201 @COVERAGE_METHOD@
202 #else if $comp_clust.steps.clustering.S_algorithm == 'ANIn'
203 @NUCMER@
204 @COVERAGE_METHOD@
205 #end if
206 --S_ani $comp_clust.steps.S_ani
207 --cov_thresh $comp_clust.steps.cov_thresh
208 ]]></token>
209
210 <xml name="comparison_clustering_options">
211 <section name="comp_clust" title="Genome comparison and clustering" expanded="false">
212 <conditional name="steps">
213 <param name="select" type="select" label="Steps in genome comparison">
214 <option value="default" selected="true">Default: Run MASH clustering and a secondary clustering</option>
215 <option value="SkipMash">Skip MASH clustering, just do secondary clustering on all genomes</option>
216 <option value="SkipSecondary">Skip secondary clustering, just perform MASH clustering</option>
160 </param> 217 </param>
161 <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes"> 218 <when value="default">
162 <help>(passed to scipy.cluster.hierarchy.linkage)</help> 219 <expand macro="mash"/>
163 <option value="average" selected="true">average</option> 220 <expand macro="secondary_clustering"/>
164 </param> 221 </when>
165 </when> 222 <when value="SkipMash">
166 <when value="no"/> 223 <expand macro="secondary_clustering"/>
167 </conditional> 224 </when>
168 </xml> 225 <when value="SkipSecondary">
169 <token name="@CLUSTERING_OPTIONS@"><![CDATA[ 226 <expand macro="mash"/>
170 #if $clustering.set_options == 'yes': 227 </when>
171 --P_ani $clustering.P_ani 228 </conditional>
172 --S_ani $clustering.S_ani 229 <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes" help="Passed to scipy.cluster.hierarchy.linkage">
173 $clustering.SkipMash 230 <option value="average" selected="true">average</option>
174 $clustering.SkipSecondary 231 <option value="ward">ward</option>
175 --cov_thresh $clustering.cov_thresh 232 <option value="single">single</option>
176 --coverage_method $clustering.coverage_method 233 <option value="median">median</option>
177 --clusterAlg $clustering.clusterAlg 234 <option value="centroid">centroid</option>
178 #end if 235 <option value="weighted">weighted</option>
236 </param>
237 <param argument="--run_tertiary_clustering" type='boolean' checked="false" truevalue='--run_tertiary_clustering' falsevalue='' label="Run an additional round of clustering on the final genome set?" help="This is especially useful when greedy clustering is performed and/or to handle cases where similar genomes end up in different primary clusters."/>
238 </section>
239 </xml>
240 <xml name="test_default_comparison_clustering_options">
241 <section name="comp_clust">
242 <conditional name="steps">
243 <param name="select" value="default" />
244 <expand macro="test_default_mash"/>
245 <expand macro="test_default_secondary_clustering"/>
246 </conditional>
247 <param name="clusterAlg" value="average"/>
248 <param name="run_tertiary_clustering" value=''/>
249 </section>
250 </xml>
251 <token name="@COMPARISON_CLUSTERING_OPTIONS@"><![CDATA[
252 #if $comp_clust.steps.select == 'default'
253 @MASH@
254 @SECONDARY_CLUSTERING@
255 #else if $comp_clust.steps.select == 'SkipMash'
256 --SkipMash
257 @SECONDARY_CLUSTERING@
258 #else
259 @MASH@
260 --SkipSecondary
261 #end if
262 --clusterAlg '$comp_clust.clusterAlg'
263 $comp_clust.run_tertiary_clustering
179 ]]></token> 264 ]]></token>
180 265
181 <xml name="scoring_options"> 266 <xml name="scoring_options">
182 <conditional name="scoring"> 267 <section name="scoring" title="Scoring criteria" expanded="false" help="Based off of the formula: A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) + F*(centrality - S_ani). With A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight; F = cent_weight">
183 <param name="set_options" type="select" label="set scoring options"> 268 <param argument="--completeness_weight" type="float" value="1" label="Completeness weight"/>
184 <option value="yes">Yes</option> 269 <param argument="--contamination_weight" type="float" value="5" label="Contamination weight"/>
185 <option value="no" selected="true">No</option> 270 <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="Strain heterogeneity weight"/>
186 </param> 271 <param argument="--N50_weight" type="float" value=".5" label="Weight of log(genome N50)"/>
187 <when value="yes"> 272 <param argument="--size_weight" type="float" value="0" label="Weight of log(genome size)"/>
188 <param argument="--completeness_weight" type="float" value="1" label="completeness weight"> 273 <param argument="--centrality_weight" type="float" value="1" label="Weight of (centrality - S_ani)"/>
189 <help> 274 </section>
190 Based off of the formula: 275 </xml>
191 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) 276 <xml name="test_default_scoring_options">
192 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight; 277 <section name="scoring">
193 </help> 278 <param name="completeness_weight" value="1"/>
194 </param> 279 <param name="contamination_weight" value="5"/>
195 <param argument="--contamination_weight" type="float" value="5" label="contamination weight"/> 280 <param name="strain_heterogeneity_weight" value="1"/>
196 <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="strain heterogeneity weight"/> 281 <param name="N50_weight" value=".5" />
197 <param argument="--N50_weight" type="float" value=".5" label="weight of log(genome N50)"/> 282 <param name="size_weight" value="0"/>
198 <param argument="--size_weight" type="float" value="0" label="weight of log(genome size)"/> 283 <param name="centrality_weight" value="1"/>
199 </when> 284 </section>
200 <when value="no"/>
201 </conditional>
202 </xml> 285 </xml>
203 <token name="@SCORING_OPTIONS@"><![CDATA[ 286 <token name="@SCORING_OPTIONS@"><![CDATA[
204 #if $scoring.set_options == 'yes': 287 --completeness_weight $scoring.completeness_weight
205 --completeness_weight $scoring.completeness_weight 288 --contamination_weight $scoring.contamination_weight
206 --contamination_weight $scoring.contamination_weight 289 --strain_heterogeneity_weight $scoring.strain_heterogeneity_weight
207 --strain_heterogeneity_weight $scoring.strain_heterogeneity_weight 290 --N50_weight $scoring.N50_weight
208 --N50_weight $scoring.N50_weight 291 --size_weight $scoring.size_weight
209 --size_weight $scoring.size_weight 292 --centrality_weight $scoring.centrality_weight
210 #end if 293 ]]></token>
211 ]]></token> 294
212 295 <xml name="warning_options">
213 <xml name="taxonomy_options"> 296 <section name="warning" title="Warnings" expanded="false">
214 <conditional name="taxonomy"> 297 <param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/>
215 <param name="set_options" type="select" label="generate taxonomy information"> 298 <param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/>
216 <option value="yes">Yes</option> 299 <param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/>
217 <option value="no" selected="true">No</option> 300 </section>
218 </param> 301 </xml>
219 <when value="yes"> 302 <xml name="test_default_warning_options">
220 <param argument="--tax_method" type="select" label="Method of determining taxonomy"> 303 <section name="warning">
221 <help>(for ANIn/ANImf only; gANI can only do larger method)</help> 304 <param name="warn_dist" value="0.25"/>
222 <option value="percent" selected="true">percent = The most descriptive taxonimic level with at least (per) hits</option> 305 <param name="warn_sim" value="0.98"/>
223 <option value="max">max = The centrifuge taxonomic level with the most overall hits</option> 306 <param name="warn_aln" value="0.25"/>
224 </param> 307 </section>
225 <param argument="--percent" type="float" value="50" min="0" max="100" label="minimum percent for percent method"/>
226 <param argument="--cent_index" type="data" format="" label="centrifuge index"/>
227 </when>
228 <when value="no"/>
229 </conditional>
230 </xml>
231 <token name="@TAXONOMY_OPTIONS@"><![CDATA[
232 #if $taxonomy.set_options == 'yes':
233 --run_tax
234 --tax_method $taxonomy.tax_method
235 --percent $taxonomy.percent
236 --cent_index $taxonomy.cent_index
237 #end if
238 ]]></token>
239
240 <xml name="warning_options">
241 <conditional name="warning">
242 <param name="set_options" type="select" label="set warning options">
243 <option value="yes">Yes</option>
244 <option value="no" selected="true">No</option>
245 </param>
246 <when value="yes">
247 <param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/>
248 <param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/>
249 <param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/>
250 </when>
251 <when value="no"/>
252 </conditional>
253 </xml> 308 </xml>
254 <token name="@WARNING_OPTIONS@"><![CDATA[ 309 <token name="@WARNING_OPTIONS@"><![CDATA[
255 #if $warning.set_options == 'yes': 310 --warn_dist $warning.warn_dist
256 --warn_dist $warning.warn_dist 311 --warn_sim $warning.warn_sim
257 --warn_sim $warning.warn_sim 312 --warn_aln $warning.warn_aln
258 --warn_aln $warning.warn_aln
259 #end if
260 ]]></token> 313 ]]></token>
261 314
262 <xml name="select_outputs"> 315 <xml name="select_outputs">
263 <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs"> 316 <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs">
264 <option value="log" selected="true">log</option> 317 <option value="log" selected="true">log</option>
276 <option value="Winning_genomes">Winning_genomes.pdf</option> 329 <option value="Winning_genomes">Winning_genomes.pdf</option>
277 <option value="Widb">Widb.csv</option> 330 <option value="Widb">Widb.csv</option>
278 <option value="Chdb">Chdb.tsv</option> 331 <option value="Chdb">Chdb.tsv</option>
279 </expand> 332 </expand>
280 </xml> 333 </xml>
281 334 <xml name="test_default_select_drep_outputs">
282 <xml name="common_outputs"> 335 <param name="select_outputs" value="log,warnings,Primary_clustering_dendrogram,Clustering_scatterplots,Cluster_scoring,Winning_genomes,Widb" />
336 </xml>
337 <xml name="test_default_select_outputs">
338 <param name="select_outputs" value="log,warnings,Primary_clustering_dendrogram,Clustering_scatterplots" />
339 </xml>
340
341 <xml name="common_outputs">
283 <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log"> 342 <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log">
284 <filter>'log' in select_outputs or not select_outputs</filter> 343 <filter>'log' in select_outputs or not select_outputs</filter>
285 </data> 344 </data>
286 <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt"> 345 <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt">
287 <filter>'warnings' in select_outputs</filter> 346 <filter>'warnings' in select_outputs</filter>
297 </data> 356 </data>
298 <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf"> 357 <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf">
299 <filter>'Clustering_scatterplots' in select_outputs</filter> 358 <filter>'Clustering_scatterplots' in select_outputs</filter>
300 </data> 359 </data>
301 </xml> 360 </xml>
302
303
304 <xml name="drep_outputs"> 361 <xml name="drep_outputs">
305 <expand macro="common_outputs"/> 362 <expand macro="common_outputs"/>
306 <data name="Cluster_scoring" format="pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf"> 363 <data name="Cluster_scoring" format="pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf">
307 <filter>'Cluster_scoring' in select_outputs</filter> 364 <filter>'Cluster_scoring' in select_outputs</filter>
308 </data> 365 </data>
314 </data> 371 </data>
315 <data name="Chdb" format="tabular" label="${tool.name} on ${on_string}: Chdb.tsv" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv"> 372 <data name="Chdb" format="tabular" label="${tool.name} on ${on_string}: Chdb.tsv" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv">
316 <filter>'Chdb' in select_outputs</filter> 373 <filter>'Chdb' in select_outputs</filter>
317 </data> 374 </data>
318 </xml> 375 </xml>
319 376 <xml name="test_string_inputs">
320 377 <param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/>
321 <xml name="test_defaults_log"> 378 </xml>
322 <test> 379 <xml name="test_integer_inputs">
323 <param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/> 380 <param name="genomes" ftype="fasta" value="001,002,003"/>
324 <output name="log"> 381 </xml>
325 <assert_contents> 382 <xml name="test_log_output">
326 <yield/> 383 <output name="log">
327 </assert_contents> 384 <assert_contents>
328 </output> 385 <yield/>
329 </test> 386 </assert_contents>
330 </xml> 387 </output>
331 388 </xml>
332 <token name="@GENOMES_HELP@"><![CDATA[ 389 <token name="@GENOMES_HELP@"><![CDATA[
333 I/O PARAMETERS: 390 I/O PARAMETERS:
334 -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]] 391 -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]]
335 genomes to cluster in .fasta format 392 genomes to cluster in .fasta format
336 (default: None) 393 (default: None)
337 394
338 395
339 ]]></token> 396 ]]></token>
340
341 <token name="@FILTERING_HELP@"><![CDATA[ 397 <token name="@FILTERING_HELP@"><![CDATA[
342 FILTERING OPTIONS: 398 FILTERING OPTIONS:
343 -l LENGTH, --length LENGTH 399 -l LENGTH, --length LENGTH
344 Minimum genome length 400 Minimum genome length
345 (default: 50000) 401 (default: 50000)
362 scoring does not work. Will only choose genomes based 418 scoring does not work. Will only choose genomes based
363 on length and N50 (default: False) 419 on length and N50 (default: False)
364 420
365 421
366 ]]></token> 422 ]]></token>
367
368 <token name="@GENOME_COMPARISON_HELP@"><![CDATA[ 423 <token name="@GENOME_COMPARISON_HELP@"><![CDATA[
369 GENOME COMPARISON PARAMETERS: 424 GENOME COMPARISON PARAMETERS:
370 -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH 425 -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH
371 MASH sketch size (default: 1000) 426 MASH sketch size (default: 1000)
372 427
381 Presets to pass to nucmer 436 Presets to pass to nucmer
382 tight = only align highly conserved regions 437 tight = only align highly conserved regions
383 normal = default ANIn parameters (default: normal) 438 normal = default ANIn parameters (default: normal)
384 439
385 ]]></token> 440 ]]></token>
386
387 <token name="@CLUSTERING_HELP@"><![CDATA[ 441 <token name="@CLUSTERING_HELP@"><![CDATA[
388 CLUSTERING PARAMETERS: 442 CLUSTERING PARAMETERS:
389 -pa P_ANI, --P_ani P_ANI 443 -pa P_ANI, --P_ani P_ANI
390 ANI threshold to form primary (MASH) clusters 444 ANI threshold to form primary (MASH) clusters
391 (default: 0.9) 445 (default: 0.9)
411 --clusterAlg CLUSTERALG 465 --clusterAlg CLUSTERALG
412 Algorithm used to cluster genomes (passed to 466 Algorithm used to cluster genomes (passed to
413 scipy.cluster.hierarchy.linkage (default: average) 467 scipy.cluster.hierarchy.linkage (default: average)
414 468
415 ]]></token> 469 ]]></token>
416
417 <token name="@SCORING_HELP@"><![CDATA[ 470 <token name="@SCORING_HELP@"><![CDATA[
418 SCORING CRITERIA 471 SCORING CRITERIA
419 Based off of the formula: 472 Based off of the formula:
420 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) 473 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
421 474
422 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight: 475 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight:
423 -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT 476 -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT
424 completeness weight (default: 1) 477 completeness weight (default: 1)
431 -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT 484 -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT
432 weight of log(genome size) (default: 0) 485 weight of log(genome size) (default: 0)
433 486
434 487
435 ]]></token> 488 ]]></token>
436
437 <token name="@TAXONOMY_HELP@"><![CDATA[ 489 <token name="@TAXONOMY_HELP@"><![CDATA[
438 TAXONOMY: 490 TAXONOMY:
439 --run_tax generate taxonomy information (Tdb) 491 --run_tax generate taxonomy information (Tdb)
440 (default: False) 492 (default: False)
441 493
455 path to centrifuge index (for example, 507 path to centrifuge index (for example,
456 /home/mattolm/download/centrifuge/indices/b+h+v 508 /home/mattolm/download/centrifuge/indices/b+h+v
457 (default: None) 509 (default: None)
458 510
459 ]]></token> 511 ]]></token>
460
461 <token name="@WARNINGS_HELP@"><![CDATA[ 512 <token name="@WARNINGS_HELP@"><![CDATA[
462 WARNINGS: 513 WARNINGS:
463 --warn_dist WARN_DIST 514 --warn_dist WARN_DIST
464 How far from the threshold to throw cluster warnings 515 How far from the threshold to throw cluster warnings
465 (default: 0.25) 516 (default: 0.25)
467 genomes (default: 0.98) 518 genomes (default: 0.98)
468 --warn_aln WARN_ALN Minimum aligned fraction for warnings between 519 --warn_aln WARN_ALN Minimum aligned fraction for warnings between
469 dereplicated genomes (ANIn) (default: 0.25) 520 dereplicated genomes (ANIn) (default: 0.25)
470 521
471 ]]></token> 522 ]]></token>
472
473
474 </macros> 523 </macros>