Mercurial > repos > iuc > drep_dereplicate
comparison macros.xml @ 1:ef7cd2e7bc05 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/drep commit 5e6e589002d554be180e575080e9ad66cc78ed74"
author | iuc |
---|---|
date | Sat, 12 Feb 2022 17:40:42 +0000 |
parents | 8dfcdbeaeed8 |
children | 368cb4bef9d8 |
comparison
equal
deleted
inserted
replaced
0:8dfcdbeaeed8 | 1:ef7cd2e7bc05 |
---|---|
1 <?xml version="1.0"?> | |
1 <macros> | 2 <macros> |
2 <token name="@VERSION@">2.5.4</token> | 3 <token name="@TOOL_VERSION@">3.2.2</token> |
4 <token name="@VERSION_SUFFIX@">0</token> | |
5 <token name="@PROFILE@">20.01</token> | |
6 <xml name="biotools"> | |
7 <xrefs> | |
8 <xref type="bio.tools">drep</xref> | |
9 </xrefs> | |
10 </xml> | |
3 <xml name="requirements"> | 11 <xml name="requirements"> |
4 <requirements> | 12 <requirements> |
5 <requirement type="package" version="@VERSION@">drep</requirement> | 13 <requirement type="package" version="@TOOL_VERSION@">drep</requirement> |
6 <yield/> | 14 <yield/> |
7 </requirements> | 15 </requirements> |
8 </xml> | 16 </xml> |
9 <xml name="citations"> | 17 <xml name="citations"> |
10 <citations> | 18 <citations> |
11 <citation type="doi">10.1038/ismej.2017.126</citation> | 19 <citation type="doi">10.1038/ismej.2017.126</citation> |
12 <yield /> | 20 <yield /> |
13 </citations> | 21 </citations> |
14 </xml> | 22 </xml> |
15 | 23 |
16 | |
17 <xml name="genomes"> | 24 <xml name="genomes"> |
18 <param argument="--genomes" type="data" format="fasta" label="genomes fasta files" multiple="true"/> | 25 <param argument="--genomes" type="data" format="fasta" multiple="true" label="Genomes to filter"/> |
19 </xml> | 26 </xml> |
27 | |
28 <!-- Addition of ".fasta" after names to avoid string to be read as integer | |
29 Bug in dRep: probably fixed in next version --> | |
20 <token name="@PREPARE_GENOMES@"><![CDATA[ | 30 <token name="@PREPARE_GENOMES@"><![CDATA[ |
21 #import re | 31 #import re |
22 #set $genomefiles = [] | 32 #set $genomefiles = [] |
23 #for $genome in $genomes | 33 #for $genome in $genomes |
24 #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1])) | 34 #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1])) |
25 ln -s '${genome}' '${input_name}' && | 35 ln -s '${genome}' '${input_name}.fasta' && |
26 $genomefiles.append($input_name) | 36 $genomefiles.append($input_name) |
27 #end for | 37 #end for |
28 ]]></token> | 38 ]]></token> |
29 <token name="@GENOMES@"><![CDATA[ | 39 <token name="@GENOMES@"><![CDATA[ |
30 -g | 40 -g |
31 #for $genomefile in $genomefiles | 41 #for $genomefile in $genomefiles |
32 '${genomefile}' | 42 '${genomefile}.fasta' |
33 #end for | 43 #end for |
34 ]]></token> | 44 ]]></token> |
35 | |
36 | |
37 <xml name="checkm_method"> | |
38 <param argument="--checkM_method" type="select" label="checkm method" optional="true"> | |
39 <option value="taxonomy_wf">taxonomy_wf (faster)</option> | |
40 <option value="lineage_wf">lineage_wf (more accurate)</option> | |
41 </param> | |
42 </xml> | |
43 <token name="@CHECKM_METHOD@"><![CDATA[ | |
44 #if $checkM_method: | |
45 --checkM_method $checkM_method | |
46 #end if | |
47 ]]></token> | |
48 | 45 |
49 <xml name="filtering_options"> | 46 <xml name="filtering_options"> |
50 <conditional name="filter"> | 47 <section name="filter" title="Genome filtering" expanded="true"> |
51 <param name="set_options" type="select" label="set filtering options"> | 48 <param argument="--length" type="integer" value="50000" label="Minimum genome length"/> |
52 <option value="yes">Yes</option> | 49 <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/> |
53 <option value="no" selected="true">No (use --checkM_method taxonomy_wf)</option> | 50 <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/> |
51 </section> | |
52 </xml> | |
53 <xml name="test_default_filtering_options"> | |
54 <section name="filter"> | |
55 <param name="length" value="50000"/> | |
56 <param name="completeness" value="75"/> | |
57 <param name="contamination" value="100"/> | |
58 </section> | |
59 </xml> | |
60 <token name="@FILTER_OPTIONS@"><![CDATA[ | |
61 --length $filter.length | |
62 --completeness $filter.completeness | |
63 --contamination $filter.contamination | |
64 ]]></token> | |
65 | |
66 <xml name="quality_assessment_options"> | |
67 <conditional name="quality"> | |
68 <param name="source" type="select" label="Genome quality filtering" help="Skipping checkM or quality filtering is not recommended, but is useful with bacteriophages or eukaryotes or things where checkM scoring does not work. Will only choose genomes based on length and N50."> | |
69 <option value="checkm" selected="true">Run checkM</option> | |
70 <option value="genomeInfo">Provide quality information on the genome (CSV file)</option> | |
71 <option value="ignoreGenomeQuality">Don't run checkM or do any quality filtering (--ignoreGenomeQuality) - NOT RECOMMENDED!</option> | |
54 </param> | 72 </param> |
55 <when value="yes"> | 73 <when value="checkm"> |
56 <param argument="--length" type="integer" value="50000" label="Minimum genome length"/> | 74 <param argument="--checkM_method" type="select" label="CheckM method"> |
57 <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/> | 75 <option value="lineage_wf" selected="true">lineage_wf: Lineage-specific Workflow - quality estimates with lineage-specific markers (more accurate)</option> |
58 <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/> | 76 <option value="taxonomy_wf">taxonomy_wf: Taxonomic-specific Workflow - quality estimates with taxonomic-specific markers (faster)</option> |
59 | 77 </param> |
60 <conditional name="quality"> | 78 <param argument="--set_recursion" type="integer" optional="true" label="Increases the python recursion limit" help="NOT RECOMMENDED unless checkM is crashing due to recursion issues. Recommended to set to 2000 if needed, but setting this could crash Python"/> |
61 <param argument="source" type="select" label="genome quality"> | 79 <param argument="--checkm_group_size" type="integer" value="2000" min="1" label="Number of genomes passed to checkM at a time" help="Increasing this increases RAM but makes checkM faster"/> |
62 <help> | |
63 --ignoreGenomeQuality is useful with | |
64 bacteriophages or eukaryotes or things where checkM | |
65 scoring does not work. Will only choose genomes based | |
66 on length and N50. | |
67 </help> | |
68 <option value="checkm" selected="true">Run checkM</option> | |
69 <option value="genomeInfo">User supplied genomeInfo csv file</option> | |
70 <option value="ignoreGenomeQuality">--ignoreGenomeQuality (NOT RECOMMENDED!)</option> | |
71 </param> | |
72 <when value="checkm"> | |
73 <param argument="--checkM_method" type="select" label="checkm method" optional="true"> | |
74 <help> | |
75 Using the checkm method of lineage_wf can require more than 40Gb of RAM. | |
76 </help> | |
77 <option value="taxonomy_wf">taxonomy_wf (faster)</option> | |
78 <option value="lineage_wf">lineage_wf (more accurate)</option> | |
79 </param> | |
80 </when> | |
81 <when value="genomeInfo"> | |
82 <param argument="--genomeInfo" type="data" format="csv" label="genomes fasta files"> | |
83 <help><![CDATA[ | |
84 A CSV dataset that must contain: [ | |
85 "genome"(history dataset name of .fasta dataset of that genome), | |
86 "completeness"(0-100 value for completeness of the genome), | |
87 "contamination"(0-100 value of the contamination of the genome)] | |
88 ]]></help> | |
89 </param> | |
90 </when> | |
91 <when value="ignoreGenomeQuality"/> | |
92 </conditional> | |
93 </when> | 80 </when> |
94 <when value="no"/> | 81 <when value="genomeInfo"> |
95 </conditional> | 82 <param argument="--genomeInfo" type="data" format="csv" label="Quality information on the genomes"> |
96 </xml> | 83 <help><![CDATA[ |
97 <token name="@FILTER_OPTIONS@"><![CDATA[ | 84 A CSV dataset that must contain: [ |
98 #if $filter.set_options == 'yes': | 85 "genome"(history dataset name of .fasta dataset of that genome), |
99 --length $filter.length | 86 "completeness"(0-100 value for completeness of the genome), |
100 --completeness $filter.completeness | 87 "contamination"(0-100 value of the contamination of the genome)] |
101 --contamination $filter.contamination | 88 ]]></help> |
102 #if $filter.quality.source == 'checkm' | |
103 --checkM_method $filter.quality.checkM_method | |
104 #elif $filter.quality.source == 'genomeInfo' | |
105 --genomeInfo $filter.quality.genomeInfo | |
106 #elif $filter.quality.source == 'ignoreGenomeQuality' | |
107 --ignoreGenomeQuality | |
108 #end if | |
109 #else | |
110 --checkM_method taxonomy_wf | |
111 #end if | |
112 ]]></token> | |
113 | |
114 <xml name="genome_comparison_options"> | |
115 <conditional name="genome_comparison"> | |
116 <param name="set_options" type="select" label="set genome comparison options"> | |
117 <option value="yes">Yes</option> | |
118 <option value="no" selected="true">No</option> | |
119 </param> | |
120 <when value="yes"> | |
121 <param argument="--MASH_sketch" type="integer" value="1000" label="MASH sketch size"/> | |
122 <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comaprisons"> | |
123 <option value="ANImf" selected="true">ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions</option> | |
124 <option value="ANIn">ANIn = Align whole genomes with nucmer; compare aligned regions</option> | |
125 <option value="gANI">gANI = Identify and align ORFs; compare aligned ORFS</option> | |
126 </param> | |
127 <param argument="-n_PRESET" type="select" label="Presets to pass to nucmer"> | |
128 <option value="normal" selected="true">normal = default ANIn parameters (default: normal)</option> | |
129 <option value="tight">tight = only align highly conserved regions</option> | |
130 </param> | 89 </param> |
131 </when> | 90 </when> |
132 <when value="no"/> | 91 <when value="ignoreGenomeQuality"/> |
133 </conditional> | 92 </conditional> |
134 </xml> | 93 </xml> |
135 <token name="@GENOME_COMPARISON_OPTIONS@"><![CDATA[ | 94 <xml name="test_default_quality_assessment_options"> |
136 #if $genome_comparison.set_options == 'yes': | 95 <conditional name="quality"> |
137 --MASH_sketch $genome_comparison.MASH_sketch | 96 <param name="source" value="checkm"/> |
138 --S_algorithm $genome_comparison.S_algorithm | 97 <param name="checkM_method" value="taxonomy_wf"/> |
139 -n_PRESET $genome_comparison.n_PRESET | 98 <param name="checkm_group_size" value="2000"/> |
140 #end if | 99 </conditional> |
141 ]]></token> | 100 </xml> |
142 | 101 <token name="@QUALITY_ASSESSMENT_OPTIONS@"><![CDATA[ |
143 <xml name="clustering_options"> | 102 #if $quality.source == 'checkm' |
103 --checkM_method '$quality.checkM_method' | |
104 #if str($quality.set_recursion) != '' | |
105 --set_recursion $quality.set_recursion | |
106 #end if | |
107 --checkm_group_size $quality.checkm_group_size | |
108 #else if $quality.source == 'genomeInfo' | |
109 --genomeInfo '$quality.genomeInfo' | |
110 #else if $quality.source == 'ignoreGenomeQuality' | |
111 --ignoreGenomeQuality | |
112 #end if | |
113 ]]></token> | |
114 | |
115 <xml name="mash"> | |
116 <param argument="--MASH_sketch" type="integer" value="1000" min="0" label="MASH sketch size"/> | |
117 <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary clusters"/> | |
118 <param argument="--multiround_primary_clustering" type='boolean' checked="false" truevalue='--multiround_primary_clustering' falsevalue='' label="Cluster each primary chunk separately and merge at the end with single linkage?" help="Decreases RAM usage and increases speed, at the cost of a minor loss in precision and the inability to plot primary_clustering_dendrograms. Especially helpful when clustering 5000+ genomes. Will be done with single linkage clustering"/> | |
119 <param argument="--primary_chunksize" type="integer" value="5000" min="1" label="Impacts multiround_primary_clusterings" help=" If you have more than this many genomes, process them in chunks of this size"/> | |
120 </xml> | |
121 <xml name="test_default_mash"> | |
122 <param name="MASH_sketch" value="1000"/> | |
123 <param name="P_ani" value="0.9"/> | |
124 <param name="multiround_primary_clustering" value=''/> | |
125 <param name="primary_chunksize" value="5000"/> | |
126 </xml> | |
127 <token name="@MASH@"><![CDATA[ | |
128 --MASH_sketch '$comp_clust.steps.MASH_sketch' | |
129 --P_ani $comp_clust.steps.P_ani | |
130 $comp_clust.steps.multiround_primary_clustering | |
131 --primary_chunksize $comp_clust.steps.primary_chunksize | |
132 ]]></token> | |
133 | |
134 <xml name="nucmer"> | |
135 <param argument="--n_PRESET" type="select" label="Presets to pass to nucmer"> | |
136 <option value="normal" selected="true">normal: default ANIn parameters</option> | |
137 <option value="tight">tight: only align highly conserved regions</option> | |
138 </param> | |
139 </xml> | |
140 <xml name="test_default_nucmer"> | |
141 <param name="n_PRESET" value="normal"/> | |
142 </xml> | |
143 <token name="@NUCMER@"><![CDATA[ | |
144 --n_PRESET '$comp_clust.steps.clustering.n_PRESET' | |
145 ]]></token> | |
146 | |
147 <xml name="coverage_method"> | |
148 <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment"> | |
149 <option value="larger" selected="true">Larger = max((aligned length / genome 1), (aligned_length / genome2))</option> | |
150 <option value="total">Total = 2*(aligned length) / (sum of total genome lengths)</option> | |
151 </param> | |
152 </xml> | |
153 <xml name="test_default_coverage_method"> | |
154 <param name="coverage_method" value="larger"/> | |
155 </xml> | |
156 <token name="@COVERAGE_METHOD@"><![CDATA[ | |
157 --coverage_method '$comp_clust.steps.clustering.coverage_method' | |
158 ]]></token> | |
159 | |
160 <xml name="secondary_clustering"> | |
144 <conditional name="clustering"> | 161 <conditional name="clustering"> |
145 <param name="set_options" type="select" label="set clustering options"> | 162 <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comparisons"> |
146 <option value="yes">Yes</option> | 163 <option value="fastANI">fastANI: Kmer-based approach - very fast</option> |
147 <option value="no" selected="true">No</option> | 164 <option value="ANImf" selected="true">ANImf: Align whole genomes with nucmer; filter alignment; compare aligned regions - RECOMMENDED</option> |
165 <option value="ANIn">ANIn: Align whole genomes with nucmer; compare aligned regions</option> | |
166 <option value="gANI">gANI: Identify and align ORFs; compare aligned ORFS</option> | |
167 <option value="goANI">goANI: Open source version of gANI; requires nsimscan</option> | |
148 </param> | 168 </param> |
149 <when value="yes"> | 169 <when value="fastANI"> |
150 <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary (MASH) clusters"/> | 170 <param argument="--greedy_secondary_clustering" type='boolean' checked="false" truevalue='--greedy_secondary_clustering' falsevalue='' label="Use a heuristic to avoid pair-wise comparisons when doing secondary clustering?" help="Will be done with single linkage clustering"/> |
151 <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/> | 171 </when> |
152 | 172 <when value="ANImf"> |
153 <param argument="--SkipMash" type="boolean" truevalue="--SkipMash" falsevalue="" checked="false" label="Skip MASH clustering, just do secondary clustering on all genomes"/> | 173 <expand macro="nucmer"/> |
154 <param argument="--SkipSecondary" type="boolean" truevalue="--SkipSecondary" falsevalue="" checked="false" label="Skip secondary clustering, just perform MASH clustering"/> | 174 <expand macro="coverage_method"/> |
155 <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minmum level of overlap between genomes when doing secondary comparisons"/> | 175 </when> |
156 <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment"> | 176 <when value="ANIn"> |
157 <help>(for ANIn/ANImf only; gANI can only do larger method)</help> | 177 <expand macro="nucmer"/> |
158 <option value="larger" selected="true">arger = max((aligned length / genome 1), (aligned_length / genome2))</option> | 178 <expand macro="coverage_method"/> |
159 <option value="total">total = 2*(aligned length) / (sum of total genome lengths)</option> | 179 </when> |
180 <when value="gANI"/> | |
181 <when value="goANI"/> | |
182 </conditional> | |
183 <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/> | |
184 <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minimum level of overlap between genomes when doing secondary comparisons"/> | |
185 </xml> | |
186 <xml name="test_default_secondary_clustering"> | |
187 <conditional name="clustering"> | |
188 <param name="S_algorithm" value="ANImf"/> | |
189 <expand macro="test_default_nucmer"/> | |
190 <expand macro="test_default_coverage_method"/> | |
191 </conditional> | |
192 <param name="S_ani" value="0.99"/> | |
193 <param name="cov_thresh" value="0.1"/> | |
194 </xml> | |
195 <token name="@SECONDARY_CLUSTERING@"><![CDATA[ | |
196 --S_algorithm '$comp_clust.steps.clustering.S_algorithm' | |
197 #if $comp_clust.steps.clustering.S_algorithm == 'fastANI' | |
198 $comp_clust.steps.clustering.greedy_secondary_clustering | |
199 #else if $comp_clust.steps.clustering.S_algorithm == 'ANImf' | |
200 @NUCMER@ | |
201 @COVERAGE_METHOD@ | |
202 #else if $comp_clust.steps.clustering.S_algorithm == 'ANIn' | |
203 @NUCMER@ | |
204 @COVERAGE_METHOD@ | |
205 #end if | |
206 --S_ani $comp_clust.steps.S_ani | |
207 --cov_thresh $comp_clust.steps.cov_thresh | |
208 ]]></token> | |
209 | |
210 <xml name="comparison_clustering_options"> | |
211 <section name="comp_clust" title="Genome comparison and clustering" expanded="false"> | |
212 <conditional name="steps"> | |
213 <param name="select" type="select" label="Steps in genome comparison"> | |
214 <option value="default" selected="true">Default: Run MASH clustering and a secondary clustering</option> | |
215 <option value="SkipMash">Skip MASH clustering, just do secondary clustering on all genomes</option> | |
216 <option value="SkipSecondary">Skip secondary clustering, just perform MASH clustering</option> | |
160 </param> | 217 </param> |
161 <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes"> | 218 <when value="default"> |
162 <help>(passed to scipy.cluster.hierarchy.linkage)</help> | 219 <expand macro="mash"/> |
163 <option value="average" selected="true">average</option> | 220 <expand macro="secondary_clustering"/> |
164 </param> | 221 </when> |
165 </when> | 222 <when value="SkipMash"> |
166 <when value="no"/> | 223 <expand macro="secondary_clustering"/> |
167 </conditional> | 224 </when> |
168 </xml> | 225 <when value="SkipSecondary"> |
169 <token name="@CLUSTERING_OPTIONS@"><![CDATA[ | 226 <expand macro="mash"/> |
170 #if $clustering.set_options == 'yes': | 227 </when> |
171 --P_ani $clustering.P_ani | 228 </conditional> |
172 --S_ani $clustering.S_ani | 229 <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes" help="Passed to scipy.cluster.hierarchy.linkage"> |
173 $clustering.SkipMash | 230 <option value="average" selected="true">average</option> |
174 $clustering.SkipSecondary | 231 <option value="ward">ward</option> |
175 --cov_thresh $clustering.cov_thresh | 232 <option value="single">single</option> |
176 --coverage_method $clustering.coverage_method | 233 <option value="median">median</option> |
177 --clusterAlg $clustering.clusterAlg | 234 <option value="centroid">centroid</option> |
178 #end if | 235 <option value="weighted">weighted</option> |
236 </param> | |
237 <param argument="--run_tertiary_clustering" type='boolean' checked="false" truevalue='--run_tertiary_clustering' falsevalue='' label="Run an additional round of clustering on the final genome set?" help="This is especially useful when greedy clustering is performed and/or to handle cases where similar genomes end up in different primary clusters."/> | |
238 </section> | |
239 </xml> | |
240 <xml name="test_default_comparison_clustering_options"> | |
241 <section name="comp_clust"> | |
242 <conditional name="steps"> | |
243 <param name="select" value="default" /> | |
244 <expand macro="test_default_mash"/> | |
245 <expand macro="test_default_secondary_clustering"/> | |
246 </conditional> | |
247 <param name="clusterAlg" value="average"/> | |
248 <param name="run_tertiary_clustering" value=''/> | |
249 </section> | |
250 </xml> | |
251 <token name="@COMPARISON_CLUSTERING_OPTIONS@"><![CDATA[ | |
252 #if $comp_clust.steps.select == 'default' | |
253 @MASH@ | |
254 @SECONDARY_CLUSTERING@ | |
255 #else if $comp_clust.steps.select == 'SkipMash' | |
256 --SkipMash | |
257 @SECONDARY_CLUSTERING@ | |
258 #else | |
259 @MASH@ | |
260 --SkipSecondary | |
261 #end if | |
262 --clusterAlg '$comp_clust.clusterAlg' | |
263 $comp_clust.run_tertiary_clustering | |
179 ]]></token> | 264 ]]></token> |
180 | 265 |
181 <xml name="scoring_options"> | 266 <xml name="scoring_options"> |
182 <conditional name="scoring"> | 267 <section name="scoring" title="Scoring criteria" expanded="false" help="Based off of the formula: A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) + F*(centrality - S_ani). With A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight; F = cent_weight"> |
183 <param name="set_options" type="select" label="set scoring options"> | 268 <param argument="--completeness_weight" type="float" value="1" label="Completeness weight"/> |
184 <option value="yes">Yes</option> | 269 <param argument="--contamination_weight" type="float" value="5" label="Contamination weight"/> |
185 <option value="no" selected="true">No</option> | 270 <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="Strain heterogeneity weight"/> |
186 </param> | 271 <param argument="--N50_weight" type="float" value=".5" label="Weight of log(genome N50)"/> |
187 <when value="yes"> | 272 <param argument="--size_weight" type="float" value="0" label="Weight of log(genome size)"/> |
188 <param argument="--completeness_weight" type="float" value="1" label="completeness weight"> | 273 <param argument="--centrality_weight" type="float" value="1" label="Weight of (centrality - S_ani)"/> |
189 <help> | 274 </section> |
190 Based off of the formula: | 275 </xml> |
191 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) | 276 <xml name="test_default_scoring_options"> |
192 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight; | 277 <section name="scoring"> |
193 </help> | 278 <param name="completeness_weight" value="1"/> |
194 </param> | 279 <param name="contamination_weight" value="5"/> |
195 <param argument="--contamination_weight" type="float" value="5" label="contamination weight"/> | 280 <param name="strain_heterogeneity_weight" value="1"/> |
196 <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="strain heterogeneity weight"/> | 281 <param name="N50_weight" value=".5" /> |
197 <param argument="--N50_weight" type="float" value=".5" label="weight of log(genome N50)"/> | 282 <param name="size_weight" value="0"/> |
198 <param argument="--size_weight" type="float" value="0" label="weight of log(genome size)"/> | 283 <param name="centrality_weight" value="1"/> |
199 </when> | 284 </section> |
200 <when value="no"/> | |
201 </conditional> | |
202 </xml> | 285 </xml> |
203 <token name="@SCORING_OPTIONS@"><![CDATA[ | 286 <token name="@SCORING_OPTIONS@"><![CDATA[ |
204 #if $scoring.set_options == 'yes': | 287 --completeness_weight $scoring.completeness_weight |
205 --completeness_weight $scoring.completeness_weight | 288 --contamination_weight $scoring.contamination_weight |
206 --contamination_weight $scoring.contamination_weight | 289 --strain_heterogeneity_weight $scoring.strain_heterogeneity_weight |
207 --strain_heterogeneity_weight $scoring.strain_heterogeneity_weight | 290 --N50_weight $scoring.N50_weight |
208 --N50_weight $scoring.N50_weight | 291 --size_weight $scoring.size_weight |
209 --size_weight $scoring.size_weight | 292 --centrality_weight $scoring.centrality_weight |
210 #end if | 293 ]]></token> |
211 ]]></token> | 294 |
212 | 295 <xml name="warning_options"> |
213 <xml name="taxonomy_options"> | 296 <section name="warning" title="Warnings" expanded="false"> |
214 <conditional name="taxonomy"> | 297 <param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/> |
215 <param name="set_options" type="select" label="generate taxonomy information"> | 298 <param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/> |
216 <option value="yes">Yes</option> | 299 <param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/> |
217 <option value="no" selected="true">No</option> | 300 </section> |
218 </param> | 301 </xml> |
219 <when value="yes"> | 302 <xml name="test_default_warning_options"> |
220 <param argument="--tax_method" type="select" label="Method of determining taxonomy"> | 303 <section name="warning"> |
221 <help>(for ANIn/ANImf only; gANI can only do larger method)</help> | 304 <param name="warn_dist" value="0.25"/> |
222 <option value="percent" selected="true">percent = The most descriptive taxonimic level with at least (per) hits</option> | 305 <param name="warn_sim" value="0.98"/> |
223 <option value="max">max = The centrifuge taxonomic level with the most overall hits</option> | 306 <param name="warn_aln" value="0.25"/> |
224 </param> | 307 </section> |
225 <param argument="--percent" type="float" value="50" min="0" max="100" label="minimum percent for percent method"/> | |
226 <param argument="--cent_index" type="data" format="" label="centrifuge index"/> | |
227 </when> | |
228 <when value="no"/> | |
229 </conditional> | |
230 </xml> | |
231 <token name="@TAXONOMY_OPTIONS@"><![CDATA[ | |
232 #if $taxonomy.set_options == 'yes': | |
233 --run_tax | |
234 --tax_method $taxonomy.tax_method | |
235 --percent $taxonomy.percent | |
236 --cent_index $taxonomy.cent_index | |
237 #end if | |
238 ]]></token> | |
239 | |
240 <xml name="warning_options"> | |
241 <conditional name="warning"> | |
242 <param name="set_options" type="select" label="set warning options"> | |
243 <option value="yes">Yes</option> | |
244 <option value="no" selected="true">No</option> | |
245 </param> | |
246 <when value="yes"> | |
247 <param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/> | |
248 <param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/> | |
249 <param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/> | |
250 </when> | |
251 <when value="no"/> | |
252 </conditional> | |
253 </xml> | 308 </xml> |
254 <token name="@WARNING_OPTIONS@"><![CDATA[ | 309 <token name="@WARNING_OPTIONS@"><![CDATA[ |
255 #if $warning.set_options == 'yes': | 310 --warn_dist $warning.warn_dist |
256 --warn_dist $warning.warn_dist | 311 --warn_sim $warning.warn_sim |
257 --warn_sim $warning.warn_sim | 312 --warn_aln $warning.warn_aln |
258 --warn_aln $warning.warn_aln | |
259 #end if | |
260 ]]></token> | 313 ]]></token> |
261 | 314 |
262 <xml name="select_outputs"> | 315 <xml name="select_outputs"> |
263 <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs"> | 316 <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs"> |
264 <option value="log" selected="true">log</option> | 317 <option value="log" selected="true">log</option> |
276 <option value="Winning_genomes">Winning_genomes.pdf</option> | 329 <option value="Winning_genomes">Winning_genomes.pdf</option> |
277 <option value="Widb">Widb.csv</option> | 330 <option value="Widb">Widb.csv</option> |
278 <option value="Chdb">Chdb.tsv</option> | 331 <option value="Chdb">Chdb.tsv</option> |
279 </expand> | 332 </expand> |
280 </xml> | 333 </xml> |
281 | 334 <xml name="test_default_select_drep_outputs"> |
282 <xml name="common_outputs"> | 335 <param name="select_outputs" value="log,warnings,Primary_clustering_dendrogram,Clustering_scatterplots,Cluster_scoring,Winning_genomes,Widb" /> |
336 </xml> | |
337 <xml name="test_default_select_outputs"> | |
338 <param name="select_outputs" value="log,warnings,Primary_clustering_dendrogram,Clustering_scatterplots" /> | |
339 </xml> | |
340 | |
341 <xml name="common_outputs"> | |
283 <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log"> | 342 <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log"> |
284 <filter>'log' in select_outputs or not select_outputs</filter> | 343 <filter>'log' in select_outputs or not select_outputs</filter> |
285 </data> | 344 </data> |
286 <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt"> | 345 <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt"> |
287 <filter>'warnings' in select_outputs</filter> | 346 <filter>'warnings' in select_outputs</filter> |
297 </data> | 356 </data> |
298 <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf"> | 357 <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf"> |
299 <filter>'Clustering_scatterplots' in select_outputs</filter> | 358 <filter>'Clustering_scatterplots' in select_outputs</filter> |
300 </data> | 359 </data> |
301 </xml> | 360 </xml> |
302 | |
303 | |
304 <xml name="drep_outputs"> | 361 <xml name="drep_outputs"> |
305 <expand macro="common_outputs"/> | 362 <expand macro="common_outputs"/> |
306 <data name="Cluster_scoring" format="pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf"> | 363 <data name="Cluster_scoring" format="pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf"> |
307 <filter>'Cluster_scoring' in select_outputs</filter> | 364 <filter>'Cluster_scoring' in select_outputs</filter> |
308 </data> | 365 </data> |
314 </data> | 371 </data> |
315 <data name="Chdb" format="tabular" label="${tool.name} on ${on_string}: Chdb.tsv" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv"> | 372 <data name="Chdb" format="tabular" label="${tool.name} on ${on_string}: Chdb.tsv" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv"> |
316 <filter>'Chdb' in select_outputs</filter> | 373 <filter>'Chdb' in select_outputs</filter> |
317 </data> | 374 </data> |
318 </xml> | 375 </xml> |
319 | 376 <xml name="test_string_inputs"> |
320 | 377 <param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/> |
321 <xml name="test_defaults_log"> | 378 </xml> |
322 <test> | 379 <xml name="test_integer_inputs"> |
323 <param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/> | 380 <param name="genomes" ftype="fasta" value="001,002,003"/> |
324 <output name="log"> | 381 </xml> |
325 <assert_contents> | 382 <xml name="test_log_output"> |
326 <yield/> | 383 <output name="log"> |
327 </assert_contents> | 384 <assert_contents> |
328 </output> | 385 <yield/> |
329 </test> | 386 </assert_contents> |
330 </xml> | 387 </output> |
331 | 388 </xml> |
332 <token name="@GENOMES_HELP@"><![CDATA[ | 389 <token name="@GENOMES_HELP@"><![CDATA[ |
333 I/O PARAMETERS: | 390 I/O PARAMETERS: |
334 -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]] | 391 -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]] |
335 genomes to cluster in .fasta format | 392 genomes to cluster in .fasta format |
336 (default: None) | 393 (default: None) |
337 | 394 |
338 | 395 |
339 ]]></token> | 396 ]]></token> |
340 | |
341 <token name="@FILTERING_HELP@"><![CDATA[ | 397 <token name="@FILTERING_HELP@"><![CDATA[ |
342 FILTERING OPTIONS: | 398 FILTERING OPTIONS: |
343 -l LENGTH, --length LENGTH | 399 -l LENGTH, --length LENGTH |
344 Minimum genome length | 400 Minimum genome length |
345 (default: 50000) | 401 (default: 50000) |
362 scoring does not work. Will only choose genomes based | 418 scoring does not work. Will only choose genomes based |
363 on length and N50 (default: False) | 419 on length and N50 (default: False) |
364 | 420 |
365 | 421 |
366 ]]></token> | 422 ]]></token> |
367 | |
368 <token name="@GENOME_COMPARISON_HELP@"><![CDATA[ | 423 <token name="@GENOME_COMPARISON_HELP@"><![CDATA[ |
369 GENOME COMPARISON PARAMETERS: | 424 GENOME COMPARISON PARAMETERS: |
370 -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH | 425 -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH |
371 MASH sketch size (default: 1000) | 426 MASH sketch size (default: 1000) |
372 | 427 |
381 Presets to pass to nucmer | 436 Presets to pass to nucmer |
382 tight = only align highly conserved regions | 437 tight = only align highly conserved regions |
383 normal = default ANIn parameters (default: normal) | 438 normal = default ANIn parameters (default: normal) |
384 | 439 |
385 ]]></token> | 440 ]]></token> |
386 | |
387 <token name="@CLUSTERING_HELP@"><![CDATA[ | 441 <token name="@CLUSTERING_HELP@"><![CDATA[ |
388 CLUSTERING PARAMETERS: | 442 CLUSTERING PARAMETERS: |
389 -pa P_ANI, --P_ani P_ANI | 443 -pa P_ANI, --P_ani P_ANI |
390 ANI threshold to form primary (MASH) clusters | 444 ANI threshold to form primary (MASH) clusters |
391 (default: 0.9) | 445 (default: 0.9) |
411 --clusterAlg CLUSTERALG | 465 --clusterAlg CLUSTERALG |
412 Algorithm used to cluster genomes (passed to | 466 Algorithm used to cluster genomes (passed to |
413 scipy.cluster.hierarchy.linkage) (default: average) | 467 scipy.cluster.hierarchy.linkage) (default: average) |
414 | 468 |
415 ]]></token> | 469 ]]></token> |
416 | |
417 <token name="@SCORING_HELP@"><![CDATA[ | 470 <token name="@SCORING_HELP@"><![CDATA[ |
418 SCORING CRITERIA | 471 SCORING CRITERIA |
419 Based off of the formula: | 472 Based off of the formula: |
420 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) | 473 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) |
421 | 474 |
422 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight: | 475 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight: |
423 -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT | 476 -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT |
424 completeness weight (default: 1) | 477 completeness weight (default: 1) |
431 -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT | 484 -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT |
432 weight of log(genome size) (default: 0) | 485 weight of log(genome size) (default: 0) |
433 | 486 |
434 | 487 |
435 ]]></token> | 488 ]]></token> |
436 | |
437 <token name="@TAXONOMY_HELP@"><![CDATA[ | 489 <token name="@TAXONOMY_HELP@"><![CDATA[ |
438 TAXONOMY: | 490 TAXONOMY: |
439 --run_tax generate taxonomy information (Tdb) | 491 --run_tax generate taxonomy information (Tdb) |
440 (default: False) | 492 (default: False) |
441 | 493 |
455 path to centrifuge index (for example, | 507 path to centrifuge index (for example, |
456 /home/mattolm/download/centrifuge/indices/b+h+v) | 508 /home/mattolm/download/centrifuge/indices/b+h+v) |
457 (default: None) | 509 (default: None) |
458 | 510 |
459 ]]></token> | 511 ]]></token> |
460 | |
461 <token name="@WARNINGS_HELP@"><![CDATA[ | 512 <token name="@WARNINGS_HELP@"><![CDATA[ |
462 WARNINGS: | 513 WARNINGS: |
463 --warn_dist WARN_DIST | 514 --warn_dist WARN_DIST |
464 How far from the threshold to throw cluster warnings | 515 How far from the threshold to throw cluster warnings |
465 (default: 0.25) | 516 (default: 0.25) |
467 genomes (default: 0.98) | 518 genomes (default: 0.98) |
468 --warn_aln WARN_ALN Minimum aligned fraction for warnings between | 519 --warn_aln WARN_ALN Minimum aligned fraction for warnings between |
469 dereplicated genomes (ANIn) (default: 0.25) | 520 dereplicated genomes (ANIn) (default: 0.25) |
470 | 521 |
471 ]]></token> | 522 ]]></token> |
472 | |
473 | |
474 </macros> | 523 </macros> |