drep_dereplicate: macros.xml comparison

comparison macros.xml @ 1:ef7cd2e7bc05 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/drep commit 5e6e589002d554be180e575080e9ad66cc78ed74"

author	iuc
date	Sat, 12 Feb 2022 17:40:42 +0000
parents	8dfcdbeaeed8
children	368cb4bef9d8

comparison

equal deleted inserted replaced

-:8dfcdbeaeed8
+:ef7cd2e7bc05
+<?xml version="1.0"?>
 <macros>
-<token name="@VERSION@">2.5.4</token>
+<token name="@TOOL_VERSION@">3.2.2</token>
+<token name="@VERSION_SUFFIX@">0</token>
+<token name="@PROFILE@">20.01</token>
+<xml name="biotools">
+<xrefs>
+<xref type="bio.tools">drep</xref>
+</xrefs>
+</xml>
 <xml name="requirements">
 <requirements>
-<requirement type="package" version="@VERSION@">drep</requirement>
+<requirement type="package" version="@TOOL_VERSION@">drep</requirement>
 <yield/>
 </requirements>
 </xml>
 <xml name="citations">
 <citations>
 <citation type="doi">10.1038/ismej.2017.126</citation>
 <yield />
 </citations>
 </xml>
 <xml name="genomes">
-<param argument="--genomes" type="data" format="fasta" label="genomes fasta files" multiple="true"/>
+<param argument="--genomes" type="data" format="fasta" multiple="true" label="Genomes to filer"/>
 </xml>
+<!-- Append ".fasta" to the linked genome names so that a purely numeric name is not read as an integer.
+This works around a bug in dRep that will probably be fixed in its next version. -->
 <token name="@PREPARE_GENOMES@"><![CDATA[
 #import re
 #set $genomefiles = []
 #for $genome in $genomes
 #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1]))
-ln -s '${genome}' '${input_name}' &&
+ln -s '${genome}' '${input_name}.fasta' &&
 $genomefiles.append($input_name)
 #end for
 ]]></token>
 <token name="@GENOMES@"><![CDATA[
 -g
 #for $genomefile in $genomefiles
-'${genomefile}'
+'${genomefile}.fasta'
 #end for
 ]]></token>
-<xml name="checkm_method">
-<param argument="--checkM_method" type="select" label="checkm method" optional="true">
-<option value="taxonomy_wf">taxonomy_wf (faster)</option>
-<option value="lineage_wf">lineage_wf (more accurate)</option>
-</param>
-</xml>
-<token name="@CHECKM_METHOD@"><![CDATA[
-#if $checkM_method:
---checkM_method $checkM_method
-#end if
-]]></token>
 <xml name="filtering_options">
-<conditional name="filter">
+<section name="filter" title="Genome filtering" expanded="true">
-<param name="set_options" type="select" label="set filtering options">
+<param argument="--length" type="integer" value="50000" label="Minimum genome length"/>
-<option value="yes">Yes</option>
+<param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/>
-<option value="no" selected="true">No (use --checkM_method taxonomy_wf)</option>
+<param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/>
+</section>
+</xml>
+<xml name="test_default_filtering_options">
+<section name="filter">
+<param name="length" value="50000"/>
+<param name="completeness" value="75"/>
+<param name="contamination" value="100"/>
+</section>
+</xml>
+<token name="@FILTER_OPTIONS@"><![CDATA[
+--length $filter.length
+--completeness $filter.completeness
+--contamination $filter.contamination
+]]></token>
+<xml name="quality_assessment_options">
+<conditional name="quality">
+<!-- Selector of the "quality" conditional: run checkM, supply a genome quality CSV, or skip quality filtering. -->
+<param name="source" type="select" label="Genome quality filtering" help="Skipping checkM and quality filtering is not recommended, except with bacteriophages or eukaryotes or things where checkM scoring does not work. Will only choose genomes based on length and N50.">
+<option value="checkm" selected="true">Run checkM</option>
+<option value="genomeInfo">Provide quality information on the genome (CSV file)</option>
+<option value="ignoreGenomeQuality">Don't run checkM or do any quality filtering (--ignoreGenomeQuality) - NOT RECOMMENDED!</option>
 </param>
-<when value="yes">
+<when value="checkm">
-<param argument="--length" type="integer" value="50000" label="Minimum genome length"/>
+<param argument="--checkM_method" type="select" label="CheckM method">
-<param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/>
+<option value="lineage_wf" selected="true">lineage_wf: Lineage-specific Workflow - quality estimates with lineage-specific markers (more accurate)</option>
-<param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/>
+<option value="taxonomy_wf">taxonomy_wf: Taxonomic-specific Workflow - quality estimates with taxonomic-specific markers (faster)</option>
+</param>
-<conditional name="quality">
+<param argument="--set_recursion" type="integer" optional="true" label="Increases the python recursion limit" help="NOT RECOMMENDED unless checkM is crashing due to recursion issues. Recommended to set to 2000 if needed, but setting this could crash Python"/>
-<param argument="source" type="select" label="genome quality">
+<param argument="--checkm_group_size" type="integer" value="2000" min="1" label="Number of genomes passed to checkM at a time" help="Increasing this increases RAM but makes checkM faster"/>
-<help>
---ignoreGenomeQuality is useful with
-bacteriophages or eukaryotes or things where checkM
-scoring does not work. Will only choose genomes based
-on length and N50.
-</help>
-<option value="checkm" selected="true">Run checkM</option>
-<option value="genomeInfo">User supplied genomeInfo csv file</option>
-<option value="ignoreGenomeQuality">--ignoreGenomeQuality (NOT RECOMMENDED!)</option>
-</param>
-<when value="checkm">
-<param argument="--checkM_method" type="select" label="checkm method" optional="true">
-<help>
-Using the checkm method of lineage_wf can require more than 40Gb of RAM.
-</help>
-<option value="taxonomy_wf">taxonomy_wf (faster)</option>
-<option value="lineage_wf">lineage_wf (more accurate)</option>
-</param>
-</when>
-<when value="genomeInfo">
-<param argument="--genomeInfo" type="data" format="csv" label="genomes fasta files">
-<help><![CDATA[
-A CSV dataset that must contain: [
-"genome"(history dataset name of .fasta dataset of that genome),
-"completeness"(0-100 value for completeness of the genome),
-"contamination"(0-100 value of the contamination of the genome)]
-]]></help>
-</param>
-</when>
-<when value="ignoreGenomeQuality"/>
-</conditional>
 </when>
-<when value="no"/>
+<when value="genomeInfo">
-</conditional>
+<param argument="--genomeInfo" type="data" format="csv" label="Quality information on the genomes">
-</xml>
+<help><![CDATA[
-<token name="@FILTER_OPTIONS@"><![CDATA[
+A CSV dataset that must contain: [
-#if $filter.set_options == 'yes':
+"genome"(history dataset name of .fasta dataset of that genome),
---length $filter.length
+"completeness"(0-100 value for completeness of the genome),
---completeness $filter.completeness
+"contamination"(0-100 value of the contamination of the genome)]
---contamination $filter.contamination
+]]></help>
-#if $filter.quality.source == 'checkm'
---checkM_method $filter.quality.checkM_method
-#elif $filter.quality.source == 'genomeInfo'
---genomeInfo $filter.quality.genomeInfo
-#elif $filter.quality.source == 'ignoreGenomeQuality'
---ignoreGenomeQuality
-#end if
-#else
---checkM_method taxonomy_wf
-#end if
-]]></token>
-<xml name="genome_comparison_options">
-<conditional name="genome_comparison">
-<param name="set_options" type="select" label="set genome comparison options">
-<option value="yes">Yes</option>
-<option value="no" selected="true">No</option>
-</param>
-<when value="yes">
-<param argument="--MASH_sketch" type="integer" value="1000" label="MASH sketch size"/>
-<param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comaprisons">
-<option value="ANImf" selected="true">ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions</option>
-<option value="ANIn">ANIn  = Align whole genomes with nucmer; compare aligned regions</option>
-<option value="gANI">gANI  = Identify and align ORFs; compare aligned ORFS</option>
-</param>
-<param argument="-n_PRESET" type="select" label="Presets to pass to nucmer">
-<option value="normal" selected="true">normal  = default ANIn parameters (default: normal)</option>
-<option value="tight">tight   = only align highly conserved regions</option>
 </param>
 </when>
-<when value="no"/>
+<when value="ignoreGenomeQuality"/>
 </conditional>
 </xml>
-<token name="@GENOME_COMPARISON_OPTIONS@"><![CDATA[
+<xml name="test_default_quality_assessment_options">
-#if $genome_comparison.set_options == 'yes':
+<conditional name="quality">
---MASH_sketch $genome_comparison.MASH_sketch
+<param name="source" value="checkm"/>
---S_algorithm $genome_comparison.S_algorithm
+<param name="checkM_method" value="taxonomy_wf"/>
--n_PRESET $genome_comparison.n_PRESET
+<param name="checkm_group_size" value="2000"/>
-#end if
+</conditional>
-]]></token>
+</xml>
+<!-- Command-line fragment built from the "quality" conditional.
+checkm: passes the checkM method, the optional recursion limit and the group size;
+genomeInfo: passes the user-supplied CSV; ignoreGenomeQuality: passes only the flag.
+set_recursion is declared inside the "quality" conditional, so it is read from $quality. -->
+<token name="@QUALITY_ASSESSMENT_OPTIONS@"><![CDATA[
-<xml name="clustering_options">
+#if $quality.source == 'checkm'
+--checkM_method '$quality.checkM_method'
+#if str($quality.set_recursion) != ''
+--set_recursion $quality.set_recursion
+#end if
+--checkm_group_size $quality.checkm_group_size
+#else if $quality.source == 'genomeInfo'
+--genomeInfo '$quality.genomeInfo'
+#else if $quality.source == 'ignoreGenomeQuality'
+--ignoreGenomeQuality
+#end if
+]]></token>
+<!-- Primary (MASH) clustering parameters: sketch size, ANI threshold, optional multiround clustering and its chunk size. -->
+<xml name="mash">
+<param argument="--MASH_sketch" type="integer" value="1000" min="0" label="MASH sketch size"/>
+<param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary clusters"/>
+<param argument="--multiround_primary_clustering" type="boolean" checked="false" truevalue="--multiround_primary_clustering" falsevalue="" label="Cluster each primary chunk separately and merge at the end with single linkage?" help="Decreases RAM usage and increases speed, at the cost of a minor loss in precision and the inability to plot primary_clustering_dendrograms. Especially helpful when clustering 5000+ genomes. Will be done with single linkage clustering"/>
+<param argument="--primary_chunksize" type="integer" value="5000" min="1" label="Impacts multiround_primary_clustering" help="If you have more than this many genomes, process them in chunks of this size"/>
+</xml>
+<xml name="test_default_mash">
+<param name="MASH_sketch" value="1000"/>
+<param name="P_ani" value="0.9"/>
+<param name="multiround_primary_clustering" value=''/>
+<param name="primary_chunksize" value="5000"/>
+</xml>
+<token name="@MASH@"><![CDATA[
+--MASH_sketch '$comp_clust.steps.MASH_sketch'
+--P_ani $comp_clust.steps.P_ani
+$comp_clust.steps.multiround_primary_clustering
+--primary_chunksize $comp_clust.steps.primary_chunksize
+]]></token>
+<xml name="nucmer">
+<param argument="--n_PRESET" type="select" label="Presets to pass to nucmer">
+<option value="normal" selected="true">normal: default ANIn parameters</option>
+<option value="tight">tight: only align highly conserved regions</option>
+</param>
+</xml>
+<xml name="test_default_nucmer">
+<param name="n_PRESET" value="normal"/>
+</xml>
+<token name="@NUCMER@"><![CDATA[
+--n_PRESET '$comp_clust.steps.clustering.n_PRESET'
+]]></token>
+<xml name="coverage_method">
+<param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment">
+<option value="larger" selected="true">Larger = max((aligned length / genome 1), (aligned_length / genome2))</option>
+<option value="total">Total = 2*(aligned length) / (sum of total genome lengths)</option>
+</param>
+</xml>
+<xml name="test_default_coverage_method">
+<param name="coverage_method" value="larger"/>
+</xml>
+<token name="@COVERAGE_METHOD@"><![CDATA[
+--coverage_method '$comp_clust.steps.clustering.coverage_method'
+]]></token>
+<xml name="secondary_clustering">
 <conditional name="clustering">
-<param name="set_options" type="select" label="set clustering options">
+<!-- Selector of the "clustering" conditional: each option value is passed verbatim to the S_algorithm argument. -->
+<param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comparisons">
-<option value="yes">Yes</option>
+<option value="fastANI">fastANI: Kmer-based approach - very fast</option>
-<option value="no" selected="true">No</option>
+<option value="ANImf" selected="true">ANImf: Align whole genomes with nucmer; filter alignment; compare aligned regions - RECOMMENDED</option>
+<option value="ANIn">ANIn: Align whole genomes with nucmer; compare aligned regions</option>
+<option value="gANI">gANI: Identify and align ORFs; compare aligned ORFS</option>
+<option value="goANI">goANI: Open source version of gANI; requires nsimscan</option>
 </param>
-<when value="yes">
+<when value="fastANI">
-<param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary (MASH) clusters"/>
+<param argument="--greedy_secondary_clustering" type='boolean' checked="false" truevalue='--greedy_secondary_clustering' falsevalue='' label="Use a heuristic to avoid pair-wise comparisons when doing secondary clustering?" help="Will be done with single linkage clustering"/>
-<param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/>
+</when>
+<when value="ANImf">
-<param argument="--SkipMash" type="boolean" truevalue="--SkipMash" falsevalue="" checked="false" label="Skip MASH clustering, just do secondary clustering on all genomes"/>
+<expand macro="nucmer"/>
-<param argument="--SkipSecondary" type="boolean" truevalue="--SkipSecondary" falsevalue="" checked="false" label="Skip secondary clustering, just perform MASH clustering"/>
+<expand macro="coverage_method"/>
-<param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minmum level of overlap between genomes when doing secondary comparisons"/>
+</when>
-<param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment">
+<when value="ANIn">
-<help>(for ANIn/ANImf only; gANI can only do larger method)</help>
+<expand macro="nucmer"/>
-<option value="larger" selected="true">arger  = max((aligned length / genome 1), (aligned_length / genome2))</option>
+<expand macro="coverage_method"/>
-<option value="total">total   = 2*(aligned length) / (sum of total genome lengths)</option>
+</when>
+<when value="gANI"/>
+<when value="goANI"/>
+</conditional>
+<param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/>
+<!-- Coverage threshold (fraction between 0 and 1) applied during secondary comparisons. -->
+<param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minimum level of overlap between genomes when doing secondary comparisons"/>
+</xml>
+<xml name="test_default_secondary_clustering">
+<conditional name="clustering">
+<param name="S_algorithm" value="ANImf"/>
+<expand macro="test_default_nucmer"/>
+<expand macro="test_default_coverage_method"/>
+</conditional>
+<param name="S_ani" value="0.99"/>
+<param name="cov_thresh" value="0.1"/>
+</xml>
+<token name="@SECONDARY_CLUSTERING@"><![CDATA[
+--S_algorithm '$comp_clust.steps.clustering.S_algorithm'
+#if $comp_clust.steps.clustering.S_algorithm == 'fastANI'
+$comp_clust.steps.clustering.greedy_secondary_clustering
+#else if $comp_clust.steps.clustering.S_algorithm == 'ANImf'
+@NUCMER@
+@COVERAGE_METHOD@
+#else if $comp_clust.steps.clustering.S_algorithm == 'ANIn'
+@NUCMER@
+@COVERAGE_METHOD@
+#end if
+--S_ani $comp_clust.steps.S_ani
+--cov_thresh $comp_clust.steps.cov_thresh
+]]></token>
+<xml name="comparison_clustering_options">
+<section name="comp_clust" title="Genome comparison and clustering" expanded="false">
+<conditional name="steps">
+<param name="select" type="select" label="Steps in genome comparison">
+<option value="default" selected="true">Default: Run MASH clustering and a secondary clustering</option>
+<option value="SkipMash">Skip MASH clustering, just do secondary clustering on all genomes</option>
+<option value="SkipSecondary">Skip secondary clustering, just perform MASH clustering</option>
 </param>
-<param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes">
+<when value="default">
-<help>(passed to  scipy.cluster.hierarchy.linkage)</help>
+<expand macro="mash"/>
-<option value="average" selected="true">average</option>
+<expand macro="secondary_clustering"/>
-</param>
+</when>
-</when>
+<when value="SkipMash">
-<when value="no"/>
+<expand macro="secondary_clustering"/>
-</conditional>
+</when>
-</xml>
+<when value="SkipSecondary">
-<token name="@CLUSTERING_OPTIONS@"><![CDATA[
+<expand macro="mash"/>
-#if $clustering.set_options == 'yes':
+</when>
---P_ani $clustering.P_ani
+</conditional>
---S_ani $clustering.S_ani
+<param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes" help="Passed to scipy.cluster.hierarchy.linkage">
-$clustering.SkipMash
+<option value="average" selected="true">average</option>
-$clustering.SkipSecondary
+<option value="ward">ward</option>
---cov_thresh $clustering.cov_thresh
+<option value="single">single</option>
---coverage_method $clustering.coverage_method
+<option value="median">median</option>
---clusterAlg $clustering.clusterAlg
+<option value="centroid">centroid</option>
-#end if
+<option value="weighted">weighted</option>
+</param>
+<param argument="--run_tertiary_clustering" type='boolean' checked="false" truevalue='--run_tertiary_clustering' falsevalue='' label="Run an additional round of clustering on the final genome set?" help="This is especially useful when greedy clustering is performed and/or to handle cases where similar genomes end up in different primary clusters."/>
+</section>
+</xml>
+<xml name="test_default_comparison_clustering_options">
+<section name="comp_clust">
+<conditional name="steps">
+<param name="select" value="default" />
+<expand macro="test_default_mash"/>
+<expand macro="test_default_secondary_clustering"/>
+</conditional>
+<param name="clusterAlg" value="average"/>
+<param name="run_tertiary_clustering" value=''/>
+</section>
+</xml>
+<token name="@COMPARISON_CLUSTERING_OPTIONS@"><![CDATA[
+#if $comp_clust.steps.select == 'default'
+@MASH@
+@SECONDARY_CLUSTERING@
+#else if $comp_clust.steps.select == 'SkipMash'
+--SkipMash
+@SECONDARY_CLUSTERING@
+#else
+@MASH@
+--SkipSecondary
+#end if
+--clusterAlg '$comp_clust.clusterAlg'
+$comp_clust.run_tertiary_clustering
 ]]></token>
 <xml name="scoring_options">
-<conditional name="scoring">
+<section name="scoring" title="Scoring criteria" expanded="false" help="Based off of the formula: A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) + F*(centrality - S_ani). With A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight; F = cent_weight">
-<param name="set_options" type="select" label="set scoring options">
+<param argument="--completeness_weight" type="float" value="1" label="Completeness weight"/>
-<option value="yes">Yes</option>
+<param argument="--contamination_weight" type="float" value="5" label="Contamination weight"/>
-<option value="no" selected="true">No</option>
+<param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="Strain heterogeneity weight"/>
-</param>
+<param argument="--N50_weight" type="float" value=".5" label="Weight of log(genome N50)"/>
-<when value="yes">
+<param argument="--size_weight" type="float" value="0" label="Weight of log(genome size)"/>
-<param argument="--completeness_weight" type="float" value="1" label="completeness weight">
+<param argument="--centrality_weight" type="float" value="1" label="Weight of (centrality - S_ani)"/>
-<help>
+</section>
-Based off of the formula:
+</xml>
-A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
+<xml name="test_default_scoring_options">
-A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight;
+<section name="scoring">
-</help>
+<param name="completeness_weight" value="1"/>
-</param>
+<param name="contamination_weight" value="5"/>
-<param argument="--contamination_weight" type="float" value="5" label="contamination weight"/>
+<param name="strain_heterogeneity_weight" value="1"/>
-<param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="strain heterogeneity weight"/>
+<param name="N50_weight" value=".5" />
-<param argument="--N50_weight" type="float" value=".5" label="weight of log(genome N50)"/>
+<param name="size_weight" value="0"/>
-<param argument="--size_weight" type="float" value="0" label="weight of log(genome size)"/>
+<param name="centrality_weight" value="1"/>
-</when>
+</section>
-<when value="no"/>
-</conditional>
 </xml>
 <token name="@SCORING_OPTIONS@"><![CDATA[
-#if $scoring.set_options == 'yes':
+--completeness_weight $scoring.completeness_weight
---completeness_weight $scoring.completeness_weight
+--contamination_weight $scoring.contamination_weight
---contamination_weight $scoring.contamination_weight
+--strain_heterogeneity_weight $scoring.strain_heterogeneity_weight
---strain_heterogeneity_weight $scoring.strain_heterogeneity_weight
+--N50_weight $scoring.N50_weight
---N50_weight $scoring.N50_weight
+--size_weight $scoring.size_weight
---size_weight $scoring.size_weight
+--centrality_weight $scoring.centrality_weight
-#end if
+]]></token>
-]]></token>
+<xml name="warning_options">
-<xml name="taxonomy_options">
+<section name="warning" title="Warnings" expanded="false">
-<conditional name="taxonomy">
+<param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/>
-<param name="set_options" type="select" label="generate taxonomy information">
+<param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/>
-<option value="yes">Yes</option>
+<param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/>
-<option value="no" selected="true">No</option>
+</section>
-</param>
+</xml>
-<when value="yes">
+<xml name="test_default_warning_options">
-<param argument="--tax_method" type="select" label="Method of determining taxonomy">
+<section name="warning">
-<help>(for ANIn/ANImf only; gANI can only do larger method)</help>
+<param name="warn_dist" value="0.25"/>
-<option value="percent" selected="true">percent = The most descriptive taxonimic level with at least (per) hits</option>
+<param name="warn_sim" value="0.98"/>
-<option value="max">max = The centrifuge taxonomic level with the most overall hits</option>
+<param name="warn_aln" value="0.25"/>
-</param>
+</section>
-<param argument="--percent" type="float" value="50" min="0" max="100" label="minimum percent for percent method"/>
-<param argument="--cent_index" type="data" format="" label="centrifuge index"/>
-</when>
-<when value="no"/>
-</conditional>
-</xml>
-<token name="@TAXONOMY_OPTIONS@"><![CDATA[
-#if $taxonomy.set_options == 'yes':
---run_tax
---tax_method $taxonomy.tax_method
---percent $taxonomy.percent
---cent_index $taxonomy.cent_index
-#end if
-]]></token>
-<xml name="warning_options">
-<conditional name="warning">
-<param name="set_options" type="select" label="set warning options">
-<option value="yes">Yes</option>
-<option value="no" selected="true">No</option>
-</param>
-<when value="yes">
-<param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/>
-<param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/>
-<param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/>
-</when>
-<when value="no"/>
-</conditional>
 </xml>
 <token name="@WARNING_OPTIONS@"><![CDATA[
-#if $warning.set_options == 'yes':
+--warn_dist $warning.warn_dist
---warn_dist $warning.warn_dist
+--warn_sim $warning.warn_sim
---warn_sim $warning.warn_sim
+--warn_aln $warning.warn_aln
---warn_aln $warning.warn_aln
-#end if
 ]]></token>
 <xml name="select_outputs">
 <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs">
 <option value="log" selected="true">log</option>
 <option value="Winning_genomes">Winning_genomes.pdf</option>
 <option value="Widb">Widb.csv</option>
 <option value="Chdb">Chdb.tsv</option>
 </expand>
 </xml>
+<xml name="test_default_select_drep_outputs">
-<xml name="common_outputs">
+<param name="select_outputs" value="log,warnings,Primary_clustering_dendrogram,Clustering_scatterplots,Cluster_scoring,Winning_genomes,Widb" />
+</xml>
+<xml name="test_default_select_outputs">
+<param name="select_outputs" value="log,warnings,Primary_clustering_dendrogram,Clustering_scatterplots" />
+</xml>
+<xml name="common_outputs">
 <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log">
 <filter>'log' in select_outputs or not select_outputs</filter>
 </data>
 <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt">
 <filter>'warnings' in select_outputs</filter>
 </data>
 <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf">
 <filter>'Clustering_scatterplots' in select_outputs</filter>
 </data>
 </xml>
 <xml name="drep_outputs">
 <expand macro="common_outputs"/>
 <data name="Cluster_scoring" format="pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf">
 <filter>'Cluster_scoring' in select_outputs</filter>
 </data>
 </data>
 <data name="Chdb" format="tabular" label="${tool.name} on ${on_string}: Chdb.tsv" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv">
 <filter>'Chdb' in select_outputs</filter>
 </data>
 </xml>
+<xml name="test_string_inputs">
+<param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/>
-<xml name="test_defaults_log">
+</xml>
-<test>
+<xml name="test_integer_inputs">
-<param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/>
+<param name="genomes" ftype="fasta" value="001,002,003"/>
-<output name="log">
+</xml>
-<assert_contents>
+<xml name="test_log_output">
-<yield/>
+<output name="log">
-</assert_contents>
+<assert_contents>
-</output>
+<yield/>
-</test>
+</assert_contents>
-</xml>
+</output>
+</xml>
 <token name="@GENOMES_HELP@"><![CDATA[
 I/O PARAMETERS:
 -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]]
 genomes to cluster in .fasta format
 (default: None)
 ]]></token>
 <token name="@FILTERING_HELP@"><![CDATA[
 FILTERING OPTIONS:
 -l LENGTH, --length LENGTH
 Minimum genome length
 (default: 50000)
 scoring does not work. Will only choose genomes based
 on length and N50 (default: False)
 ]]></token>
 <token name="@GENOME_COMPARISON_HELP@"><![CDATA[
 GENOME COMPARISON PARAMETERS:
 -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH
 MASH sketch size (default: 1000)
 Presets to pass to nucmer
 tight   = only align highly conserved regions
 normal  = default ANIn parameters (default: normal)
 ]]></token>
 <token name="@CLUSTERING_HELP@"><![CDATA[
 CLUSTERING PARAMETERS:
 -pa P_ANI, --P_ani P_ANI
 ANI threshold to form primary (MASH) clusters
 (default: 0.9)
 --clusterAlg CLUSTERALG
 Algorithm used to cluster genomes (passed to
 scipy.cluster.hierarchy.linkage (default: average)
 ]]></token>
 <token name="@SCORING_HELP@"><![CDATA[
 SCORING CRITERIA
 Based off of the formula:
 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight:
 -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT
 completeness weight (default: 1)
 -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT
 weight of log(genome size) (default: 0)
 ]]></token>
 <token name="@TAXONOMY_HELP@"><![CDATA[
 TAXONOMY:
 --run_tax             generate taxonomy information (Tdb)
 (default: False)
 path to centrifuge index (for example,
 /home/mattolm/download/centrifuge/indices/b+h+v
 (default: None)
 ]]></token>
 <token name="@WARNINGS_HELP@"><![CDATA[
 WARNINGS:
 --warn_dist WARN_DIST
 How far from the threshold to throw cluster warnings
 (default: 0.25)
 genomes (default: 0.98)
 --warn_aln WARN_ALN   Minimum aligned fraction for warnings between
 dereplicated genomes (ANIn) (default: 0.25)
 ]]></token>
 </macros>

Mercurial > repos > iuc > drep_dereplicate

comparison macros.xml @ 1:ef7cd2e7bc05 draft