Mercurial > repos > iuc > drep_dereplicate
diff macros.xml @ 0:8dfcdbeaeed8 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/drep commit 8fa5ff35b45c2b046c7f4800410cf39cb89a299a"
author | iuc |
---|---|
date | Tue, 05 May 2020 06:12:47 -0400 |
parents | |
children | ef7cd2e7bc05 |
line wrap: on
line diff
<!--
    macros.xml for the dRep Galaxy tool wrappers.
    Reconstructed from the changeset diff that adds b/macros.xml
    (Tue May 05 06:12:47 2020 -0400, 474 added lines).

    Layout convention used throughout this file:
      * <xml name="..._options">   declares the Galaxy parameters of one option group
      * <token name="@..._OPTIONS@"> is the Cheetah fragment that renders that group
                                     on the command line
      * <token name="@..._HELP@">    is the matching help text
-->
<macros>
    <!-- Tool/package version, reused by the requirement below. -->
    <token name="@VERSION@">2.5.4</token>

    <!-- Package requirement on drep at @VERSION@; callers may yield further requirements. -->
    <xml name="requirements">
        <requirements>
            <requirement type="package" version="@VERSION@">drep</requirement>
            <yield/>
        </requirements>
    </xml>

    <!-- Citation by DOI; callers may yield further citations. -->
    <xml name="citations">
        <citations>
            <citation type="doi">10.1038/ismej.2017.126</citation>
            <yield />
        </citations>
    </xml>


    <!-- Input genomes: one or more fasta datasets. -->
    <xml name="genomes">
        <param argument="--genomes" type="data" format="fasta" label="genomes fasta files" multiple="true"/>
    </xml>

    <!--
        Symlinks every selected genome into the current directory under a
        sanitized name (any character that is not a word character, '-', '_'
        or '.' becomes '_') and collects those names in $genomefiles, which
        @GENOMES@ reads afterwards. The append is wrapped in #silent so that
        the call is evaluated without writing anything to the command line.
    -->
    <token name="@PREPARE_GENOMES@"><![CDATA[
    #import re
    #set $genomefiles = []
    #for $genome in $genomes
        #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1]))
        ln -s '${genome}' '${input_name}' &&
        #silent $genomefiles.append($input_name)
    #end for
]]></token>

    <!-- Renders the -g option followed by every name collected by @PREPARE_GENOMES@. -->
    <token name="@GENOMES@"><![CDATA[
    -g
    #for $genomefile in $genomefiles
        '${genomefile}'
    #end for
]]></token>


    <!-- Stand-alone checkM method selector (optional, nothing preselected). -->
    <xml name="checkm_method">
        <param argument="--checkM_method" type="select" label="checkm method" optional="true">
            <option value="taxonomy_wf">taxonomy_wf (faster)</option>
            <option value="lineage_wf">lineage_wf (more accurate)</option>
        </param>
    </xml>

    <!-- Emits the checkM method option only when a method was selected. -->
    <token name="@CHECKM_METHOD@"><![CDATA[
    #if $checkM_method:
        --checkM_method $checkM_method
    #end if
]]></token>

    <!--
        Filtering option group. When the user does not set filtering options,
        @FILTER_OPTIONS@ falls back to the taxonomy_wf checkM method.
    -->
    <xml name="filtering_options">
        <conditional name="filter">
            <param name="set_options" type="select" label="set filtering options">
                <option value="yes">Yes</option>
                <option value="no" selected="true">No (use --checkM_method taxonomy_wf)</option>
            </param>
            <when value="yes">
                <param argument="--length" type="integer" value="50000" label="Minimum genome length"/>
                <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/>
                <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/>

                <conditional name="quality">
                    <!-- "source" is a wrapper-internal switch, not a dRep flag, hence name= rather than argument=. -->
                    <param name="source" type="select" label="genome quality">
                        <help>
                            --ignoreGenomeQuality is useful with
                            bacteriophages or eukaryotes or things where checkM
                            scoring does not work. Will only choose genomes based
                            on length and N50.
                        </help>
                        <option value="checkm" selected="true">Run checkM</option>
                        <option value="genomeInfo">User supplied genomeInfo csv file</option>
                        <option value="ignoreGenomeQuality">--ignoreGenomeQuality (NOT RECOMMENDED!)</option>
                    </param>
                    <when value="checkm">
                        <param argument="--checkM_method" type="select" label="checkm method" optional="true">
                            <help>
                                Using the checkm method of lineage_wf can require more than 40Gb of RAM.
                            </help>
                            <option value="taxonomy_wf">taxonomy_wf (faster)</option>
                            <option value="lineage_wf">lineage_wf (more accurate)</option>
                        </param>
                    </when>
                    <when value="genomeInfo">
                        <param argument="--genomeInfo" type="data" format="csv" label="genome information csv file">
                            <help><![CDATA[
                                A CSV dataset that must contain: [
                                "genome"(history dataset name of .fasta dataset of that genome),
                                "completeness"(0-100 value for completeness of the genome),
                                "contamination"(0-100 value of the contamination of the genome)]
                            ]]></help>
                        </param>
                    </when>
                    <when value="ignoreGenomeQuality"/>
                </conditional>
            </when>
            <when value="no"/>
        </conditional>
    </xml>

    <!--
        Renders the filtering options. The checkM method select is optional, so
        its flag is only written when a value was chosen (same guard as
        @CHECKM_METHOD@). The genomeInfo dataset path is single-quoted.
    -->
    <token name="@FILTER_OPTIONS@"><![CDATA[
    #if $filter.set_options == 'yes':
        --length $filter.length
        --completeness $filter.completeness
        --contamination $filter.contamination
        #if $filter.quality.source == 'checkm'
            #if $filter.quality.checkM_method
                --checkM_method $filter.quality.checkM_method
            #end if
        #elif $filter.quality.source == 'genomeInfo'
            --genomeInfo '$filter.quality.genomeInfo'
        #elif $filter.quality.source == 'ignoreGenomeQuality'
            --ignoreGenomeQuality
        #end if
    #else
        --checkM_method taxonomy_wf
    #end if
]]></token>

    <!-- Genome comparison option group (MASH sketch size, secondary algorithm, nucmer preset). -->
    <xml name="genome_comparison_options">
        <conditional name="genome_comparison">
            <param name="set_options" type="select" label="set genome comparison options">
                <option value="yes">Yes</option>
                <option value="no" selected="true">No</option>
            </param>
            <when value="yes">
                <param argument="--MASH_sketch" type="integer" value="1000" label="MASH sketch size"/>
                <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comparisons">
                    <option value="ANImf" selected="true">ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions</option>
                    <option value="ANIn">ANIn = Align whole genomes with nucmer; compare aligned regions</option>
                    <option value="gANI">gANI = Identify and align ORFs; compare aligned ORFS</option>
                </param>
                <param argument="-n_PRESET" type="select" label="Presets to pass to nucmer">
                    <option value="normal" selected="true">normal = default ANIn parameters (default: normal)</option>
                    <option value="tight">tight = only align highly conserved regions</option>
                </param>
            </when>
            <when value="no"/>
        </conditional>
    </xml>

    <!-- Renders the genome comparison options when the group is enabled. -->
    <token name="@GENOME_COMPARISON_OPTIONS@"><![CDATA[
    #if $genome_comparison.set_options == 'yes':
        --MASH_sketch $genome_comparison.MASH_sketch
        --S_algorithm $genome_comparison.S_algorithm
        -n_PRESET $genome_comparison.n_PRESET
    #end if
]]></token>

    <!-- Clustering option group (ANI thresholds, skip switches, coverage settings, linkage algorithm). -->
    <xml name="clustering_options">
        <conditional name="clustering">
            <param name="set_options" type="select" label="set clustering options">
                <option value="yes">Yes</option>
                <option value="no" selected="true">No</option>
            </param>
            <when value="yes">
                <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary (MASH) clusters"/>
                <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/>

                <param argument="--SkipMash" type="boolean" truevalue="--SkipMash" falsevalue="" checked="false" label="Skip MASH clustering, just do secondary clustering on all genomes"/>
                <param argument="--SkipSecondary" type="boolean" truevalue="--SkipSecondary" falsevalue="" checked="false" label="Skip secondary clustering, just perform MASH clustering"/>
                <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minimum level of overlap between genomes when doing secondary comparisons"/>
                <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment">
                    <help>(for ANIn/ANImf only; gANI can only do larger method)</help>
                    <option value="larger" selected="true">larger = max((aligned length / genome 1), (aligned_length / genome2))</option>
                    <option value="total">total = 2*(aligned length) / (sum of total genome lengths)</option>
                </param>
                <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes">
                    <help>(passed to scipy.cluster.hierarchy.linkage)</help>
                    <option value="average" selected="true">average</option>
                </param>
            </when>
            <when value="no"/>
        </conditional>
    </xml>

    <!-- Renders the clustering options; the two boolean params expand to their flag or to nothing. -->
    <token name="@CLUSTERING_OPTIONS@"><![CDATA[
    #if $clustering.set_options == 'yes':
        --P_ani $clustering.P_ani
        --S_ani $clustering.S_ani
        $clustering.SkipMash
        $clustering.SkipSecondary
        --cov_thresh $clustering.cov_thresh
        --coverage_method $clustering.coverage_method
        --clusterAlg $clustering.clusterAlg
    #end if
]]></token>

    <!-- Scoring option group: the five weights of the scoring formula quoted in the help text. -->
    <xml name="scoring_options">
        <conditional name="scoring">
            <param name="set_options" type="select" label="set scoring options">
                <option value="yes">Yes</option>
                <option value="no" selected="true">No</option>
            </param>
            <when value="yes">
                <param argument="--completeness_weight" type="float" value="1" label="completeness weight">
                    <help>
Based off of the formula:
A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight;
                    </help>
                </param>
                <param argument="--contamination_weight" type="float" value="5" label="contamination weight"/>
                <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="strain heterogeneity weight"/>
                <param argument="--N50_weight" type="float" value=".5" label="weight of log(genome N50)"/>
                <param argument="--size_weight" type="float" value="0" label="weight of log(genome size)"/>
            </when>
            <when value="no"/>
        </conditional>
    </xml>

    <!-- Renders the scoring weights when the group is enabled. -->
    <token name="@SCORING_OPTIONS@"><![CDATA[
    #if $scoring.set_options == 'yes':
        --completeness_weight $scoring.completeness_weight
        --contamination_weight $scoring.contamination_weight
        --strain_heterogeneity_weight $scoring.strain_heterogeneity_weight
        --N50_weight $scoring.N50_weight
        --size_weight $scoring.size_weight
    #end if
]]></token>

    <!-- Taxonomy option group; enabling it adds the run_tax flag in @TAXONOMY_OPTIONS@. -->
    <xml name="taxonomy_options">
        <conditional name="taxonomy">
            <param name="set_options" type="select" label="generate taxonomy information">
                <option value="yes">Yes</option>
                <option value="no" selected="true">No</option>
            </param>
            <when value="yes">
                <param argument="--tax_method" type="select" label="Method of determining taxonomy">
                    <option value="percent" selected="true">percent = The most descriptive taxonomic level with at least (per) hits</option>
                    <option value="max">max = The centrifuge taxonomic level with the most overall hits</option>
                </param>
                <param argument="--percent" type="float" value="50" min="0" max="100" label="minimum percent for percent method"/>
                <!-- NOTE(review): format is empty; the intended datatype for a centrifuge index is not visible here. Confirm before release. -->
                <param argument="--cent_index" type="data" format="" label="centrifuge index"/>
            </when>
            <when value="no"/>
        </conditional>
    </xml>

    <!-- Renders the taxonomy options; the centrifuge index dataset path is single-quoted. -->
    <token name="@TAXONOMY_OPTIONS@"><![CDATA[
    #if $taxonomy.set_options == 'yes':
        --run_tax
        --tax_method $taxonomy.tax_method
        --percent $taxonomy.percent
        --cent_index '$taxonomy.cent_index'
    #end if
]]></token>

    <!-- Warning option group (three thresholds, each limited to 0..1). -->
    <xml name="warning_options">
        <conditional name="warning">
            <param name="set_options" type="select" label="set warning options">
                <option value="yes">Yes</option>
                <option value="no" selected="true">No</option>
            </param>
            <when value="yes">
                <param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/>
                <param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/>
                <param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/>
            </when>
            <when value="no"/>
        </conditional>
    </xml>

    <!-- Renders the warning thresholds when the group is enabled. -->
    <token name="@WARNING_OPTIONS@"><![CDATA[
    #if $warning.set_options == 'yes':
        --warn_dist $warning.warn_dist
        --warn_sim $warning.warn_sim
        --warn_aln $warning.warn_aln
    #end if
]]></token>

    <!--
        Multi-select of the outputs to keep. The option values are the names
        tested by the <filter> expressions in common_outputs / drep_outputs.
        Callers may yield additional options.
    -->
    <xml name="select_outputs">
        <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs">
            <option value="log" selected="true">log</option>
            <option value="warnings" selected="true">Warnings</option>
            <option value="Primary_clustering_dendrogram" selected="true">Primary_clustering_dendrogram.pdf</option>
            <option value="Secondary_clustering_dendrograms">Secondary_clustering_dendrograms.pdf</option>
            <option value="Secondary_clustering_MDS">Secondary_clustering_MDS.pdf</option>
            <option value="Clustering_scatterplots" selected="true">Clustering_scatterplots.pdf</option>
            <yield/>
        </param>
    </xml>

    <!-- select_outputs extended with the four options matched by drep_outputs. -->
    <xml name="select_drep_outputs">
        <expand macro="select_outputs">
            <option value="Cluster_scoring">Cluster_scoring.pdf</option>
            <option value="Winning_genomes">Winning_genomes.pdf</option>
            <option value="Widb">Widb.csv</option>
            <option value="Chdb">Chdb.tsv</option>
        </expand>
    </xml>

    <!--
        Outputs picked up from the outdir/ work directory. Each one is only
        created when its name is selected in select_outputs; the log is also
        created when nothing is selected.
    -->
    <xml name="common_outputs">
        <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log">
            <filter>'log' in select_outputs or not select_outputs</filter>
        </data>
        <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt">
            <filter>'warnings' in select_outputs</filter>
        </data>
        <data name="Primary_clustering_dendrogram" format="pdf" label="${tool.name} on ${on_string}: Primary_clustering_dendrogram.pdf" from_work_dir="outdir/figures/Primary_clustering_dendrogram.pdf">
            <filter>'Primary_clustering_dendrogram' in select_outputs</filter>
        </data>
        <data name="Secondary_clustering_dendrograms" format="pdf" label="${tool.name} on ${on_string}: Secondary_clustering_dendrograms.pdf" from_work_dir="outdir/figures/Secondary_clustering_dendrograms.pdf">
            <filter>'Secondary_clustering_dendrograms' in select_outputs</filter>
        </data>
        <data name="Secondary_clustering_MDS" format="pdf" label="${tool.name} on ${on_string}: Secondary_clustering_MDS.pdf" from_work_dir="outdir/figures/Secondary_clustering_MDS.pdf">
            <filter>'Secondary_clustering_MDS' in select_outputs</filter>
        </data>
        <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf">
            <filter>'Clustering_scatterplots' in select_outputs</filter>
        </data>
    </xml>


    <!-- common_outputs plus the four outputs offered by select_drep_outputs. -->
    <xml name="drep_outputs">
        <expand macro="common_outputs"/>
        <data name="Cluster_scoring" format="pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf">
            <filter>'Cluster_scoring' in select_outputs</filter>
        </data>
        <data name="Winning_genomes" format="pdf" label="${tool.name} on ${on_string}: Winning_genomes.pdf" from_work_dir="outdir/figures/Winning_genomes.pdf">
            <filter>'Winning_genomes' in select_outputs</filter>
        </data>
        <data name="Widb" format="csv" label="${tool.name} on ${on_string}: Widb.csv" from_work_dir="outdir/data_tables/Widb.csv">
            <filter>'Widb' in select_outputs</filter>
        </data>
        <data name="Chdb" format="tabular" label="${tool.name} on ${on_string}: Chdb.tsv" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv">
            <filter>'Chdb' in select_outputs</filter>
        </data>
    </xml>


    <!-- Test skeleton: three genome fasta inputs, default options; callers yield the assertions on the log output. -->
    <xml name="test_defaults_log">
        <test>
            <param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/>
            <output name="log">
                <assert_contents>
                    <yield/>
                </assert_contents>
            </output>
        </test>
    </xml>

    <!-- Help text tokens, one per option group. -->
    <token name="@GENOMES_HELP@"><![CDATA[
I/O PARAMETERS:
  -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]]
                        genomes to cluster in .fasta format
                        (default: None)


]]></token>

    <token name="@FILTERING_HELP@"><![CDATA[
FILTERING OPTIONS:
  -l LENGTH, --length LENGTH
                        Minimum genome length
                        (default: 50000)


  -comp COMPLETENESS, --completeness COMPLETENESS
                        Minimum genome completeness
                        (default: 75)


  -con CONTAMINATION, --contamination CONTAMINATION
                        Maximum genome contamination
                        (default: 25)


  --ignoreGenomeQuality
                        Don't run checkM or do any quality filtering. NOT
                        RECOMMENDED! This is useful for use with
                        bacteriophages or eukaryotes or things where checkM
                        scoring does not work. Will only choose genomes based
                        on length and N50 (default: False)


]]></token>

    <token name="@GENOME_COMPARISON_HELP@"><![CDATA[
GENOME COMPARISON PARAMETERS:
  -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH
                        MASH sketch size (default: 1000)

  --S_algorithm {goANI,ANIn,ANImf,gANI}
                        Algorithm for secondary clustering comparisons:
                        ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions
                        ANIn = Align whole genomes with nucmer; compare aligned regions
                        gANI = Identify and align ORFs; compare aligned ORFS
                        (default: ANImf)

  -n_PRESET {normal,tight}
                        Presets to pass to nucmer
                        tight = only align highly conserved regions
                        normal = default ANIn parameters (default: normal)

]]></token>

    <token name="@CLUSTERING_HELP@"><![CDATA[
CLUSTERING PARAMETERS:
  -pa P_ANI, --P_ani P_ANI
                        ANI threshold to form primary (MASH) clusters
                        (default: 0.9)
  -sa S_ANI, --S_ani S_ANI
                        ANI threshold to form secondary clusters
                        (default: 0.99)

  --SkipMash            Skip MASH clustering, just do secondary clustering on
                        all genomes (default: False)
  --SkipSecondary       Skip secondary clustering, just perform MASH clustering
                        (default: False)

  -nc COV_THRESH, --cov_thresh COV_THRESH
                        Minimum level of overlap between genomes when doing
                        secondary comparisons (default: 0.1)
  -cm {total,larger}, --coverage_method {total,larger}
                        Method to calculate coverage of an alignment
                        (for ANIn/ANImf only; gANI can only do larger method)
                        total = 2*(aligned length) / (sum of total genome lengths)
                        larger = max((aligned length / genome 1), (aligned_length / genome2))
                        (default: larger)

  --clusterAlg CLUSTERALG
                        Algorithm used to cluster genomes (passed to
                        scipy.cluster.hierarchy.linkage) (default: average)

]]></token>

    <token name="@SCORING_HELP@"><![CDATA[
SCORING CRITERIA
Based off of the formula:
A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)

A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight:
  -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT
                        completeness weight (default: 1)
  -conW CONTAMINATION_WEIGHT, --contamination_weight CONTAMINATION_WEIGHT
                        contamination weight (default: 5)
  -strW STRAIN_HETEROGENEITY_WEIGHT, --strain_heterogeneity_weight STRAIN_HETEROGENEITY_WEIGHT
                        strain heterogeneity weight (default: 1)
  -N50W N50_WEIGHT, --N50_weight N50_WEIGHT
                        weight of log(genome N50) (default: 0.5)
  -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT
                        weight of log(genome size) (default: 0)


]]></token>

    <token name="@TAXONOMY_HELP@"><![CDATA[
TAXONOMY:
  --run_tax             generate taxonomy information (Tdb)
                        (default: False)

  --tax_method {percent,max}
                        Method of determining taxonomy
                        percent = The most descriptive taxonomic level with at least (per) hits
                        max = The centrifuge taxonomic level with the most overall hits
                        (default: percent)


  -per PERCENT, --percent PERCENT
                        minimum percent for percent method
                        (default: 50)


  --cent_index CENT_INDEX
                        path to centrifuge index (for example,
                        /home/mattolm/download/centrifuge/indices/b+h+v)
                        (default: None)

]]></token>

    <token name="@WARNINGS_HELP@"><![CDATA[
WARNINGS:
  --warn_dist WARN_DIST
                        How far from the threshold to throw cluster warnings
                        (default: 0.25)
  --warn_sim WARN_SIM   Similarity threshold for warnings between dereplicated
                        genomes (default: 0.98)
  --warn_aln WARN_ALN   Minimum aligned fraction for warnings between
                        dereplicated genomes (ANIn) (default: 0.25)

]]></token>


</macros>