Mercurial > repos > jjohnson > drep

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drep_compare.xml	Mon Jan 06 11:11:06 2020 -0500
@@ -0,0 +1,59 @@
+<tool id="drep_compare" name="dRep compare" version="@VERSION@.0" python_template_version="3.5">
+    <description>compare a list of genomes</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <!--
+    The command first links the input genomes into the working directory
+    (PREPARE_GENOMES token), then runs dRep with "outdir" as its work directory,
+    the option groups from macros.xml and finally the genome list (GENOMES token).
+    The processor count follows the cores Galaxy allocated to the job; the fallback
+    of 6 is assumed to be dRep's own default (NOTE(review): confirm against dRep).
+    -->
+    <command detect_errors="exit_code"><![CDATA[
+         @PREPARE_GENOMES@
+         dRep compare outdir
+         -p "\${GALAXY_SLOTS:-6}"
+         @GENOME_COMPARISON_OPTIONS@
+         @CLUSTERING_OPTIONS@
+         @TAXONOMY_OPTIONS@
+         @WARNING_OPTIONS@
+         @GENOMES@
+    ]]></command>
+    <inputs>
+        <expand macro="genomes"/>
+        <expand macro="genome_comparison_options"/>
+        <expand macro="clustering_options"/>
+        <expand macro="taxonomy_options"/>
+        <expand macro="warning_options"/>
+    </inputs>
+    <outputs>
+        <expand macro="common_outputs" />
+<!--
+Candidate additional outputs that are not exposed yet:
+outdir/data_tables/Cdb.csv
+outdir/data_tables/Mdb.csv
+outdir/data_tables/Ndb.csv
+outdir/data_tables/Bdb.csv
+
+Example of a conditionally created output. The names below ("experiment",
+"bayesian", "foldChange") are placeholders; no input of this tool defines them.
+
+        <data name="foldChange" format="tabular" label="${tool.name} on ${on_string}: BayesianFoldChangeAnalysis.tsv" from_work_dir="out/BayesianFoldChangeAnalysis.tsv">
+            <filter>'bayesian' in experiment and 'ctr' in experiment['bayesian']</filter>
+        </data>
+-->
+    </outputs>
+    <help><![CDATA[
+        TODO: Fill in help.
+
+usage: drep compare [-p PROCESSORS] [-d] [-h] [-ms MASH_SKETCH]
+                    [--S_algorithm {ANIn,goANI,ANImf,gANI}]
+                    [-n_PRESET {normal,tight}] [-pa P_ANI] [-sa S_ANI]
+                    [--SkipMash] [--SkipSecondary] [-nc COV_THRESH]
+                    [-cm {total,larger}] [--clusterAlg CLUSTERALG] [--run_tax]
+                    [--tax_method {percent,max}] [-per PERCENT]
+                    [--cent_index CENT_INDEX] [--warn_dist WARN_DIST]
+                    [--warn_sim WARN_SIM] [--warn_aln WARN_ALN]
+                    [-g [GENOMES [GENOMES ...]]]
+                    work_directory
+
+
+    @GENOMES_HELP@
+    @GENOME_COMPARISON_HELP@
+    @CLUSTERING_HELP@
+    @TAXONOMY_HELP@
+    @WARNINGS_HELP@
+
+    ]]></help>
+    <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/drep_dereplicate.xml	Mon Jan 06 11:11:06 2020 -0500
@@ -0,0 +1,66 @@
+<tool id="drep_dereplicate" name="dRep dereplicate" version="@VERSION@.0" python_template_version="3.5">
+    <description>De-replicate a list of genomes</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <!--
+    The command first links the input genomes into the working directory
+    (PREPARE_GENOMES token), then runs dRep with "outdir" as its work directory,
+    the option groups from macros.xml and finally the genome list (GENOMES token).
+    The processor count follows the cores Galaxy allocated to the job; the fallback
+    of 6 is assumed to be dRep's own default (NOTE(review): confirm against dRep).
+    -->
+    <command detect_errors="exit_code"><![CDATA[
+         @PREPARE_GENOMES@
+         dRep dereplicate outdir
+         -p "\${GALAXY_SLOTS:-6}"
+         @FILTER_OPTIONS@
+         @GENOME_COMPARISON_OPTIONS@
+         @CLUSTERING_OPTIONS@
+         @SCORING_OPTIONS@
+         @TAXONOMY_OPTIONS@
+         @WARNING_OPTIONS@
+         @GENOMES@
+    ]]></command>
+    <inputs>
+        <expand macro="genomes"/>
+        <expand macro="filtering_options"/>
+        <expand macro="genome_comparison_options"/>
+        <expand macro="clustering_options"/>
+        <expand macro="scoring_options"/>
+        <expand macro="taxonomy_options"/>
+        <expand macro="warning_options"/>
+    </inputs>
+    <outputs>
+        <expand macro="common_outputs" />
+        <!--
+        The discovery directory must sit below the work directory named on the
+        command line ("outdir"), the same root the common outputs are read from.
+        -->
+        <collection name="dereplicated_genomes" type="list" label="dereplicated_genomes">
+             <discover_datasets pattern="__designation__" directory="outdir/dereplicated_genomes" ext="fasta"/>
+        </collection>
+    </outputs>
+    <help><![CDATA[
+        TODO: Fill in help.
+
+usage: drep dereplicate [-p PROCESSORS] [-d] [-h] [-l LENGTH]
+                        [-comp COMPLETENESS] [-con CONTAMINATION]
+                        [--ignoreGenomeQuality] [-ms MASH_SKETCH]
+                        [--S_algorithm {goANI,ANIn,ANImf,gANI}]
+                        [-n_PRESET {normal,tight}] [-pa P_ANI] [-sa S_ANI]
+                        [--SkipMash] [--SkipSecondary] [-nc COV_THRESH]
+                        [-cm {total,larger}] [--clusterAlg CLUSTERALG]
+                        [-comW COMPLETENESS_WEIGHT]
+                        [-conW CONTAMINATION_WEIGHT]
+                        [-strW STRAIN_HETEROGENEITY_WEIGHT] [-N50W N50_WEIGHT]
+                        [-sizeW SIZE_WEIGHT] [--run_tax]
+                        [--tax_method {percent,max}] [-per PERCENT]
+                        [--cent_index CENT_INDEX] [--warn_dist WARN_DIST]
+                        [--warn_sim WARN_SIM] [--warn_aln WARN_ALN]
+                        [-g [GENOMES [GENOMES ...]]]
+                        [--checkM_method {taxonomy_wf,lineage_wf}]
+                        [--genomeInfo GENOMEINFO]
+                        work_directory
+
+    @GENOMES_HELP@
+    @FILTERING_HELP@
+    @GENOME_COMPARISON_HELP@
+    @CLUSTERING_HELP@
+    @SCORING_HELP@
+    @TAXONOMY_HELP@
+    @WARNINGS_HELP@
+
+
+    ]]></help>
+    <expand macro="citations" />
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Mon Jan 06 11:11:06 2020 -0500
@@ -0,0 +1,388 @@
+<macros>
+    <!-- Wrapped dRep release; it pins the package requirement below and prefixes the tool versions. -->
+    <token name="@VERSION@">2.3.2</token>
+    <!-- Shared dependency list; a tool can append further requirements through the yield. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement version="@VERSION@" type="package">drep</requirement>
+            <yield />
+        </requirements>
+    </xml>
+    <!-- Shared citation list (one DOI); a tool can append further citations through the yield. -->
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1038/ismej.2017.126</citation>
+            <yield/>
+        </citations>
+    </xml>
+
+
+    <!-- Input selector for the genome FASTA datasets; several datasets can be chosen at once. -->
+    <xml name="genomes">
+        <param type="data" argument="--genomes" format="fasta" multiple="true" label="genomes fasta files"/>
+    </xml>
+    <!--
+    Command fragment that symlinks every selected genome into the job working
+    directory under a sanitized name and collects those names in the Cheetah list
+    "genomefiles", which the GENOMES token reads to build the genome arguments.
+    The last "/"-separated part of the element identifier is used, and every
+    character other than a word character, "-", "_" or "." is replaced by "_".
+    The list append is wrapped in a silent directive so that the call is executed
+    without being treated as output for the command line.
+    NOTE(review): two inputs whose identifiers sanitize to the same name would make
+    the second ln call fail because the link already exists.
+    -->
+    <token name="@PREPARE_GENOMES@"><![CDATA[
+    #import re
+    #set $genomefiles = []
+    #for $genome in $genomes
+        #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1]))
+        ln -s '${genome}' '${input_name}' &&
+        #silent $genomefiles.append($input_name)
+    #end for
+]]></token>
+    <!-- Genome list argument: the short genomes flag followed by every quoted name held in "genomefiles". -->
+    <token name="@GENOMES@"><![CDATA[
+    -g
+    #for $linked_genome in $genomefiles
+    '${linked_genome}'
+    #end for
+]]></token>
+
+
+    <!-- Stand-alone, optional selector for the checkM workflow; nothing is preselected. -->
+    <xml name="checkm_method">
+        <param type="select" argument="--checkM_method" optional="true" label="checkm method">
+           <option value="lineage_wf">lineage_wf (more accurate)</option>
+           <option value="taxonomy_wf">taxonomy_wf (faster)</option>
+        </param>
+    </xml>
+    <!-- Command fragment for the stand-alone checkM selector; the flag is emitted only when a method is chosen. -->
+    <token name="@CHECKM_METHOD@"><![CDATA[
+    #if $checkM_method
+    --checkM_method ${checkM_method}
+    #end if
+]]></token>
+
+    <!--
+    Optional filtering section. When enabled it exposes the length, completeness
+    and contamination thresholds plus a nested choice of where genome quality
+    comes from: a checkM run, a user supplied CSV, or no quality filtering at all.
+    The FILTER_OPTIONS token reads these values as filter.length,
+    filter.completeness, filter.contamination, filter.quality.source,
+    filter.quality.checkM_method and filter.quality.genomeInfo.
+    -->
+    <xml name="filtering_options">
+        <conditional name="filter">
+            <param name="set_options" type="select" label="set filtering options">
+                <option value="yes">Yes</option>
+                <option value="no" selected="true">No</option>
+            </param>
+            <when value="yes">
+                <param argument="--length" type="integer" value="50000" label="Minimum genome length"/>
+                <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/>
+                <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/>
+
+                <conditional name="quality">
+                    <!-- "source" is a wrapper-internal selector, not a dRep flag, so it is declared by name. -->
+                    <param name="source" type="select" label="genome quality">
+                        <help>
+                            --ignoreGenomeQuality is useful with
+                            bacteriophages or eukaryotes or things where checkM
+                            scoring does not work. Will only choose genomes based
+                            on length and N50.
+                        </help>
+                        <option value="checkm" selected="true">Run checkM</option>
+                        <option value="genomeInfo">User supplied genomeInfo csv file</option>
+                        <option value="ignoreGenomeQuality">--ignoreGenomeQuality (NOT RECOMMENDED!)</option>
+                    </param>
+                    <when value="checkm">
+                        <param argument="--checkM_method" type="select" label="checkm method" optional="true">
+                            <option value="lineage_wf">lineage_wf (more accurate)</option>
+                            <option value="taxonomy_wf">taxonomy_wf (faster)</option>
+                        </param>
+                    </when>
+                    <when value="genomeInfo">
+                        <param argument="--genomeInfo" type="data" format="csv" label="genome information csv file">
+                            <help><![CDATA[
+                            A CSV dataset that must contain: [
+                            "genome"(history dataset name of .fasta dataset of that genome),
+                            "completeness"(0-100 value for completeness of the genome),
+                            "contamination"(0-100 value of the contamination of the genome)]
+                            ]]></help>
+                        </param>
+                    </when>
+                    <when value="ignoreGenomeQuality"/>
+                </conditional>
+
+            </when>
+            <when value="no"/>
+        </conditional>
+    </xml>
+    <!--
+    Command fragment for the filtering section; emits nothing unless the section
+    is enabled. The checkM method select is optional, so its flag is written only
+    when a method was actually chosen (an unset select would otherwise be rendered
+    as the literal text "None"). The genome information dataset path is quoted.
+    -->
+    <token name="@FILTER_OPTIONS@"><![CDATA[
+        #if $filter.set_options == 'yes':
+            --length $filter.length
+            --completeness $filter.completeness
+            --contamination $filter.contamination
+            #if $filter.quality.source == 'checkm'
+                #if $filter.quality.checkM_method
+                    --checkM_method $filter.quality.checkM_method
+                #end if
+            #elif $filter.quality.source == 'genomeInfo'
+                --genomeInfo '$filter.quality.genomeInfo'
+            #elif $filter.quality.source == 'ignoreGenomeQuality'
+                --ignoreGenomeQuality
+            #end if
+        #end if
+]]></token>
+
+    <!--
+    Optional genome comparison section: MASH sketch size, the algorithm used for
+    secondary clustering comparisons and the nucmer preset. The
+    GENOME_COMPARISON_OPTIONS token reads the values as genome_comparison.MASH_sketch,
+    genome_comparison.S_algorithm and genome_comparison.n_PRESET.
+    -->
+    <xml name="genome_comparison_options">
+        <conditional name="genome_comparison">
+            <param name="set_options" type="select" label="set genome comparison options">
+                <option value="yes">Yes</option>
+                <option value="no" selected="true">No</option>
+            </param>
+            <when value="yes">
+                <param argument="--MASH_sketch" type="integer" value="1000" label="MASH sketch size"/>
+                <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comparisons">
+                    <option value="ANImf" selected="true">ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions</option>
+                    <option value="ANIn">ANIn  = Align whole genomes with nucmer; compare aligned regions</option>
+                    <option value="gANI">gANI  = Identify and align ORFs; compare aligned ORFS</option>
+                </param>
+                <param argument="-n_PRESET" type="select" label="Presets to pass to nucmer">
+                    <option value="normal" selected="true">normal  = default ANIn parameters (default: normal)</option>
+                    <option value="tight">tight   = only align highly conserved regions</option>
+                </param>
+            </when>
+            <when value="no"/>
+        </conditional>
+    </xml>
+    <!-- Command fragment for the genome comparison section; emits nothing unless the section is enabled. -->
+    <token name="@GENOME_COMPARISON_OPTIONS@"><![CDATA[
+        #if $genome_comparison.set_options == 'yes'
+            --MASH_sketch ${genome_comparison.MASH_sketch}
+            --S_algorithm ${genome_comparison.S_algorithm}
+            -n_PRESET ${genome_comparison.n_PRESET}
+        #end if
+]]></token>
+
+    <!--
+    Optional clustering section: primary and secondary ANI thresholds, switches to
+    skip either clustering stage, the minimum coverage, the coverage method and the
+    linkage algorithm. The CLUSTERING_OPTIONS token reads every value through the
+    "clustering" conditional. The two boolean switches render as their flag when
+    checked and as an empty string otherwise.
+    The linkage choices are the method names accepted by
+    scipy.cluster.hierarchy.linkage, to which the value is passed; "average"
+    remains the preselected choice.
+    -->
+    <xml name="clustering_options">
+        <conditional name="clustering">
+            <param name="set_options" type="select" label="set clustering options">
+                <option value="yes">Yes</option>
+                <option value="no" selected="true">No</option>
+            </param>
+            <when value="yes">
+                <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary (MASH) clusters"/>
+                <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/>
+
+                <param argument="--SkipMash" type="boolean" truevalue="--SkipMash" falsevalue="" checked="false" label="Skip MASH clustering, just do secondary clustering on all genomes"/>
+                <param argument="--SkipSecondary" type="boolean" truevalue="--SkipSecondary" falsevalue="" checked="false" label="Skip secondary clustering, just perform MASH clustering"/>
+                <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minimum level of overlap between genomes when doing secondary comparisons"/>
+                <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment">
+                    <help>(for ANIn/ANImf only; gANI can only do larger method)</help>
+                    <option value="larger" selected="true">larger  = max((aligned length / genome 1), (aligned_length / genome2))</option>
+                    <option value="total">total   = 2*(aligned length) / (sum of total genome lengths)</option>
+                </param>
+                <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes">
+                    <help>(passed to scipy.cluster.hierarchy.linkage)</help>
+                    <option value="average" selected="true">average</option>
+                    <option value="single">single</option>
+                    <option value="complete">complete</option>
+                    <option value="weighted">weighted</option>
+                    <option value="centroid">centroid</option>
+                    <option value="median">median</option>
+                    <option value="ward">ward</option>
+                </param>
+            </when>
+            <when value="no"/>
+        </conditional>
+    </xml>
+    <!--
+    Command fragment for the clustering section; emits nothing unless the section
+    is enabled. The two skip switches are written as bare values because the
+    boolean parameters already render as the complete flag or as nothing.
+    -->
+    <token name="@CLUSTERING_OPTIONS@"><![CDATA[
+        #if $clustering.set_options == 'yes'
+            --P_ani ${clustering.P_ani}
+            --S_ani ${clustering.S_ani}
+            ${clustering.SkipMash}
+            ${clustering.SkipSecondary}
+            --cov_thresh ${clustering.cov_thresh}
+            --coverage_method ${clustering.coverage_method}
+            --clusterAlg ${clustering.clusterAlg}
+        #end if
+]]></token>
+
+    <!--
+    Optional scoring section: the five weights of the genome scoring formula quoted
+    in the help of the first parameter. The SCORING_OPTIONS token reads every value
+    through the "scoring" conditional.
+    The help lines are indented like the other help element in this file so that
+    consecutive lines stay separated by whitespace if the text is collapsed onto
+    one line for display.
+    -->
+    <xml name="scoring_options">
+        <conditional name="scoring">
+            <param name="set_options" type="select" label="set scoring options">
+                <option value="yes">Yes</option>
+                <option value="no" selected="true">No</option>
+            </param>
+            <when value="yes">
+                <param argument="--completeness_weight" type="float" value="1" label="completeness weight">
+                    <help>
+                        Based off of the formula:
+                        A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
+                        A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight;
+                    </help>
+                </param>
+                <param argument="--contamination_weight" type="float" value="5" label="contamination weight"/>
+                <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="strain heterogeneity weight"/>
+                <param argument="--N50_weight" type="float" value=".5" label="weight of log(genome N50)"/>
+                <param argument="--size_weight" type="float" value="0" label="weight of log(genome size)"/>
+            </when>
+            <when value="no"/>
+        </conditional>
+    </xml>
+    <!-- Command fragment for the scoring section; emits the five weight flags only when the section is enabled. -->
+    <token name="@SCORING_OPTIONS@"><![CDATA[
+        #if $scoring.set_options == 'yes'
+            --completeness_weight ${scoring.completeness_weight}
+            --contamination_weight ${scoring.contamination_weight}
+            --strain_heterogeneity_weight ${scoring.strain_heterogeneity_weight}
+            --N50_weight ${scoring.N50_weight}
+            --size_weight ${scoring.size_weight}
+        #end if
+]]></token>
+
+    <!--
+    Optional taxonomy section: enabling it makes the TAXONOMY_OPTIONS token add the
+    run_tax flag together with the method, the percent threshold and the centrifuge
+    index chosen here.
+    The method selector no longer carries a help text: the previous one described
+    the coverage method of the clustering section and did not apply here.
+    -->
+    <xml name="taxonomy_options">
+        <conditional name="taxonomy">
+            <param name="set_options" type="select" label="generate taxonomy information">
+                <option value="yes">Yes</option>
+                <option value="no" selected="true">No</option>
+            </param>
+            <when value="yes">
+                <param argument="--tax_method" type="select" label="Method of determining taxonomy">
+                    <option value="percent" selected="true">percent = The most descriptive taxonomic level with at least (per) hits</option>
+                    <option value="max">max = The centrifuge taxonomic level with the most overall hits</option>
+                </param>
+                <param argument="--percent" type="float" value="50" min="0" max="100" label="minimum percent for percent method"/>
+                <!--
+                NOTE(review): the format attribute is empty, so no datatype is named for
+                this input. The dRep help describes the value as a path to a centrifuge
+                index; confirm which datatype (or data table) should provide it.
+                -->
+                <param argument="--cent_index" type="data" format="" label="centrifuge index"/>
+            </when>
+            <when value="no"/>
+        </conditional>
+    </xml>
+    <!--
+    Command fragment for the taxonomy section; emits nothing unless the section is
+    enabled. The centrifuge index is a dataset path and is therefore single-quoted
+    so the shell always sees it as one argument.
+    -->
+    <token name="@TAXONOMY_OPTIONS@"><![CDATA[
+        #if $taxonomy.set_options == 'yes':
+            --run_tax
+            --tax_method $taxonomy.tax_method
+            --percent $taxonomy.percent
+            --cent_index '$taxonomy.cent_index'
+        #end if
+]]></token>
+
+   <!-- Optional warning section: three thresholds, each a fraction between 0 and 1, read by the WARNING_OPTIONS token. -->
+   <xml name="warning_options">
+        <conditional name="warning">
+            <param type="select" name="set_options" label="set warning options">
+                <option value="yes">Yes</option>
+                <option value="no" selected="true">No</option>
+            </param>
+            <when value="no"/>
+            <when value="yes">
+                <param type="float" argument="--warn_dist" min="0" max="1" value="0.25" label="How far from the threshold to throw cluster warnings"/>
+                <param type="float" argument="--warn_sim" min="0" max="1" value="0.98" label="Similarity threshold for warnings between dereplicated genomes"/>
+                <param type="float" argument="--warn_aln" min="0" max="1" value="0.25" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/>
+            </when>
+        </conditional>
+    </xml>
+    <!-- Command fragment for the warning section; emits the three threshold flags only when the section is enabled. -->
+    <token name="@WARNING_OPTIONS@"><![CDATA[
+        #if $warning.set_options == 'yes'
+            --warn_dist ${warning.warn_dist}
+            --warn_sim ${warning.warn_sim}
+            --warn_aln ${warning.warn_aln}
+        #end if
+]]></token>
+
+   <!-- Empty placeholder macro; it currently contributes no elements. -->
+   <xml name="select_outputs"></xml>
+
+   <!--
+   Outputs shared by both tools, all collected from the dRep work directory
+   "outdir": the run log, the warnings file and four PDF figures.
+   The warnings dataset carries its own label so that it can be told apart from
+   the log in the history.
+   -->
+   <xml name="common_outputs">
+       <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log"/>
+       <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt"/>
+       <data name="Primary_clustering_dendrogram" format="pdf" label="${tool.name} on ${on_string}: Primary_clustering_dendrogram.pdf" from_work_dir="outdir/figures/Primary_clustering_dendrogram.pdf"/>
+       <data name="Secondary_clustering_dendrograms" format="pdf" label="${tool.name} on ${on_string}: Secondary_clustering_dendrograms.pdf" from_work_dir="outdir/figures/Secondary_clustering_dendrograms.pdf"/>
+       <data name="Secondary_clustering_MDS" format="pdf" label="${tool.name} on ${on_string}: Secondary_clustering_MDS.pdf" from_work_dir="outdir/figures/Secondary_clustering_MDS.pdf"/>
+       <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf"/>
+   </xml>
+   <!-- Empty placeholder macro; it currently contributes no elements. -->
+   <xml name="common_outputs2"></xml>
+
+    <token name="@GENOMES_HELP@"><![CDATA[
+I/O PARAMETERS:
+  -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]]
+                        genomes to cluster in .fasta format (default: None)
+]]></token>
+
+    <token name="@FILTERING_HELP@"><![CDATA[
+FILTERING OPTIONS:
+  -l LENGTH, --length LENGTH
+                        Minimum genome length (default: 50000)
+  -comp COMPLETENESS, --completeness COMPLETENESS
+                        Minimum genome completeness (default: 75)
+  -con CONTAMINATION, --contamination CONTAMINATION
+                        Maximum genome contamination (default: 25)
+  --ignoreGenomeQuality
+                        Don't run checkM or do any quality filtering. NOT
+                        RECOMMENDED! This is useful for use with
+                        bacteriophages or eukaryotes or things where checkM
+                        scoring does not work. Will only choose genomes based
+                        on length and N50 (default: False)
+
+
+]]></token>
+
+    <token name="@GENOME_COMPARISON_HELP@"><![CDATA[
+GENOME COMPARISON PARAMETERS:
+  -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH
+                        MASH sketch size (default: 1000)
+  --S_algorithm {goANI,ANIn,ANImf,gANI}
+                        Algorithm for secondary clustering comparisons:
+                        ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions
+                        ANIn  = Align whole genomes with nucmer; compare aligned regions
+                        gANI  = Identify and align ORFs; compare aligned ORFS
+                         (default: ANImf)
+  -n_PRESET {normal,tight}
+                        Presets to pass to nucmer
+                        tight   = only align highly conserved regions
+                        normal  = default ANIn parameters (default: normal)
+
+]]></token>
+
+    <token name="@CLUSTERING_HELP@"><![CDATA[
+CLUSTERING PARAMETERS:
+  -pa P_ANI, --P_ani P_ANI
+                        ANI threshold to form primary (MASH) clusters
+                        (default: 0.9)
+  -sa S_ANI, --S_ani S_ANI
+                        ANI threshold to form secondary clusters (default:
+                        0.99)
+  --SkipMash            Skip MASH clustering, just do secondary clustering on
+                        all genomes (default: False)
+  --SkipSecondary       Skip secondary clustering, just perform MASH
+                        clustering (default: False)
+  -nc COV_THRESH, --cov_thresh COV_THRESH
+                        Minimum level of overlap between genomes when doing
+                        secondary comparisons (default: 0.1)
+  -cm {total,larger}, --coverage_method {total,larger}
+                        Method to calculate coverage of an alignment
+                        (for ANIn/ANImf only; gANI can only do larger method)
+                        total   = 2*(aligned length) / (sum of total genome lengths)
+                        larger  = max((aligned length / genome 1), (aligned_length / genome2))
+                         (default: larger)
+  --clusterAlg CLUSTERALG
+                        Algorithm used to cluster genomes (passed to
+                        scipy.cluster.hierarchy.linkage) (default: average)
+
+]]></token>
+
+    <token name="@SCORING_HELP@"><![CDATA[
+SCORING CRITERIA
+Based off of the formula:
+A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
+
+A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight:
+  -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT
+                        completeness weight (default: 1)
+  -conW CONTAMINATION_WEIGHT, --contamination_weight CONTAMINATION_WEIGHT
+                        contamination weight (default: 5)
+  -strW STRAIN_HETEROGENEITY_WEIGHT, --strain_heterogeneity_weight STRAIN_HETEROGENEITY_WEIGHT
+                        strain heterogeneity weight (default: 1)
+  -N50W N50_WEIGHT, --N50_weight N50_WEIGHT
+                        weight of log(genome N50) (default: 0.5)
+  -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT
+                        weight of log(genome size) (default: 0)
+
+]]></token>
+
+    <token name="@TAXONOMY_HELP@"><![CDATA[
+TAXONOMY:
+  --run_tax             generate taxonomy information (Tdb) (default: False)
+  --tax_method {percent,max}
+                        Method of determining taxonomy
+                        percent = The most descriptive taxonomic level with at least (per) hits
+                        max     = The centrifuge taxonomic level with the most overall hits (default: percent)
+  -per PERCENT, --percent PERCENT
+                        minimum percent for percent method (default: 50)
+  --cent_index CENT_INDEX
+                        path to centrifuge index (for example,
+                        /home/mattolm/download/centrifuge/indices/b+h+v)
+                        (default: None)
+
+]]></token>
+
+    <token name="@WARNINGS_HELP@"><![CDATA[
+WARNINGS:
+  --warn_dist WARN_DIST
+                        How far from the threshold to throw cluster warnings
+                        (default: 0.25)
+  --warn_sim WARN_SIM   Similarity threshold for warnings between dereplicated
+                        genomes (default: 0.98)
+  --warn_aln WARN_ALN   Minimum aligned fraction for warnings between
+                        dereplicated genomes (ANIn) (default: 0.25)
+
+]]></token>
+
+
+</macros>