view coverm_genome.xml @ 1:a671907f96fe draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/coverm commit 2388df7187533fb66b7729730340d2eb7b93c112
author iuc
date Tue, 24 Jan 2023 12:32:54 +0000
parents 134e4e78e754
children 5bf5792b0996
line wrap: on
line source

<tool id="coverm_genome" name="CoverM genome" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Calculate coverage of individual genomes</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <command><![CDATA[
         ## Stage the primary read inputs: every dataset is symlinked into a working
         ## directory named after the input type, and the link paths are collected in a
         ## list that the coverm call further below iterates over.
         #if $reads.read_type == "single" or $reads.read_type == "interleaved"
            mkdir -p reads1 &&
            #set file_paths1 = []
            #for $input_file in $reads.single
                
                ## Link name = element identifier with spaces replaced by underscores.
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'reads1/' + $fname
                ln -s '$input_file' '$file_path' &&
                $file_paths1.append($file_path)
            #end for
        #else if $reads.read_type == "bam"
            mkdir -p bam &&
            #set bam_files = []
            #for $input_file in $reads.bam
                
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'bam/' + $fname
                ln -s '$input_file' '$file_path' &&
                $bam_files.append($file_path)
            #end for
        #else if $reads.read_type == "paired"
            ## Forward and reverse datasets are linked into the same directory.
            ## NOTE(review): a forward and a reverse dataset sharing one element
            ## identifier would map to the same link path - confirm this cannot happen.
            mkdir -p paired_reads1 &&
            #set fw_reads1 = []
            #for $input_file in $reads.read1
                
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'paired_reads1/' + str($fname)
                ln -s '$input_file' '$file_path' &&
                $fw_reads1.append($file_path)
            #end for
            #set rv_reads1 = []
            #for $input_file in $reads.read2
                
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'paired_reads1/' + str($fname)
                ln -s '$input_file' '$file_path' &&
                $rv_reads1.append($file_path)
            #end for
            ## Both lists are sorted by link path before being passed to -1 / -2.
            #silent $fw_reads1.sort()
            #silent $rv_reads1.sort()
        #else
            ## Remaining case (tested as 'paired_collection' in the coverm call below):
            ## each pair is linked as fw<i> / rv<i> and appended in forward, reverse order.
            mkdir -p paired_reads &&
            #set paired_reads1 = []
            #for $i, $input_file in enumerate($reads.paired_reads)
                #set $file_path = 'paired_reads/fw' + str($i)
                ln -s '$input_file.forward' '$file_path' &&
                $paired_reads1.append($file_path)
                #set $file_path = 'paired_reads/rv' + str($i)
                ln -s '$input_file.reverse' '$file_path' &&
                $paired_reads1.append($file_path)
            #end for
        #end if           
        ## Stage the optional additional read inputs (add_reads section) the same way as
        ## the primary reads, using separate add_* directories and path lists.
        #if $add_reads.extra_read.read_type == "single" or $add_reads.extra_read.read_type == "interleaved"
            mkdir -p add_reads1 &&
            #set add_file_paths1 = []
            #for $input_file in $add_reads.extra_read.single
                ## Link name = element identifier with spaces replaced by underscores.
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'add_reads1/' + $fname
                ln -s '$input_file' '$file_path' &&
                $add_file_paths1.append($file_path)
            #end for
        #else if $add_reads.extra_read.read_type == "bam"
            mkdir -p add_bam &&
            #set add_bam_files = []
            ## Iterate over the additional BAM input of this section, not the primary one.
            #for $input_file in $add_reads.extra_read.bam
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'add_bam/' + $fname
                ln -s '$input_file' '$file_path' &&
                $add_bam_files.append($file_path)
            #end for
        #else if $add_reads.extra_read.read_type == "paired"
            mkdir -p add_paired_reads1 &&
            #set add_fw_reads1 = []
            #for $input_file in $add_reads.extra_read.read1
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'add_paired_reads1/' + str($fname)
                ln -s '$input_file' '$file_path' &&
                $add_fw_reads1.append($file_path)
            #end for
            #set add_rv_reads1 = []
            #for $input_file in $add_reads.extra_read.read2
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'add_paired_reads1/' + str($fname)
                ln -s '$input_file' '$file_path' &&
                $add_rv_reads1.append($file_path)
            #end for
            ## Both lists are sorted by link path before being passed to -1 / -2.
            #silent $add_fw_reads1.sort()
            #silent $add_rv_reads1.sort()
        #else if $add_reads.extra_read.read_type == "paired_collection"
            mkdir -p add_paired_reads &&
            #set add_paired_reads1 = []
            #for $i, $input_file in enumerate($add_reads.extra_read.paired_reads)
                ## Links are named fw<i>.<ext> / rv<i>.<ext>; the extension of the forward
                ## dataset is used for both mates.
                #set $ext = $input_file.forward.ext
                #set $file_path = 'add_paired_reads/fw' + str($i) + '.' + $ext
                ln -s '$input_file.forward' '$file_path' &&
                $add_paired_reads1.append($file_path)
                #set $file_path = 'add_paired_reads/rv' + str($i) + '.' + $ext
                ln -s '$input_file.reverse' '$file_path' &&
                $add_paired_reads1.append($file_path)
            #end for
        #end if
        ## Stage genome and/or reference FASTA files as symlinks and remember the link
        ## paths in genome_files / ref_files for the coverm call further below.
        #if $reads.genome.ref_or_genome == "genomic"
                mkdir -p genomes &&
                #set genome_files = []
            #if $reads.genome.genomic.source == "history"
                #for $input_file in $reads.genome.genomic.fasta_history
                    
                    #set $fname = $input_file.element_identifier.replace(" ","_")
                    #set $file_path = 'genomes/' + $fname
                    ln -s '$input_file' '$file_path' &&
                    $genome_files.append($file_path)
                #end for
             #else
                #for $input_file in $reads.genome.genomic.fasta_builtin
                    ## NOTE(review): ext is assigned but not used in this loop.
                    #set $ext = $input_file.fields.path.ext
                    #set $fname = $input_file.fields.path.element_identifier.replace(" ","_")
                    #set $file_path = 'genomes/' + $fname
                    ln -s '$input_file' '$file_path' &&
                    $genome_files.append($file_path)
                #end for
            #end if
        #else if $reads.genome.ref_or_genome != "none"
            ## Any other value except "none": link the reference file(s), then optionally
            ## the additional genome files.
            mkdir -p reference &&
            #set ref_files = []
                #for $input_file in $reads.genome.ref_source
                    #set $fname = $input_file.element_identifier.replace(" ","_")
                    #set $file_path = 'reference/' + $fname
                    ln -s '$input_file' '$file_path' &&
                    $ref_files.append($file_path)
                #end for
            #if $reads.genome.add_genome.add_genome == "true"
                mkdir -p genomes &&
                #set genome_files = []
                ## NOTE(review): the "genomic" branch above tests genomic.source, whereas
                ## here the add_genomic conditional itself is compared with "history"
                ## although its fasta_history child is read on the next line. Confirm
                ## against macros.xml whether the selector parameter should be tested.
                #if $reads.genome.add_genome.add_genomic == "history"
                    #for $input_file in $reads.genome.add_genome.add_genomic.fasta_history
                        #set $fname = $input_file.element_identifier.replace(" ","_")
                        #set $file_path = 'genomes/' + $fname
                        ln -s '$input_file' '$file_path' &&
                        $genome_files.append($file_path)
                    #end for
                #else
                    #for $input_file in $reads.genome.add_genome.add_genomic.fasta_builtin
                        ## NOTE(review): ext is assigned but not used in this loop.
                        #set $ext = $input_file.fields.path.ext
                        #set $fname = $input_file.fields.path.element_identifier.replace(" ","_")
                        #set $file_path = 'genomes/' + $fname
                        ln -s '$input_file' '$file_path' &&
                        $genome_files.append($file_path)
                    #end for
                #end if
            #end if
        #end if
        
        ## Create the directory that the representative-fasta output collection is
        ## discovered from, then start the coverm genome call. Everything up to the
        ## threads option belongs to this single command.
        mkdir ./representative-fasta/ &&
        coverm genome
            ## Primary reads: pass the link paths staged above under the option that
            ## matches the selected read type.
            #if $reads.read_type == 'paired'
                -1
                #for $read in $fw_reads1
                    '${read}'
                #end for
                -2
                #for $read in $rv_reads1
                    '${read}'
                #end for
            #else if $reads.read_type == 'paired_collection'
                ## The list alternates forward, reverse for each pair.
                --coupled
                #for $read in $paired_reads1
                    '${read}'
                #end for
            #else if $reads.read_type == 'single'
                --single
                #for $read in $file_paths1
                    '${read}'
                #end for
            #else if $reads.read_type == 'interleaved'
                --interleaved
                #for $read in $file_paths1
                    '${read}'
                #end for
            #else if $reads.read_type == 'bam'
                -b
                #for $read in $bam_files
                    '${read}'
                #end for
            #end if

            ## Additional reads: same option mapping as for the primary reads, fed from
            ## the add_* lists. Each branch emits its own option flag so the files are
            ## never attached to whatever option happened to precede them.
            #if $add_reads.extra_read.read_type == 'paired'
                -1
                #for $read in $add_fw_reads1
                    '${read}'
                #end for
                -2
                #for $read in $add_rv_reads1
                    '${read}'
                #end for
            #else if $add_reads.extra_read.read_type == 'paired_collection'
                --coupled
                #for $read in $add_paired_reads1
                    '${read}'
                #end for
            #else if $add_reads.extra_read.read_type == 'single'
                --single
                #for $read in $add_file_paths1
                    '${read}'
                #end for
            #else if $add_reads.extra_read.read_type == 'interleaved'
                --interleaved
                #for $read in $add_file_paths1
                    '${read}'
                #end for
            #else if $add_reads.extra_read.read_type == 'bam'
                ## The BAM list needs its -b flag like every other read type.
                -b
                #for $read in $add_bam_files
                    '${read}'
                #end for
            #end if
            

            ## Genome definition: genome FASTA files (-f), reference file(s) with optional
            ## extra genomes, or neither of them.
            #if $reads.genome.ref_or_genome == "genomic":
                -f
                #for $genome in $genome_files
                    '${genome}'
                #end for
            #else if $reads.genome.ref_or_genome != "none" and $reads.genome.ref_or_genome != "genomic":
                --reference
                #for $reference in $ref_files
                    '${reference}'
                #end for
                ## When single_genome is anything but "false", its value is written to the
                ## command line verbatim (presumably the flag itself - confirm in
                ## macros.xml), followed by an optional genome definition; otherwise the
                ## contig-name separator is passed when one is set.
                #if $reads.genome.cond_single_genome.single_genome != "false"
                    $reads.genome.cond_single_genome.single_genome
                    #if $reads.genome.cond_single_genome.genome_definition
                        --genome-definition $reads.genome.cond_single_genome.genome_definition
                    #end if
                #else
                    #if $reads.genome.cond_single_genome.separator
                        -s "$reads.genome.cond_single_genome.separator"
                    #end if
                #end if
                ## Additional genome files staged alongside the reference.
                #if $reads.genome.add_genome.add_genome == "true"
                    -f
                    #for $genome in $genome_files
                        '${genome}'
                    #end for
                #end if
            #else if $reads.genome.ref_or_genome == "none":
                ## No genome or reference files: only the separator and the single_genome
                ## value are passed.
                #if $reads.genome.separator:
                    -s "$reads.genome.separator"
                #end if
                $reads.genome.single_genome
            #end if

            ## Dereplication options (derep section). The boolean expands to its flag or
            ## to an empty string; every other option is only emitted when it has a value.
            $derep.dereplicate
            #if $derep.checkm_tab_table:
                ## Option name spelled as declared by the parameter's argument attribute.
                --checkm-tab-table '$derep.checkm_tab_table'
            #end if
            #if $derep.genome_info:
                --genome-info '$derep.genome_info'
            #end if
            ## Optional numeric parameters are compared with the empty string so that a
            ## value of 0 is still passed on.
            #if $derep.min_completeness != "":
                --min-completeness $derep.min_completeness
            #end if
            #if $derep.max_contamination != "":
                --max-contamination $derep.max_contamination
            #end if
            #if $derep.dereplication_ani != "":
                --dereplication-ani $derep.dereplication_ani
            #end if
            #if $derep.dereplication_aligned_fraction != "":
                --dereplication-aligned-fraction $derep.dereplication_aligned_fraction
            #end if
            #if $derep.dereplication_fragment_length != "":
                --dereplication-fragment-length $derep.dereplication_fragment_length
            #end if
            #if $derep.dereplication_prethreshold_ani != "":
                --dereplication-prethreshold-ani $derep.dereplication_prethreshold_ani
            #end if
            #if $derep.dereplication_quality_formula:
                --dereplication-quality-formula $derep.dereplication_quality_formula
            #end if
            #if $derep.dereplication_precluster_method:
                --dereplication-precluster-method $derep.dereplication_precluster_method
            #end if

            ## Sharding options: both booleans expand to their flag or to an empty string.
            ## NOTE(review): the help text of exclude_genomes_from_deshard describes a
            ## newline-separated file, yet the parameter is a boolean, so only the bare
            ## flag is emitted here - confirm the intended usage.
            $shar.sharded
            $shar.exclude_genomes_from_deshard

            ## Mapping options (parameters come from the "mapping" macro, which is defined
            ## outside this file). Each option is emitted only when its value is truthy.
            #if $mapping.mapper:
                --mapper $mapping.mapper
            #end if
            #if $mapping.min_read_aligned_length:
                --min-read-aligned-length $mapping.min_read_aligned_length
            #end if
            #if $mapping.min_read_percent_identity:
                --min-read-percent-identity $mapping.min_read_percent_identity
            #end if
            #if $mapping.min_read_aligned_percent:
                --min-read-aligned-percent $mapping.min_read_aligned_percent
            #end if
            #if $mapping.min_read_aligned_length_pair:
                --min-read-aligned-length-pair $mapping.min_read_aligned_length_pair
            #end if
            #if $mapping.min_read_percent_identity_pair:
                --min-read-percent-identity-pair $mapping.min_read_percent_identity_pair
            #end if
            #if $mapping.min_read_aligned_percent_pair:
                --min-read-aligned-percent-pair $mapping.min_read_aligned_percent_pair
            #end if
            ## Presumably boolean flags that expand to their truevalue - defined in macros.xml.
            $mapping.proper_pairs_only
            $mapping.exclude_supplementary

            ## Coverage options (cov section, parameters come from the "coverage" macro).
            ## The values of all method parameters are written after the methods option.
            --methods $cov.relative_abundance $cov.mean $cov.cond_methods.trimmed_mean $cov.covered_bases $cov.covered_fraction
            $cov.variance $cov.length $cov.count $cov.metabat $cov.coverage_histogram $cov.reads_per_base
            $cov.rpkm $cov.tpm
            ## Compared with the empty string so that a value of 0 is still passed on.
            #if $cov.min_covered_fraction != "":
                --min-covered-fraction $cov.min_covered_fraction
            #end if
            #if $cov.contig_end_exclusion != "":
                --contig-end-exclusion $cov.contig_end_exclusion
            #end if
            ## Trim bounds are only passed when the trimmed_mean method is selected.
            #if $cov.cond_methods.trimmed_mean == "trimmed_mean"
                #if $cov.cond_methods.trim_min:
                    --trim-min $cov.cond_methods.trim_min
                #end if
                #if $cov.cond_methods.trim_max:
                    ## Hyphenated like trim-min; the underscore spelling is not an option.
                    --trim-max $cov.cond_methods.trim_max
                #end if
            #end if

            ## Output options (out section, parameters come from the "out" macro).
            #if $out.output_format:
                --output-format $out.output_format
            #end if
            ## The cluster definition is written straight to the cluster_definition output.
            #if $out.dereplication_output_cluster_definition:
                --dereplication-output-cluster-definition '$cluster_definition'
            #end if
            ## Representative genomes are copied into the directory created before the
            ## coverm call.
            #if $out.dereplication_output_representative_fasta_directory_copy:
                --dereplication-output-representative-fasta-directory-copy ./representative-fasta/
            #end if
            $out.no_zeros
            ## The coverage table is written to output.tsv in the working directory.
            --output-file output.tsv
            --threads \${GALAXY_SLOTS:-1}

        ## Post-process the cluster definition in place: remove every 'genomes/'
        ## substring and every '.fna' substring.
        #if $derep.dereplicate and $out.dereplication_output_cluster_definition
            && sed -i -e 's@genomes/@@g; s/\.fna//g' '$cluster_definition'
        #end if
    ]]></command>
    <inputs>
        <expand macro="reads" />
        <expand macro="add_reads" />
        <!-- Dereplication parameters; referenced in the command template as $derep.<name>. -->
        <section name="derep" title="Dereplication options" expanded="false">
            <param argument="--dereplicate" type="boolean" truevalue="--dereplicate" falsevalue="" optional="true" 
                label="Do genome dereplication via average nucleotide identity (ANI)" 
                help="Choose a genome to represent all within a small distance, using Dashing for 
                    preclustering and FastANI for final ANI calculation. When this flag is used, 
                    dereplication occurs transparently through the Galah method."/>
            <!-- Optional quality tables used for filtering and ranking genomes. -->
            <param argument="--checkm-tab-table" type="data" format="tsv" optional="true" 
                label="Use checkM table" help="It is used for defining genome quality, which is used both for filtering 
                    and to rank genomes during clustering"/>
            <param argument="--genome-info" type="data" format="csv" optional="true" 
                label="Define quality in dRep style" help="dRep style genome info table for defining quality" />
            <!-- Optional numeric thresholds; the command template passes each one only when it is not empty. -->
            <param argument="--min-completeness" type="float" optional="true" min="0" max="100" 
                label="Min completeness" help="Ignore genomes with less completeness than this percentage" />
            <param argument="--max-contamination" type="float" optional="true" min="0" max="100" 
                label="Max contamination" help="Ignore genomes with more contamination than this percentage" />
            <param argument="--dereplication-ani" type="float" optional="true" min="0" max="100" value="99"
                label="Overall ANI level" help="Overall ANI level to dereplicate at with FastANI. Default: 99" />
            <param argument="--dereplication-aligned-fraction" type="float" optional="true" min="0" value="50"
                label="Dereplication aligned fraction" help="Min aligned fraction of two genomes for clustering. Default: 50" />
            <param argument="--dereplication-fragment-length" type="integer" optional="true" min="0" value="3000"
                label="Length of fragment used in FastANI calculation (i.e. --fragLen). Default: 3000" />
            <param argument="--dereplication-prethreshold-ani" type="float" optional="true" min="0" max="100" value="95"
                label="Dereplication preclustering threshold" help="Require at least this dashing-derived ANI for 
                    preclustering and to avoid FastANI on distant lineages within preclusters. Default: 95" />
            <!-- The formulas behind the quality scores are listed in the help section of this tool. -->
            <param type="select" argument="--dereplication-quality-formula" label="Scoring function for genome quality" help="Default: Parks2020_reduced">
                <option value="Parks2020_reduced">Parks2020_reduced (default)</option>
                <option value="completeness-4contamination">completeness-4contamination</option>
                <option value="completeness-5contamination">completeness-5contamination</option>
                <option value="dRep">dRep</option>
            </param>
            <param type="select" argument="--dereplication-precluster-method" label="Dereplication precluster method" help="Method of calculating rough ANI for dereplication. Default: dashing">
                <option value="dashing">HyperLogLog</option>
                <option value="finch">finch MinHash</option>
            </param>
        </section>
        <!-- Sharding parameters; referenced in the command template as $shar.<name>. -->
        <section name="shar" title="Sharding options" expanded="false">
            <param argument="--sharded" type="boolean" optional="true" truevalue="--sharded" falsevalue="" label="Sharded" help="If -b/--bam-files was used: Input BAM files are read-sorted alignments of a set of reads mapped to multiple reference contig sets. Choose the best hit for each read pair. Otherwise if mapping was carried out: Map reads to each reference, choosing the best hit for each pair." />
            <!-- NOTE(review): the help text describes a newline-separated file, yet this parameter is a boolean
                 that only emits the bare flag. Confirm whether a data input is required here. -->
            <param argument="--exclude-genomes-from-deshard" type="boolean" optional="true" 
                truevalue="--exclude-genomes-from-deshard" falsevalue="" label="Exclude genomes from deshard" 
                help="Ignore genomes whose name appears in this newline-separated file when combining shards." />
        </section>
        <expand macro="mapping"/>
        <expand macro="coverage"/>
        <expand macro="out"/>
    </inputs>
    <outputs>
        <!-- Coverage table; collected from output.tsv, the file name the command passes as output file. -->
        <data name="output1" format="tsv" from_work_dir="./output.tsv"/>
        <!-- Cluster definition; the command writes to this dataset directly. Only created when
             dereplication and the cluster definition output are both enabled. -->
        <data name="cluster_definition" format="tsv" label="${tool.name} on ${on_string}: cluster definition">
            <filter>derep['dereplicate'] and out['dereplication_output_cluster_definition']</filter>
        </data>
        <!-- Representative genomes; discovered from the representative-fasta directory. Only files matching
             the .fna pattern are picked up, and the part before it becomes the element name. -->
        <collection name="representative_fasta" type="list" label="${tool.name} on ${on_string}: representative fasta">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.fna" format="fasta" directory="representative-fasta" />
            <filter>derep['dereplicate'] and out['dereplication_output_representative_fasta_directory_copy']</filter>
        </collection>
    </outputs>
    <tests>
        <test expect_num_outputs="1">
            <!-- Paired collection reads against a reference FASTA with "~" as separator, plus additional
                 single reads; mean, relative abundance and variance in sparse format, compared with test1.tsv. -->
            <conditional name="reads">
                <param name="read_type" value="paired_collection"/>
                <param name="paired_reads">
                    <collection type="list:paired">
                        <element name="reads_for_seq1_and_seq2..fq">
                            <collection type="paired">
                                <element name="forward" value="reads_for_seq1_and_seq2.1.fq.gz"/>
                                <element name="reverse" value="reads_for_seq1_and_seq2.2.fq.gz"/>
                            </collection>
                        </element>
                    </collection>
                </param>
                <conditional name="genome">
                    <param name="ref_or_genome" value="reference"/>
                        <param name="ref_source" value="7seqs.fna"/>
                        <conditional name="cond_single_genome">
                            <param name="single_genome" value="false"/>
                            <param name="separator" value="~"/>
                        </conditional>
                </conditional>
            </conditional>
            <section name="add_reads">
                <conditional name="extra_read">
                    <param name="read_type" value="single"/>
                    <param name="single" value="reads_for_seq1_and_seq2.fna"/>
                </conditional>
            </section>
            <section name="cov">
                <param name="mean" value="true"/>
                <param name="relative_abundance" value="true"/>
                <param name="variance" value="true"/>
            </section>
            <section name="out">
                <param name="output_format" value="sparse"/>
            </section>
            <output name="output1" file="test1.tsv" ftype="tsv"/>
        </test>
        <test expect_num_outputs="1">
            <!-- BAM input without genome or reference files, "~" as separator, sharding switched off;
                 mean and relative abundance in sparse format, compared with test2.tsv. -->
            <conditional name="reads">
                <param name="read_type" value="bam"/>
                <param name="bam" value="7seqs.reads_for_seq1_and_seq2.bam"/>
                <conditional name="genome">
                    <param name="ref_or_genome" value="none"/>
                    <param name="separator" value="~"/>
                </conditional>
            </conditional>
            <section name="shar">
                <param name="sharded" value="false"/>
                <!-- Name must match the parameter derived from the exclude-genomes-from-deshard argument. -->
                <param name="exclude_genomes_from_deshard" value="false"/>
            </section>
            <section name="mapping">
                <param name="mapper" value="minimap2-sr"/>
                <param name="minimap2_reference_is_index" value="false"/>
            </section>
            <section name="cov">
                <param name="mean" value="true"/>
                <param name="relative_abundance" value="true"/>
            </section>
            <section name="out">
                <param name="output_format" value="sparse"/>
            </section>
            <output name="output1" file="test2.tsv" ftype="tsv"/>
        </test>
        <test expect_num_outputs="1">
            <!-- Single reads against two genome FASTA files from the history, with dereplication and a
                 genome info table; only the line count of the coverage table is asserted. -->
            <section name="derep">
                <param name="dereplicate" value="true"/>
                <param name="genome_info" value="genomeInfo.csv"/>
            </section>
            <section name="cov">
                <param name="covered_fraction" value="true"/>
                <param name="min_covered_fraction" value="0"/>
            </section>
            <conditional name="reads">
                <param name="read_type" value="single"/>
                <param name="single" value="1read.actually_fasta.fq"/>
                <conditional name="genome">
                    <param name="ref_or_genome" value="genomic"/>
                    <conditional name="genomic">
                        <param name="source" value="history"/>
                        <param name="fasta_history" value="500kb.fna,1mbp.fna"/>
                    </conditional>
                </conditional>
            </conditional>
            <output name="output1">
                <assert_contents>
                    <has_n_lines n="2"/>
                </assert_contents>
            </output>
        </test>
        <test expect_num_outputs="3">
            <!-- Paired collection reads against three genome FASTA files with dereplication; checks the
                 coverage table, the cluster definition and the representative fasta collection. -->
            <conditional name="reads">
                <param name="read_type" value="paired_collection"/>
                <param name="paired_reads">
                    <collection type="list:paired">
                        <element name="reads_for_genome2">
                            <collection type="paired">
                                <element name="forward" value="reads_for_genome2.1.fa"/>
                                <element name="reverse" value="reads_for_genome2.2.fa"/>
                            </collection>
                        </element>
                    </collection>
                </param>
                <conditional name="genome">
                <param name="ref_or_genome" value="genomic"/>
                    <conditional name="genomic">
                        <param name="source" value="history"/>
                        <param name="fasta_history" value="genome1.fna,genome2.fna,genome3.fna"/>
                    </conditional>
                </conditional>
            </conditional>
            <section name="cov">
                <param name="mean" value="true"/>
            </section>
            <section name="derep">
                <param name="dereplicate" value="true"/>
            </section>
            <section name="out">
                <param name="dereplication_output_cluster_definition" value="true"/>
                <param name="dereplication_output_representative_fasta_directory_copy" value="true"/>
            </section>
            <output name="output1" file="test3.tsv" ftype="tsv"/>
            <output name="cluster_definition" ftype="tsv" value="test4_cluster.tsv"/>
            <output_collection name="representative_fasta" type="list" count="3">
                <element name="genome1" file="test4_rep1.fa" ftype="fasta" />
                <element name="genome2" file="test4_rep2.fa" ftype="fasta" />
                <element name="genome3" file="test4_rep3.fa" ftype="fasta" />
            </output_collection>
        </test>
        <test expect_num_outputs="1">
            <!-- BAM input without genome or reference files, single_genome enabled; read counts with a
                 minimum covered fraction of 0, compared with test5.tsv. -->
            <conditional name="reads">
                <param name="read_type" value="bam"/>
                <param name="bam" value="2seqs.bad_read.1.with_supplementary.bam"/>
                <conditional name="genome">
                    <param name="ref_or_genome" value="none"/>
                    <param name="single_genome" value="true"/>
                </conditional>
            </conditional>
            <section name="cov">
                <param name="count" value="true"/>
                <param name="min_covered_fraction" value="0"/>
            </section>
            <output name="output1" file="test5.tsv" ftype="tsv"/>
        </test>
        </tests>
    <help><![CDATA[
.. class:: infomark
    
**Dereplication quality formula**
            
Scoring function for genome quality:

- Parks2020_reduced: A quality formula described in `Parks et al. 2020 <https://doi.org/10.1038/s41587-020-0501-8>`_ (Supplementary Table 19) but only including those scoring criteria that can be calculated from the sequence without homology searching: *completeness-5*contamination-5*num_contigs/100-5*num_ambiguous_bases/100000*.
- completeness-4contamination: *completeness-4*contamination*
- completeness-5contamination: *completeness-5*contamination*
- dRep: *completeness-5*contamination+contamination*(strain_heterogeneity/100)+0.5*log10(N50)*

-----

.. class:: infomark
    
**Method for calculating coverage**

Calculation of genome-wise coverage (genome mode) is similar to calculating contig-wise (contig mode) coverage, except that the unit of reporting is per-genome rather than per-contig. For calculation methods which exclude base positions based on their coverage, all positions from all contigs are considered together. 

- Relative abundance: Percentage relative abundance of each genome, and the unmapped read percentage
- Mean: Average number of aligned reads overlapping each position on the genome
- Trimmed mean: Average number of aligned reads overlapping each position after removing the most deeply and shallowly covered positions.
- Covered fraction: Fraction of genome covered by 1 or more reads
- Covered bases: Number of bases covered by 1 or more reads
- Variance: Variance of coverage depths
- Length: Length of each genome in base pairs
- Count: Number of reads aligned to each genome. Note that a single read may be aligned to multiple genomes with supplementary alignments
- Reads per base: Number of reads aligned divided by the length of the genome
- MetaBAT: Reproduction of the `MetaBAT <https://bitbucket.org/berkeleylab/metabat>`_ tool output
- Coverage histogram: Histogram of coverage depths
- RPKM: Reads mapped per kilobase of genome, per million mapped reads
- TPM: Transcripts Per Million as described in `Li et al 2010 <https://doi.org/10.1093/bioinformatics/btp692>`_

    ]]></help>
    <expand macro="citation"/>
</tool>