Mercurial > repos > iuc > coverm_genome
view coverm_genome.xml @ 2:5bf5792b0996 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tools/coverm commit 877d09f04bae57678daa94766fd31d0d0f1b6a37
author | iuc |
---|---|
date | Tue, 28 Mar 2023 08:36:25 +0000 |
parents | a671907f96fe |
children | bb3f59096c8e |
line wrap: on
line source
<tool id="coverm_genome" name="CoverM genome" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Calculate coverage of individual genomes</description>
    <!-- Tokens (@TOOL_VERSION@, @VERSION_SUFFIX@, @PROFILE@) and the expanded macros
         (requirements, reads, add_reads, mapping, coverage, out, citation) come from macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <!-- The command has two phases: (1) symlink every selected input into a working
         sub-directory while collecting the link paths in Cheetah lists, (2) assemble the
         `coverm genome` call from those lists and the option sections. -->
    <command><![CDATA[
        ## ---- Stage primary reads: symlink each dataset under its element identifier
        ## ---- (spaces replaced by underscores) and remember the link path.
        #if $reads.read_type == "single" or $reads.read_type == "interleaved"
            mkdir -p reads1 &&
            #set file_paths1 = []
            #for $input_file in $reads.single
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'reads1/' + $fname
                ln -s '$input_file' '$file_path' &&
                $file_paths1.append($file_path)
            #end for
        #else if $reads.read_type == "bam"
            mkdir -p bam &&
            #set bam_files = []
            #for $input_file in $reads.bam
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'bam/' + $fname
                ln -s '$input_file' '$file_path' &&
                $bam_files.append($file_path)
            #end for
        #else if $reads.read_type == "paired"
            mkdir -p paired_reads1 &&
            #set fw_reads1 = []
            #for $input_file in $reads.read1
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'paired_reads1/' + str($fname)
                ln -s '$input_file' '$file_path' &&
                $fw_reads1.append($file_path)
            #end for
            #set rv_reads1 = []
            #for $input_file in $reads.read2
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'paired_reads1/' + str($fname)
                ln -s '$input_file' '$file_path' &&
                $rv_reads1.append($file_path)
            #end for
            ## Both lists are sorted so that forward and reverse files are passed in the same name order.
            #silent $fw_reads1.sort()
            #silent $rv_reads1.sort()
        #else
            ## Remaining case (paired collection): mates are linked as fw<i>/rv<i> and
            ## appended alternately, which is the order handed to the coupled option below.
            mkdir -p paired_reads &&
            #set paired_reads1 = []
            #for $i, $input_file in enumerate($reads.paired_reads)
                #set $file_path = 'paired_reads/fw' + str($i)
                ln -s '$input_file.forward' '$file_path' &&
                $paired_reads1.append($file_path)
                #set $file_path = 'paired_reads/rv' + str($i)
                ln -s '$input_file.reverse' '$file_path' &&
                $paired_reads1.append($file_path)
            #end for
        #end if

        ## ---- Stage the optional additional reads the same way, in add_* directories.
        #if $add_reads.extra_read.read_type == "single" or $add_reads.extra_read.read_type == "interleaved"
            mkdir -p add_reads1 &&
            #set add_file_paths1 = []
            #for $input_file in $add_reads.extra_read.single
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'add_reads1/' + $fname
                ln -s '$input_file' '$file_path' &&
                $add_file_paths1.append($file_path)
            #end for
        #else if $add_reads.extra_read.read_type == "bam"
            mkdir -p add_bam &&
            #set add_bam_files = []
            ## Iterate the additional-reads BAM input (previously this looped over the primary reads input).
            ## NOTE(review): the parameter name "bam" mirrors the sibling branches; confirm against macros.xml.
            #for $input_file in $add_reads.extra_read.bam
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'add_bam/' + $fname
                ln -s '$input_file' '$file_path' &&
                $add_bam_files.append($file_path)
            #end for
        #else if $add_reads.extra_read.read_type == "paired"
            mkdir -p add_paired_reads1 &&
            #set add_fw_reads1 = []
            #for $input_file in $add_reads.extra_read.read1
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'add_paired_reads1/' + str($fname)
                ln -s '$input_file' '$file_path' &&
                $add_fw_reads1.append($file_path)
            #end for
            #set add_rv_reads1 = []
            #for $input_file in $add_reads.extra_read.read2
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'add_paired_reads1/' + str($fname)
                ln -s '$input_file' '$file_path' &&
                $add_rv_reads1.append($file_path)
            #end for
            #silent $add_fw_reads1.sort()
            #silent $add_rv_reads1.sort()
        #else if $add_reads.extra_read.read_type == "paired_collection"
            mkdir -p add_paired_reads &&
            #set add_paired_reads1 = []
            #for $i, $input_file in enumerate($add_reads.extra_read.paired_reads)
                ## Links keep the forward dataset's extension for both mates.
                #set $ext = $input_file.forward.ext
                #set $file_path = 'add_paired_reads/fw' + str($i) + '.' + $ext
                ln -s '$input_file.forward' '$file_path' &&
                $add_paired_reads1.append($file_path)
                #set $file_path = 'add_paired_reads/rv' + str($i) + '.' + $ext
                ln -s '$input_file.reverse' '$file_path' &&
                $add_paired_reads1.append($file_path)
            #end for
        #end if

        ## ---- Stage genome FASTA files and/or reference files.
        #if $reads.genome.ref_or_genome == "genomic"
            mkdir -p genomes &&
            #set genome_files = []
            #if $reads.genome.genomic.source == "history"
                #for $input_file in $reads.genome.genomic.fasta_history
                    #set $fname = $input_file.element_identifier.replace(" ","_")
                    #set $file_path = 'genomes/' + $fname
                    ln -s '$input_file' '$file_path' &&
                    $genome_files.append($file_path)
                #end for
            #else
                #for $input_file in $reads.genome.genomic.fasta_builtin
                    #set $ext = $input_file.fields.path.ext
                    #set $fname = $input_file.fields.path.element_identifier.replace(" ","_")
                    #set $file_path = 'genomes/' + $fname
                    ln -s '$input_file' '$file_path' &&
                    $genome_files.append($file_path)
                #end for
            #end if
        #else if $reads.genome.ref_or_genome != "none"
            mkdir -p reference &&
            #set ref_files = []
            #for $input_file in $reads.genome.ref_source
                #set $fname = $input_file.element_identifier.replace(" ","_")
                #set $file_path = 'reference/' + $fname
                ln -s '$input_file' '$file_path' &&
                $ref_files.append($file_path)
            #end for
            #if $reads.genome.add_genome.add_genome == "true"
                mkdir -p genomes &&
                #set genome_files = []
                ## NOTE(review): the primary branch above tests "genomic.source"; here the conditional
                ## itself is compared to "history". Possibly should be add_genomic.source - confirm in macros.xml.
                #if $reads.genome.add_genome.add_genomic == "history"
                    #for $input_file in $reads.genome.add_genome.add_genomic.fasta_history
                        #set $fname = $input_file.element_identifier.replace(" ","_")
                        #set $file_path = 'genomes/' + $fname
                        ln -s '$input_file' '$file_path' &&
                        $genome_files.append($file_path)
                    #end for
                #else
                    #for $input_file in $reads.genome.add_genome.add_genomic.fasta_builtin
                        #set $ext = $input_file.fields.path.ext
                        #set $fname = $input_file.fields.path.element_identifier.replace(" ","_")
                        #set $file_path = 'genomes/' + $fname
                        ln -s '$input_file' '$file_path' &&
                        $genome_files.append($file_path)
                    #end for
                #end if
            #end if
        #end if

        ## Target directory for the optional copy of representative genomes (collected as an output collection).
        mkdir ./representative-fasta/ &&

        coverm genome
            ## ---- Primary reads
            #if $reads.read_type == 'paired'
                -1
                #for $read in $fw_reads1
                    '${read}'
                #end for
                -2
                #for $read in $rv_reads1
                    '${read}'
                #end for
            #else if $reads.read_type == 'paired_collection'
                --coupled
                #for $read in $paired_reads1
                    '${read}'
                #end for
            #else if $reads.read_type == 'single'
                --single
                #for $read in $file_paths1
                    '${read}'
                #end for
            #else if $reads.read_type == 'interleaved'
                --interleaved
                #for $read in $file_paths1
                    '${read}'
                #end for
            #else if $reads.read_type == 'bam'
                -b
                #for $read in $bam_files
                    '${read}'
                #end for
            #end if

            ## ---- Additional reads
            #if $add_reads.extra_read.read_type == 'paired'
                -1
                #for $read in $add_fw_reads1
                    '${read}'
                #end for
                -2
                #for $read in $add_rv_reads1
                    '${read}'
                #end for
            #else if $add_reads.extra_read.read_type == 'paired_collection'
                --coupled
                #for $read in $add_paired_reads1
                    '${read}'
                #end for
            #else if $add_reads.extra_read.read_type == 'single'
                --single
                #for $read in $add_file_paths1
                    '${read}'
                #end for
            #else if $add_reads.extra_read.read_type == 'interleaved'
                --interleaved
                #for $read in $add_file_paths1
                    '${read}'
                #end for
            #else if $add_reads.extra_read.read_type == 'bam'
                ## Emit the flag explicitly, like every other branch; without it the BAM paths
                ## would be appended to whichever option was written last for the primary reads.
                -b
                #for $read in $add_bam_files
                    '${read}'
                #end for
            #end if

            ## ---- Genome / reference definition
            #if $reads.genome.ref_or_genome == "genomic":
                -f
                #for $genome in $genome_files
                    '${genome}'
                #end for
            #else if $reads.genome.ref_or_genome != "none" and $reads.genome.ref_or_genome != "genomic":
                --reference
                #for $reference in $ref_files
                    '${reference}'
                #end for
                #if $reads.genome.cond_single_genome.single_genome != "false"
                    $reads.genome.cond_single_genome.single_genome
                    #if $reads.genome.cond_single_genome.genome_definition
                        --genome-definition $reads.genome.cond_single_genome.genome_definition
                    #end if
                #else
                    #if $reads.genome.cond_single_genome.separator
                        -s "$reads.genome.cond_single_genome.separator"
                    #end if
                #end if
                #if $reads.genome.add_genome.add_genome == "true"
                    -f
                    #for $genome in $genome_files
                        '${genome}'
                    #end for
                #end if
            #else if $reads.genome.ref_or_genome == "none":
                #if $reads.genome.separator:
                    -s "$reads.genome.separator"
                #end if
                $reads.genome.single_genome
            #end if

            ## ---- Dereplication options
            $derep.dereplicate
            #if $derep.checkm_tab_table:
                --checkm-tab-table '$derep.checkm_tab_table'
            #end if
            #if $derep.genome_info:
                --genome-info '$derep.genome_info'
            #end if
            #if $derep.min_completeness != "":
                --min-completeness $derep.min_completeness
            #end if
            #if $derep.max_contamination != "":
                --max-contamination $derep.max_contamination
            #end if
            #if $derep.dereplication_ani != "":
                --dereplication-ani $derep.dereplication_ani
            #end if
            #if $derep.dereplication_aligned_fraction != "":
                --dereplication-aligned-fraction $derep.dereplication_aligned_fraction
            #end if
            #if $derep.dereplication_fragment_length != "":
                --dereplication-fragment-length $derep.dereplication_fragment_length
            #end if
            #if $derep.dereplication_prethreshold_ani != "":
                --dereplication-prethreshold-ani $derep.dereplication_prethreshold_ani
            #end if
            #if $derep.dereplication_quality_formula:
                --dereplication-quality-formula $derep.dereplication_quality_formula
            #end if
            #if $derep.dereplication_precluster_method:
                --dereplication-precluster-method $derep.dereplication_precluster_method
            #end if

            ## ---- Sharding options
            $shar.sharded
            $shar.exclude_genomes_from_deshard

            ## ---- Mapping options
            #if $mapping.mapper:
                --mapper $mapping.mapper
            #end if
            #if $mapping.min_read_aligned_length:
                --min-read-aligned-length $mapping.min_read_aligned_length
            #end if
            #if $mapping.min_read_percent_identity:
                --min-read-percent-identity $mapping.min_read_percent_identity
            #end if
            #if $mapping.min_read_aligned_percent:
                --min-read-aligned-percent $mapping.min_read_aligned_percent
            #end if
            #if $mapping.min_read_aligned_length_pair:
                --min-read-aligned-length-pair $mapping.min_read_aligned_length_pair
            #end if
            #if $mapping.min_read_percent_identity_pair:
                --min-read-percent-identity-pair $mapping.min_read_percent_identity_pair
            #end if
            #if $mapping.min_read_aligned_percent_pair:
                --min-read-aligned-percent-pair $mapping.min_read_aligned_percent_pair
            #end if
            $mapping.proper_pairs_only
            $mapping.exclude_supplementary

            ## ---- Coverage methods: each parameter expands to its method name or to nothing.
            --methods
                $cov.relative_abundance
                $cov.mean
                $cov.cond_methods.trimmed_mean
                $cov.covered_bases
                $cov.covered_fraction
                $cov.variance
                $cov.length
                $cov.count
                $cov.metabat
                $cov.coverage_histogram
                $cov.reads_per_base
                $cov.rpkm
                $cov.tpm
            #if $cov.min_covered_fraction != "":
                --min-covered-fraction $cov.min_covered_fraction
            #end if
            #if $cov.contig_end_exclusion != "":
                --contig-end-exclusion $cov.contig_end_exclusion
            #end if
            #if $cov.cond_methods.trimmed_mean == "trimmed_mean"
                #if $cov.cond_methods.trim_min:
                    --trim-min $cov.cond_methods.trim_min
                #end if
                #if $cov.cond_methods.trim_max:
                    --trim-max $cov.cond_methods.trim_max
                #end if
            #end if

            ## ---- Output options
            #if $out.output_format:
                --output-format $out.output_format
            #end if
            #if $out.dereplication_output_cluster_definition:
                --dereplication-output-cluster-definition '$cluster_definition'
            #end if
            #if $out.dereplication_output_representative_fasta_directory_copy:
                --dereplication-output-representative-fasta-directory-copy ./representative-fasta/
            #end if
            $out.no_zeros
            --output-file output.tsv
            --threads \${GALAXY_SLOTS:-1}

        ## Strip the staging directory prefix and the .fna suffix from the cluster definition table.
        #if $derep.dereplicate and $out.dereplication_output_cluster_definition
            && sed -i -e 's@genomes/@@g; s/\.fna//g' '$cluster_definition'
        #end if
    ]]></command>
    <inputs>
        <expand macro="reads" />
        <expand macro="add_reads" />
        <section name="derep" title="Dereplication options" expanded="false">
            <param argument="--dereplicate" type="boolean" truevalue="--dereplicate" falsevalue="" optional="true" label="Do genome dereplication via average nucleotide identity (ANI)" help="Choose a genome to represent all within a small distance, using Dashing for preclustering and FastANI for final ANI calculation. When this flag is used, dereplication occurs transparently through the Galah method."/>
            <param argument="--checkm-tab-table" type="data" format="tsv" optional="true" label="Use checkM table" help="It is used for defining genome quality, which is used both for filtering and to rank genomes during clustering"/>
            <param argument="--genome-info" type="data" format="csv" optional="true" label="Define quality in dRep style" help="dRep style genome info table for defining quality" />
            <param argument="--min-completeness" type="float" optional="true" min="0" max="100" label="Min completeness" help="Ignore genomes with less completeness than this percentage" />
            <param argument="--max-contamination" type="float" optional="true" min="0" max="100" label="Max contamination" help="Ignore genomes with more contamination than this percentage" />
            <param argument="--dereplication-ani" type="float" optional="true" min="0" max="100" value="99" label="Overall ANI level" help="Overall ANI level to dereplicate at with FastANI. Default: 99" />
            <param argument="--dereplication-aligned-fraction" type="float" optional="true" min="0" value="50" label="Dereplication aligned fraction" help="Min aligned fraction of two genomes for clustering. Default: 50" />
            <param argument="--dereplication-fragment-length" type="integer" optional="true" min="0" value="3000" label="Length of fragment used in FastANI calculation (i.e. --fragLen). Default: 3000" />
            <param argument="--dereplication-prethreshold-ani" type="float" optional="true" min="0" max="100" value="95" label="Dereplication preclustering threshold" help="Require at least this dashing-derived ANI for preclustering and to avoid FastANI on distant lineages within preclusters. Default: 95" />
            <param type="select" argument="--dereplication-quality-formula" label="Scoring function for genome quality" help="Default: Parks2020_reduced">
                <option value="Parks2020_reduced">Parks2020_reduced (default)</option>
                <option value="completeness-4contamination">completeness-4contamination</option>
                <option value="completeness-5contamination">completeness-5contamination</option>
                <option value="dRep">dRep</option>
            </param>
            <param type="select" argument="--dereplication-precluster-method" label="Dereplication precluster method" help="Method of calculating rough ANI for dereplication. Default: dashing">
                <option value="dashing">HyperLogLog</option>
                <option value="finch">finch MinHash</option>
            </param>
        </section>
        <section name="shar" title="Sharding options" expanded="false">
            <param argument="--sharded" type="boolean" optional="true" truevalue="--sharded" falsevalue="" label="Sharded" help="If -b/--bam-files was used: Input BAM files are read-sorted alignments of a set of reads mapped to multiple reference contig sets. Choose the best hit for each read pair. Otherwise if mapping was carried out: Map reads to each reference, choosing the best hit for each pair." />
            <param argument="--exclude-genomes-from-deshard" type="boolean" optional="true" truevalue="--exclude-genomes-from-deshard" falsevalue="" label="Exclude genomes from deshard" help="Ignore genomes whose name appears in this newline-separated file when combining shards." />
        </section>
        <expand macro="mapping"/>
        <expand macro="coverage"/>
        <expand macro="out"/>
    </inputs>
    <outputs>
        <!-- Main coverage table, always produced. -->
        <data name="output1" format="tsv" from_work_dir="./output.tsv"/>
        <!-- The two dereplication outputs only exist when dereplication and the matching output option are both enabled. -->
        <data name="cluster_definition" format="tsv" label="${tool.name} on ${on_string}: cluster definition">
            <filter>derep['dereplicate'] and out['dereplication_output_cluster_definition']</filter>
        </data>
        <collection name="representative_fasta" type="list" label="${tool.name} on ${on_string}: representative fasta">
            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.fna" format="fasta" directory="representative-fasta" />
            <filter>derep['dereplicate'] and out['dereplication_output_representative_fasta_directory_copy']</filter>
        </collection>
    </outputs>
    <tests>
        <!-- Test 1: paired collection + reference, with additional single reads -->
        <test expect_num_outputs="1">
            <conditional name="reads">
                <param name="read_type" value="paired_collection"/>
                <param name="paired_reads">
                    <collection type="list:paired">
                        <element name="reads_for_seq1_and_seq2..fq">
                            <collection type="paired">
                                <element name="forward" value="reads_for_seq1_and_seq2.1.fq.gz"/>
                                <element name="reverse" value="reads_for_seq1_and_seq2.2.fq.gz"/>
                            </collection>
                        </element>
                    </collection>
                </param>
                <conditional name="genome">
                    <param name="ref_or_genome" value="reference"/>
                    <param name="ref_source" value="7seqs.fna"/>
                    <conditional name="cond_single_genome">
                        <param name="single_genome" value="false"/>
                        <param name="separator" value="~"/>
                    </conditional>
                </conditional>
            </conditional>
            <section name="add_reads">
                <conditional name="extra_read">
                    <param name="read_type" value="single"/>
                    <param name="single" value="reads_for_seq1_and_seq2.fna"/>
                </conditional>
            </section>
            <section name="cov">
                <param name="mean" value="true"/>
                <param name="relative_abundance" value="true"/>
                <param name="variance" value="true"/>
            </section>
            <section name="out">
                <param name="output_format" value="sparse"/>
            </section>
            <output name="output1" file="test1.tsv" ftype="tsv" sort="true"/>
        </test>
        <!-- Test 2: BAM input, no reference, genome names split on a separator -->
        <test expect_num_outputs="1">
            <conditional name="reads">
                <param name="read_type" value="bam"/>
                <param name="bam" value="7seqs.reads_for_seq1_and_seq2.bam"/>
                <conditional name="genome">
                    <param name="ref_or_genome" value="none"/>
                    <param name="separator" value="~"/>
                </conditional>
            </conditional>
            <section name="shar">
                <param name="sharded" value="false"/>
                <param name="exclude_genomes_from_deshard" value="false"/>
            </section>
            <section name="mapping">
                <param name="mapper" value="minimap2-sr"/>
                <param name="minimap2_reference_is_index" value="false"/>
            </section>
            <section name="cov">
                <param name="mean" value="true"/>
                <param name="relative_abundance" value="true"/>
            </section>
            <section name="out">
                <param name="output_format" value="sparse"/>
            </section>
            <output name="output1" file="test2.tsv" ftype="tsv" sort="true"/>
        </test>
        <!-- Test 3: single reads against genomes from the history, with dereplication -->
        <test expect_num_outputs="1">
            <section name="derep">
                <param name="dereplicate" value="true"/>
                <param name="genome_info" value="genomeInfo.csv"/>
            </section>
            <section name="cov">
                <param name="covered_fraction" value="true"/>
                <param name="min_covered_fraction" value="0"/>
            </section>
            <conditional name="reads">
                <param name="read_type" value="single"/>
                <param name="single" value="1read.actually_fasta.fq"/>
                <conditional name="genome">
                    <param name="ref_or_genome" value="genomic"/>
                    <conditional name="genomic">
                        <param name="source" value="history"/>
                        <param name="fasta_history" value="500kb.fna,1mbp.fna"/>
                    </conditional>
                </conditional>
            </conditional>
            <output name="output1">
                <assert_contents>
                    <has_n_lines n="2"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 4: dereplication with cluster definition and representative FASTA outputs -->
        <test expect_num_outputs="3">
            <conditional name="reads">
                <param name="read_type" value="paired_collection"/>
                <param name="paired_reads">
                    <collection type="list:paired">
                        <element name="reads_for_genome2">
                            <collection type="paired">
                                <element name="forward" value="reads_for_genome2.1.fa"/>
                                <element name="reverse" value="reads_for_genome2.2.fa"/>
                            </collection>
                        </element>
                    </collection>
                </param>
                <conditional name="genome">
                    <param name="ref_or_genome" value="genomic"/>
                    <conditional name="genomic">
                        <param name="source" value="history"/>
                        <param name="fasta_history" value="genome1.fna,genome2.fna,genome3.fna"/>
                    </conditional>
                </conditional>
            </conditional>
            <section name="cov">
                <param name="mean" value="true"/>
            </section>
            <section name="derep">
                <param name="dereplicate" value="true"/>
            </section>
            <section name="out">
                <param name="dereplication_output_cluster_definition" value="true"/>
                <param name="dereplication_output_representative_fasta_directory_copy" value="true"/>
            </section>
            <output name="output1" file="test3.tsv" ftype="tsv" sort="true"/>
            <output name="cluster_definition" ftype="tsv" value="test4_cluster.tsv" sort="true"/>
            <output_collection name="representative_fasta" type="list" count="3">
                <element name="genome1" file="test4_rep1.fa" ftype="fasta" />
                <element name="genome2" file="test4_rep2.fa" ftype="fasta" />
                <element name="genome3" file="test4_rep3.fa" ftype="fasta" />
            </output_collection>
        </test>
        <!-- Test 5: BAM input treated as a single genome, read counts -->
        <test expect_num_outputs="1">
            <conditional name="reads">
                <param name="read_type" value="bam"/>
                <param name="bam" value="2seqs.bad_read.1.with_supplementary.bam"/>
                <conditional name="genome">
                    <param name="ref_or_genome" value="none"/>
                    <param name="single_genome" value="true"/>
                </conditional>
            </conditional>
            <section name="cov">
                <param name="count" value="true"/>
                <param name="min_covered_fraction" value="0"/>
            </section>
            <output name="output1" file="test5.tsv" ftype="tsv" sort="true"/>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**Dereplication quality formula**

Scoring function for genome quality:

- Parks2020_reduced: A quality formula described in `Parks et. al. 2020 <https://doi.org/10.1038/s41587-020-0501-8>`_ (Supplementary Table 19) but only including those scoring criteria that can be calculated from the sequence without homology searching: *completeness-5*contamination-5*num_contigs/100-5*num_ambiguous_bases/100000*.
- completeness-4contamination: *completeness-4*contamination*
- completeness-5contamination: *completeness-5*contamination*
- dRep: *completeness-5*contamination+contamination*(strain_heterogeneity/100)+0.5*log10(N50)*

-----

.. class:: infomark

**Method for calculating coverage**

Calculation of genome-wise coverage (genome mode) is similar to calculating contig-wise (contig mode) coverage, except that the unit of reporting is per-genome rather than per-contig. For calculation methods which exclude base positions based on their coverage, all positions from all contigs are considered together.

- Relative abundance: Percentage relative abundance of each genome, and the unmapped read percentage
- Mean: Average number of aligned reads overlapping each position on the genome
- Trimmed mean: Average number of aligned reads overlapping each position after removing the most deeply and shallowly covered positions.
- Covered fraction: Fraction of genome covered by 1 or more reads
- Covered bases: Number of bases covered by 1 or more reads
- Variance: Variance of coverage depths
- Length: Length of each genome in base pairs
- Count: Number of reads aligned to each genome. Note that a single read may be aligned to multiple genomes with supplementary alignments
- Reads per base: Number of reads aligned divided by the length of the genome
- MetaBAT: Reproduction of the `MetaBAT <https://bitbucket.org/berkeleylab/metabat>`_ tool output
- Coverage histogram: Histogram of coverage depths
- RPKM: Reads mapped per kilobase of genome, per million mapped reads
- TPM: Transcripts Per Million as described in `Li et al 2010 <https://doi.org/10.1093/bioinformatics/btp692>`_
    ]]></help>
    <expand macro="citation"/>
</tool>